def proc_build(tarbz2_file, work_dir, save_dir, config):
    """Build (caller + callee disassembly, callee argument type) samples.

    The archive is unpacked into ``work_dir`` and the pickle it contains is
    loaded. For every function record, each call found in its AT&T
    disassembly is resolved to a record of the same binary whose function
    name equals the callee name. If that callee has at least three arguments
    and the type returned by ``get_arg_two_name_from_function_signature`` is
    known, the tokenized caller and callee disassemblies are concatenated and
    stored together with that type.

    Records are indexed positionally: ``[0]`` function signature, ``[2]``
    function name, ``[4]`` AT&T disassembly lines, ``[7]`` binary name.
    # NOTE(review): meaning of the remaining fields is not visible here.

    Args:
        tarbz2_file: Path of the ``*.tar.bz2`` archive to process.
        work_dir: Directory the archive is unpacked into. It is concatenated
            with the file name as-is, so it must end with a path separator.
        save_dir: Unused; the output location is read from
            ``config['save_dir']``.
        config: Mapping providing ``save_file_type`` and ``save_dir``.

    Returns:
        The number of samples that were collected.
    """
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)

    # One pass collects the binary names (only used for the log line below)
    # and an index (binary, function name) -> records in file order, so a
    # callee can be looked up without rescanning the whole pickle.
    binaries = set()
    records_by_key = {}
    for record in pickle_file_content:
        binaries.add(record[7])
        records_by_key.setdefault((record[7], record[2]), []).append(record)

    print(f'binaries >{binaries}<')

    # NOTE(review): three arguments are required although the *second*
    # argument is used as label -- confirm this threshold is intended.
    arg_nr_we_want = 3
    # Longer sequences blow up memory (>160GB ram) in model.fit().
    max_tokens = 100000

    counter = 0
    dataset_list = []

    # Every record is visited exactly once, in file order. The former
    # binaries x functions x records scan visited the same records, only in
    # hash-dependent set order.
    for caller in pickle_file_content:
        att_dis = caller[4]

        for line in att_dis:
            if not disassembly_lib.find_call_in_disassembly_line(line):
                continue

            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                line)

            # Candidates: same binary, function name equal to the callee.
            for callee in records_by_key.get((caller[7], callee_name), ()):
                nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                    callee[0])
                if nr_of_args < arg_nr_we_want:
                    # Too few arguments: give up on this call site.
                    break

                arg_two = return_type_lib.get_arg_two_name_from_function_signature(
                    callee[0])

                # Explicit comparison kept on purpose: the return type of
                # is_type_known() is not visible from here.
                if common_stuff_lib.is_type_known(arg_two) == False:
                    # Unknown type: try the next record with the same name.
                    continue

                caller_dis = disassembly_lib.clean_att_disassembly_from_comment(
                    att_dis)
                callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                    callee[4])

                dis1_str = disassembly_lib.split_disassembly(
                    ' '.join(caller_dis))
                dis2_str = disassembly_lib.split_disassembly(
                    ' '.join(callee_dis))

                caller_ok = 1 <= len(dis1_str) <= max_tokens
                callee_ok = 1 <= len(dis2_str) <= max_tokens
                if caller_ok and callee_ok:
                    dataset_list.append((dis1_str + dis2_str, arg_two))
                    counter += 1
                else:
                    print(f'dis1_str >{len(dis1_str)}<')
                    print(f'dis2_str >{len(dis2_str)}<')

                # First usable callee decides; stop searching for this call.
                break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            out_path = config['save_dir'] + os.path.basename(
                pickle_file).replace('.tar.bz2', '')
            # Context manager closes the file even if pickling fails.
            with open(out_path, 'wb+') as ret_file:
                pickle.dump(dataset_list, ret_file)
        else:
            ## save as tfrecord
            dis_list = [sample[0] for sample in dataset_list]
            ret_list = [sample[1] for sample in dataset_list]

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
Exemplo n.º 2
0
def proc_build(tarbz2_file, work_dir, save_dir, config):
    """Build (caller + callee disassembly, callee argument count) samples.

    The archive is unpacked into ``work_dir`` and the pickle it contains is
    loaded. For every function record, each call found in its AT&T
    disassembly is resolved to a record of the same binary whose function
    name equals the callee name. The tokenized caller and callee
    disassemblies are concatenated and stored together with the number of
    arguments parsed from the callee's signature.

    A callee candidate is skipped when either disassembly is empty or has
    more entries than half of ``config['tokenized_disassembly_length']``, or
    when the argument count could not be determined (``-1``). The same length
    limit is applied again after tokenization.

    Records are indexed positionally: ``[0]`` function signature, ``[2]``
    function name, ``[4]`` AT&T disassembly lines, ``[7]`` binary name.
    # NOTE(review): meaning of the remaining fields is not visible here.

    Args:
        tarbz2_file: Path of the ``*.tar.bz2`` archive to process.
        work_dir: Directory the archive is unpacked into. It is concatenated
            with the file name as-is, so it must end with a path separator.
        save_dir: Unused; the output location is read from
            ``config['save_dir']``.
        config: Mapping providing ``tokenized_disassembly_length``,
            ``save_file_type`` and ``save_dir``.

    Returns:
        The number of samples that were collected.
    """
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)

    # One pass collects the binary names (only used for the log line below)
    # and an index (binary, function name) -> records in file order, so a
    # callee can be looked up without rescanning the whole pickle.
    binaries = set()
    records_by_key = {}
    for record in pickle_file_content:
        binaries.add(record[7])
        records_by_key.setdefault((record[7], record[2]), []).append(record)

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = []

    # Every record is visited exactly once, in file order. The former
    # binaries x functions x records scan visited the same records, only in
    # hash-dependent set order.
    for caller in pickle_file_content:
        att_dis = caller[4]

        for line in att_dis:
            if not disassembly_lib.find_call_in_disassembly_line(line):
                continue

            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                line)

            # Candidates: same binary, function name equal to the callee.
            for callee in records_by_key.get((caller[7], callee_name), ()):
                # Caller and callee each get half of the sequence budget.
                # Longer sequences blow up memory (>160GB ram) in model.fit().
                half_length = int(config['tokenized_disassembly_length']) / 2

                if (len(callee[4]) > half_length
                        or len(att_dis) > half_length
                        or len(callee[4]) < 1 or len(att_dis) < 1):
                    # Cheap pre-check on the raw disassembly; try the next
                    # record with the same name.
                    continue

                nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                    callee[0])
                if nr_of_args == -1:
                    print('Error nr_of_args')
                    continue

                print(f'nr_of_args >{nr_of_args}<', end='\r')

                caller_dis = disassembly_lib.clean_att_disassembly_from_comment(
                    att_dis)
                callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                    callee[4])

                dis1_str = disassembly_lib.split_disassembly(
                    ' '.join(caller_dis))
                dis2_str = disassembly_lib.split_disassembly(
                    ' '.join(callee_dis))

                caller_ok = 1 <= len(dis1_str) <= half_length
                callee_ok = 1 <= len(dis2_str) <= half_length
                if caller_ok and callee_ok:
                    dataset_list.append((dis1_str + dis2_str, nr_of_args))
                    counter += 1
                else:
                    print(
                        f'tokenized_disassembly_length caller >{len(dis1_str)}<'
                    )
                    print(
                        f'tokenized_disassembly_length callee >{len(dis2_str)}<'
                    )

                # First usable callee decides; stop searching for this call.
                break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            out_path = config['save_dir'] + os.path.basename(
                pickle_file).replace('.tar.bz2', '')
            # Context manager closes the file even if pickling fails.
            with open(out_path, 'wb+') as ret_file:
                pickle.dump(dataset_list, ret_file)
        else:
            ## save as tfrecord
            dis_list = [sample[0] for sample in dataset_list]
            ret_list = [sample[1] for sample in dataset_list]

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
def proc_build(tarbz2_file, work_dir, save_dir, config):
    """Build (caller + callee disassembly, callee return type) samples.

    The archive is unpacked into ``work_dir`` and the pickle it contains is
    loaded. For every function record, each call found in its AT&T
    disassembly is resolved to a record of the same binary whose function
    name equals the callee name. The tokenized caller and callee
    disassemblies are concatenated and stored together with the return type
    derived from the callee's gdb ptype output.

    Callee candidates whose return type is ``'unknown'``, ``'delete'`` or
    ``'process_further'`` are skipped and the search continues with the next
    record of the same name.

    Records are indexed positionally: ``[0]`` function signature, ``[1]``
    gdb ptype output, ``[2]`` function name, ``[4]`` AT&T disassembly lines,
    ``[7]`` binary name.
    # NOTE(review): meaning of the remaining fields is not visible here.

    Args:
        tarbz2_file: Path of the ``*.tar.bz2`` archive to process.
        work_dir: Directory the archive is unpacked into. It is concatenated
            with the file name as-is, so it must end with a path separator.
        save_dir: Unused; the output location is read from
            ``config['save_dir']``.
        config: Mapping providing ``save_file_type`` and ``save_dir``.

    Returns:
        The number of samples that were collected.
    """
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)

    # One pass collects the binary names (only used for the log line below)
    # and an index (binary, function name) -> records in file order, so a
    # callee can be looked up without rescanning the whole pickle.
    binaries = set()
    records_by_key = {}
    for record in pickle_file_content:
        binaries.add(record[7])
        records_by_key.setdefault((record[7], record[2]), []).append(record)

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = []

    # Every record is visited exactly once, in file order. The former
    # binaries x functions x records scan visited the same records, only in
    # hash-dependent set order.
    for caller in pickle_file_content:
        att_dis = caller[4]

        for line in att_dis:
            if not disassembly_lib.find_call_in_disassembly_line(line):
                continue

            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                line)

            # Candidates: same binary, function name equal to the callee.
            for callee in records_by_key.get((caller[7], callee_name), ()):
                return_type_func_sign = return_type_lib.get_return_type_from_function_signature(
                    callee[0])
                return_type = return_type_lib.get_return_type_from_gdb_ptype(
                    callee[1])

                if return_type == 'unknown':
                    # Debug aid: show what could not be classified. This
                    # used to reference an undefined name and raised
                    # NameError instead of printing.
                    print(
                        f'string_before_func_name: {return_type_func_sign}'
                    )
                    continue
                if return_type == 'delete':
                    # No return type found, so drop this candidate.
                    continue
                if return_type == 'process_further':
                    print('ERRROOOORRRR---------------')
                    continue

                dis1_str = disassembly_lib.split_disassembly(
                    ' '.join(att_dis))
                dis2_str = disassembly_lib.split_disassembly(
                    ' '.join(callee[4]))

                dataset_list.append((dis1_str + dis2_str, return_type))
                counter += 1
                # First usable callee decides; stop searching for this call.
                break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            out_path = config['save_dir'] + os.path.basename(
                pickle_file).replace('.tar.bz2', '')
            # Context manager closes the file even if pickling fails.
            with open(out_path, 'wb+') as ret_file:
                pickle.dump(dataset_list, ret_file)
        else:
            ## save as tfrecord
            dis_list = [sample[0] for sample in dataset_list]
            ret_list = [sample[1] for sample in dataset_list]

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
Exemplo n.º 4
0
def main():
    """Look up the source code of every function stored in the pickle archives.

    For each ``*.tar.bz2`` file found in ``config['pickle_dir']`` the archive
    is unpacked into ``config['work_dir']``, the matching source package is
    installed, and the sources are checked against the binary (a second
    source tree is unpacked when the first check fails). Then, for every
    function record in the pickle, all candidate source files are searched
    for the function's source code and the findings are printed.

    Processing of one pickle file stops at the first function for which the
    number of candidate files yielding source code is not exactly one.
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ###loop through all pickle.tar.bz2 files
    for pickle_file in pickle_files:
        print(f"Untar pickle-file >{pickle_file}< to >{config['work_dir']}<")

        tarbz2_lib.untar_file_to_path(config['pickle_dir'] + pickle_file,
                                      config['work_dir'])

        ###install source-package of pickle-file-content
        pickle_file_name = os.path.basename(pickle_file)
        pickle_file_name = pickle_file_name.replace('.pickle.tar.bz2', '')

        gdb_lib.install_source_package(pickle_file_name, config)

        ###check with gdb (list cmd) if the sources are newer/older than binary
        ## warning: Source file is more recent than executable.
        dir_name = config['ubuntu_src_pkgs']
        print(f'Dir with src is:{dir_name}')
        res = check_if_src_match_binary(pickle_file_name, dir_name, config)

        ##src and binary dont match, unpack the second src in the dir
        if not res:
            unpack_second_src(pickle_file_name)
            # Pass config here as well, consistent with the first check
            # above (the retry previously omitted it).
            res = check_if_src_match_binary(pickle_file_name, dir_name,
                                            config)
            print(f'res of second src dir: {res}')
        else:
            print('src match binary')

        ###open the pickle
        print(
            f"Open untarred pickle file: >{config['work_dir'] + pickle_file}<")
        pickle_content = open_pickle(config['work_dir'] +
                                     pickle_file.replace('.tar.bz2', ''))

        # These values are the same for every record of this pickle file,
        # so they are computed once; they are still printed per record.
        pkg_name = os.path.basename(
            pickle_file.replace('.pickle.tar.bz2', ''))
        pkg_src_name = config['ubuntu_src_pkgs']

        ### loop through the pickle-file and get source-code from function
        for funcSign, gdb_ret_type, func_name, file_name, disas_att, disas_intel, package_name, binary in pickle_content:
            print(f'funcSign: {funcSign}')
            print(f'func_name: {func_name}')
            print(f'file_name: {file_name}')
            print(f'package_name: {package_name}')
            print(f'binary: {binary}')

            ### get source code of function
            print(f'pkg_name:{pkg_name}')
            print(f'pkg_src_name:{pkg_src_name}')

            full_path = get_full_path(pkg_src_name, file_name)
            print(f'full-path:{full_path}')

            len_full_path = len(full_path)
            nr_of_empty_src_code = 0

            ### ctags does not get return-type if its located lines above func_name
            ### gdb funcSign got it, we need to check if we need more lines than ctags tells us
            for candidate in full_path:
                src_code = get_source_code(candidate, func_name, funcSign)
                if src_code:
                    print(f'src-code:{src_code}')
                else:
                    print('no src-code found')
                    nr_of_empty_src_code += 1

            print(
                f'nr_of_empty_src_code:{nr_of_empty_src_code}   len_full_path:{len_full_path}'
            )
            # Exactly one candidate file yielded source code when all but
            # one of the lookups came back empty.
            if len_full_path == nr_of_empty_src_code + 1:
                print('only found one source code, thats good')
            else:
                print('ERROR found more than one source code for a function')
                # Leaves only the per-function loop; the next pickle file
                # is still processed.
                break