Example #1
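Embeds a single LLVM IR statement: the statement is run through the inst2vec preprocessing pipeline (structure inlining and identifier abstraction), the abstracted statement is looked up in the vocabulary (falling back to the !UNK token), and the matching embedding is returned as a (1, 1, embedding_dim) torch tensor on the requested device.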
def prepareInput(self, input, device):
    # Run the inst2vec preprocessing pipeline on a single LLVM IR statement
    preprocessed_input, _ = inst2vec_preprocess.preprocess([[input]])
    struct_dict = inst2vec_vocab.GetStructDict(preprocessed_input[0])
    preprocessed = inst2vec_vocab.PreprocessLlvmBytecode(
        preprocessed_input[0], struct_dict)
    # Look up the vocabulary id of the abstracted statement, falling back
    # to the unknown token
    vocab_id = self.dictionary.get(preprocessed[0], self.dictionary["!UNK"])
    output = self.embeddings[vocab_id]
    # Return the embedding as a (1, 1, embedding_dim) tensor on the device
    return torch.tensor(output, dtype=torch.float,
                        device=device).view(1, 1, -1)
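A minimal usage sketch, assuming the inst2vec_preprocess and inst2vec_vocab modules from the NCC code base are importable and that prepareInput above is defined at module scope. The SimpleNamespace stand-in, the two-entry toy vocabulary, and the random embedding matrix are illustrative assumptions, not artifacts from the source:

import types

import numpy as np
import torch

# Stand-in for the object that normally carries the inst2vec artifacts;
# the toy vocabulary and random embeddings are purely illustrative.
encoder = types.SimpleNamespace(
    dictionary={"!UNK": 0, "<%ID> = add nsw i32 <%ID>, <INT>": 1},
    embeddings=np.random.rand(2, 200).astype(np.float32),
)

# Call the function above directly, passing the stand-in as self.
vec = prepareInput(encoder, "%5 = add nsw i32 %4, 1", device="cpu")
print(vec.shape)  # torch.Size([1, 1, 200])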
Example #2
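A diagnostic variant: instead of an embedding it returns a tuple of (whether the statement hit the vocabulary, the preprocessed statement, the dictionary key that was used). With useClosest=True, an out-of-vocabulary statement is remapped to the known statement at the smallest Jaccard distance (via textdistance), and the mapping is cached in the dictionary so the search runs at most once per statement.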
def prepareInput(self, input, useClosest=False):
    # Run the inst2vec preprocessing pipeline on a single LLVM IR statement
    preprocessed_input, _ = inst2vec_preprocess.preprocess([[input]])
    struct_dict = inst2vec_vocab.GetStructDict(preprocessed_input[0])
    preprocessed = inst2vec_vocab.PreprocessLlvmBytecode(
        preprocessed_input[0], struct_dict)
    k = preprocessed[0]
    if useClosest and k not in self.dictionary:
        # Fall back to the known statement with the smallest Jaccard
        # distance and cache the mapping for future lookups
        k = min(self.dictionary,
                key=lambda x: textdistance.jaccard.distance(
                    x, preprocessed[0]))
        self.dictionary[preprocessed[0]] = self.dictionary[k]
    vocab_id = self.dictionary.get(k, self.dictionary["!UNK"])
    output = self.embeddings[vocab_id]
    # Report whether the statement was in the vocabulary, along with the
    # preprocessed statement and the dictionary key that was used
    return (vocab_id != self.dictionary["!UNK"], preprocessed[0], k)
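The same stand-in wiring as in Example #1 illustrates the fallback; textdistance is assumed importable, and the toy vocabulary is again an assumption, not part of the source:

import types

import numpy as np

encoder = types.SimpleNamespace(
    dictionary={"!UNK": 0, "<%ID> = add nsw i32 <%ID>, <INT>": 1},
    embeddings=np.random.rand(2, 200).astype(np.float32),
)

# A statement missing from the toy vocabulary: with useClosest=True it is
# remapped to the closest known key by Jaccard distance and cached.
in_vocab, stmt, key = prepareInput(encoder, "%5 = sub nsw i32 %4, 1",
                                   useClosest=True)
print(in_vocab, key)  # True <%ID> = add nsw i32 <%ID>, <INT>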
Example #3
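Converts a folder (or its immediate subfolders) of raw LLVM IR files into trainable index sequences: the IR is preprocessed with the inst2vec pipeline, structure types are inlined, identifiers are abstracted, every statement is mapped to its vocabulary index (cut-off and unseen statements become the unknown token), and each file's sequence is written to matching _seq.csv and _seq.rec files, with per-folder statistics printed along the way.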
def llvm_ir_to_trainable(folder_ir):

    ####################################################################################################################
    # Setup

    assert len(
        folder_ir) > 0, "Please specify a folder containing the raw LLVM IR"
    assert os.path.exists(folder_ir), "Folder not found: " + folder_ir
    # Derive the output folder name by substituting 'ir' with 'seq'
    # (note: this replaces every occurrence of 'ir' in the path)
    folder_seq = re.sub('ir', 'seq', folder_ir)
    print('Preparing to write LLVM IR index sequences to', folder_seq)
    if not os.path.exists(folder_seq):
        os.makedirs(folder_seq)

    # Get sub-folders if there are any
    listing = os.listdir(folder_ir + '/')
    folders_ir = list()
    folders_seq = list()
    found_subfolder = False
    for path in listing:
        if os.path.isdir(os.path.join(folder_ir, path)):
            folders_ir.append(os.path.join(folder_ir, path))
            folders_seq.append(os.path.join(folder_seq, path))
            found_subfolder = True
    if found_subfolder:
        print('Found', len(folders_ir), 'subfolders')
    else:
        print('No subfolders found in', folder_ir)
        folders_ir = [folder_ir]
        folders_seq = [folder_seq]

    # Loop over sub-folders
    summary = ''
    num_folders = len(folders_ir)
    for i, raw_ir_folder in enumerate(folders_ir):

        # Skip folders whose sequence output already exists and is non-empty
        seq_out_dir = folders_seq[i] + '/'
        if not os.path.exists(seq_out_dir) or len(os.listdir(seq_out_dir)) == 0:
            ############################################################################################################
            # Read files

            # Read data from folder
            print('\n--- Read data from folder', raw_ir_folder)
            raw_data, file_names = i2v_prep.read_data_files_from_folder(
                raw_ir_folder)

            # Print data statistics and release memory
            source_data_list, source_data = i2v_prep.data_statistics(
                raw_data, descr="reading data from source files")
            del source_data_list

            # Source code transformation: simple pre-processing
            print('\n--- Pre-process code')
            preprocessed_data, functions_declared_in_files = i2v_prep.preprocess(
                raw_data)
            # Keep the raw data (still containing structure definitions)
            # for the structure-inlining pass below
            preprocessed_data_with_structure_def = raw_data

            ############################################################################################################
            # Load vocabulary and cut off statements

            # Vocabulary files
            folder_vocabulary = FLAGS.vocabulary_dir
            dictionary_pickle = os.path.join(folder_vocabulary, 'dic_pickle')
            cutoff_stmts_pickle = os.path.join(folder_vocabulary,
                                               'cutoff_stmts_pickle')

            # Load dictionary and cutoff statements
            print('\tLoading dictionary from file', dictionary_pickle)
            with open(dictionary_pickle, 'rb') as f:
                dictionary = pickle.load(f)
            print('\tLoading cut off statements from file',
                  cutoff_stmts_pickle)
            with open(cutoff_stmts_pickle, 'rb') as f:
                stmts_cut_off = pickle.load(f)
            stmts_cut_off = set(stmts_cut_off)

            ############################################################################################################
            # IR processing (inline structures, abstract statements)

            # Source code transformation: inline structure types
            print('\n--- Inline structure types')
            processed_data, structures_dictionary = inline_struct_types_txt(
                preprocessed_data, preprocessed_data_with_structure_def)

            # Source code transformation: identifier processing (abstract statements)
            print('\n--- Abstract statements from identifiers')
            processed_data = abstract_statements_from_identifiers_txt(
                processed_data)

            ############################################################################################################
            # Write indexed sequence of statements
            seq_folder = folders_seq[i]
            if not os.path.exists(seq_folder):
                os.makedirs(seq_folder)

            # Write indexed sequence of statements to file
            unknown_counter_folder = list()
            seq_length_folder = list()
            file_counter = 0
            for file in processed_data:

                stmt_indexed = list()  # Construct indexed sequence
                unknown_counter = 0  # Reset unknown counter
                for stmt in file:

                    # check whether this is a label, in which case we ignore it
                    if re.match(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)',
                                stmt):
                        continue

                    # check whether this is an unknown
                    if stmt in stmts_cut_off:
                        stmt = rgx.unknown_token
                        unknown_counter += 1

                    # lookup and add to list
                    if stmt not in dictionary:
                        stmt = rgx.unknown_token
                        unknown_counter += 1

                    stmt_indexed.append(dictionary[stmt])

                # Write to csv and binary record files
                # ([:-3] strips the '.ll' extension from the file name)
                file_name_csv = os.path.join(
                    seq_folder, file_names[file_counter][:-3] + '_seq.csv')
                file_name_rec = os.path.join(
                    seq_folder, file_names[file_counter][:-3] + '_seq.rec')
                with open(file_name_csv, 'w') as csv_file, \
                        open(file_name_rec, 'wb') as rec_file:
                    for ind in stmt_indexed:
                        csv_file.write(str(ind) + '\n')
                        rec_file.write(struct.pack('I', int(ind)))

                # Increment counter
                unknown_counter_folder.append(unknown_counter)
                seq_length_folder.append(len(stmt_indexed))
                file_counter += 1

            # Print stats
            out = '\n\nFolder: {} ({}/{})'.format(raw_ir_folder, i + 1,
                                                  num_folders)
            out += '\n\nNumber of files processed: ' + str(
                len(seq_length_folder))
            out += '\n--- Sequence length stats:'
            out += '\nMin seq length    : {}'.format(min(seq_length_folder))
            out += '\nMax seq length    : {}'.format(max(seq_length_folder))
            out += '\nAvg seq length    : {}'.format(
                sum(seq_length_folder) / len(seq_length_folder))
            out += '\nTotal number stmts: {}'.format(sum(seq_length_folder))
            out += '\n--- UNK count stats:'
            out += '\nMin #UNKS in a sequence  : {}'.format(
                min(unknown_counter_folder))
            out += '\nMax #UNKS in a sequence  : {}'.format(
                max(unknown_counter_folder))
            out += '\nAvg #UNKS in a sequence  : {}'.format(
                sum(unknown_counter_folder) / len(unknown_counter_folder))
            out += '\nSum #UNKS in all sequences: {} / {}, {}%'.format(
                sum(unknown_counter_folder), sum(seq_length_folder),
                sum(unknown_counter_folder) * 100 / sum(seq_length_folder))
            print(out)
            summary += '\n' + out

    # When all is done, print a summary:
    print(summary)
    return folder_seq
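An invocation sketch, assuming the module-level dependencies of this function in the NCC/inst2vec code base (os, re, pickle, struct, rgx, i2v_prep, FLAGS, and the two *_txt helpers) are available, and that FLAGS.vocabulary_dir points at the folder containing dic_pickle and cutoff_stmts_pickle. The data path is a placeholder:

# Placeholder input folder; the output folder name is derived from it by
# substituting 'ir' with 'seq' ('data/ir' -> 'data/seq').
folder_seq = llvm_ir_to_trainable('data/ir')
print('Index sequences written to', folder_seq)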