def test_VocabularyZipFile_EncodeLlvmBytecode_struct_dict(
  vocab: vocabulary.VocabularyZipFile,
):
  """Test that struct appears in struct_dict."""
  # Request the struct dict in the result proto and check that the single
  # struct definition in the input bytecode was extracted.
  result = vocab.EncodeLlvmBytecode(
    BYTECODE_WITH_STRUCT,
    inst2vec_pb2.EncodeBytecodeOptions(set_struct_dict=True),
  )
  expected_struct_dict = {"%struct.Foo": "{ i32, i32 }"}
  assert dict(result.struct_dict) == expected_struct_dict
def test_VocabularyZipFile_EncodeLlvmBytecode_preprocessing(
  vocab: vocabulary.VocabularyZipFile,
):
  """Test output of pre-processing bytecode."""
  # Request that the intermediate pre-processed bytecode is included in the
  # result proto.
  options = inst2vec_pb2.EncodeBytecodeOptions(
      set_bytecode_after_preprocessing=True)
  result = vocab.EncodeLlvmBytecode(FIZZBUZZ_IR, options)
  # Pre-processing rewrites concrete identifiers, labels, and constants to
  # placeholder tokens: <@ID>, <%ID>, <LABEL>, <INT>.
  assert (result.bytecode_after_preprocessing == """\
define i32 <@ID>(i32)
<%ID> = alloca i32, align 4
<%ID> = alloca i32, align 4
store i32 <%ID>, i32* <%ID>, align 4
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = srem i32 <%ID>, <INT>
<%ID> = icmp eq i32 <%ID>, <INT>
br i1 <%ID>, label <%ID>, label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>
store i32 <INT>, i32* <%ID>, align 4
br label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>
store i32 <INT>, i32* <%ID>, align 4
br label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>, <LABEL>
<%ID> = load i32, i32* <%ID>, align 4
ret i32 <%ID>""")
def EncodeAndPadSourcesWithInst2Vec(
  df: pd.DataFrame,
  vocab: inst2vec_vocabulary.VocabularyZipFile,
  datafolder: pathlib.Path,
  max_sequence_len: typing.Optional[int] = None,
) -> typing.Tuple[np.ndarray, int]:
  """Encode and pad source code using inst2vec translation.

  Args:
    df: A DataFrame in which each row identifies a kernel source file (as
      resolved by DataFrameRowToKernelSrcPath).
    vocab: The inst2vec vocabulary used to encode LLVM bytecode.
    datafolder: Base directory that kernel source paths are resolved against.
    max_sequence_len: If set, pad/truncate every sequence to this length.
      If None, the length of the longest encoded sequence is used.

  Returns:
    A tuple of the encoded-and-padded sequence array (one row per dataframe
    row) and the sequence length used for padding.
  """
  sequence_lengths = []
  sequences = []
  # A map from source files to encoded sequences, as there can be multiple
  # entries in the dataframe using the same sequence.
  src_path_to_sequence = {}
  src_paths = list(
    set(
      DataFrameRowToKernelSrcPath(row, datafolder)
      for _, row in df.iterrows()))
  # Chunk the srcs and process in parallel.
  srcs_per_process = 16
  encode_args = [(src_paths[i:i + srcs_per_process], datafolder)
                 for i in range(0, len(src_paths), srcs_per_process)]
  # Use the pool as a context manager so the worker processes are reliably
  # terminated, even if encoding raises. (The original leaked the pool.)
  with multiprocessing.Pool() as pool:
    batches = pool.starmap(_EncodeSourceBatchOrDie, encode_args)
  for batch in batches:
    for src_file_path, bytecode in batch:
      app.Log(2, "Encoding %s", src_file_path.name)
      sequence = list(vocab.EncodeLlvmBytecode(bytecode).encoded)
      src_path_to_sequence[src_file_path] = sequence
  # Assemble one sequence per dataframe row, re-using the per-file encodings.
  for _, row in df.iterrows():
    src_file_path = DataFrameRowToKernelSrcPath(row, datafolder)
    sequence = src_path_to_sequence[src_file_path]
    sequence_lengths.append(len(sequence))
    sequences.append(sequence)
  if max_sequence_len is None:
    max_sequence_len = max(sequence_lengths)
  app.Log(
    2,
    "Sequence lengths: min=%d, avg=%.2f, max=%d",
    min(sequence_lengths),
    np.mean(sequence_lengths),
    max_sequence_len,
  )
  # Short sequences are padded with the unknown-token index.
  encoded = np.array(
    keras_sequence.pad_sequences(
      sequences, maxlen=max_sequence_len, value=vocab.unknown_token_index))
  # pad_sequences already yields a 2D array; the vstack preserves the exact
  # historic output shape/dtype for downstream callers.
  encoded = np.vstack([np.expand_dims(x, axis=0) for x in encoded])
  return encoded, max_sequence_len
def test_VocabularyZipFile_EncodeLlvmBytecode_encode_single_line(
  vocab: vocabulary.VocabularyZipFile,
):
  """Test encoding a single line of bytecode."""
  # A lone statement carries no struct definitions of its own, so the struct
  # dict is passed in explicitly.
  bytecode = "store %struct.Foo* %0, %struct.Foo** %2, align 8"
  result = vocab.EncodeLlvmBytecode(
    bytecode,
    options=inst2vec_pb2.EncodeBytecodeOptions(
      set_bytecode_after_preprocessing=True,
    ),
    struct_dict={"%struct.Foo": "{ i32, i32 }"},
  )
  expected = "store { i32, i32 }* <%ID>, { i32, i32 }** <%ID>, align 8"
  assert result.bytecode_after_preprocessing == expected
def test_VocabularyZipFile_EncodeLlvmBytecode_struct_not_in_preprocessed(
  vocab: vocabulary.VocabularyZipFile,
):
  """Test that struct is inlined during pre-processing."""
  options = inst2vec_pb2.EncodeBytecodeOptions(
      set_bytecode_after_preprocessing=True)
  result = vocab.EncodeLlvmBytecode(BYTECODE_WITH_STRUCT, options)
  # Every %struct.Foo reference has been replaced by its literal definition
  # "{ i32, i32 }" — the struct name itself never appears in the output.
  assert (result.bytecode_after_preprocessing == """\
define void <@ID>({ i32, i32 }*)
<%ID> = alloca { i32, i32 }*, align 8
store { i32, i32 }* <%ID>, { i32, i32 }** <%ID>, align 8
<%ID> = load { i32, i32 }*, { i32, i32 }** <%ID>, align 8
<%ID> = getelementptr inbounds { i32, i32 }, { i32, i32 }* <%ID>, i32 <INT>, i32 <INT>
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = load { i32, i32 }*, { i32, i32 }** <%ID>, align 8
<%ID> = getelementptr inbounds { i32, i32 }, { i32, i32 }* <%ID>, i32 <INT>, i32 <INT>
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = add nsw i32 <%ID>, <%ID>
store i32 <%ID>, i32* <%ID>, align 4
ret void""")
def test_VocabularyZipFile_EncodeLlvmBytecode_sequence(
  vocab: vocabulary.VocabularyZipFile,
):
  """Test that one index is emitted per input statement."""
  encoded = vocab.EncodeLlvmBytecode(FIZZBUZZ_IR).encoded
  # Function contains 14 statements.
  assert len(encoded) == 14
def test_VocabularyZipFile_EncodeLlvmBytecode_bytecode(
  vocab: vocabulary.VocabularyZipFile,
):
  """Test that bytecode is set in return value."""
  # The result proto echoes back the input bytecode unmodified.
  bytecode_in = FIZZBUZZ_IR
  result = vocab.EncodeLlvmBytecode(bytecode_in)
  assert bytecode_in == result.input_bytecode
def CreateSeqDirFromIr(folder_ir: str,
                       vocab: vocabulary.VocabularyZipFile) -> str:
  """Transform a folder of raw IR into trainable data to be used as input
  data in tasks.

  Args:
    folder_ir: The folder of LLVM IR to read. Must end in '_ir'.
    vocab: The vocabulary to use to encode IR.

  Returns:
    The path of the folder of sequences, ending in '_seq'.
  """
  # Setup.
  assert folder_ir, "Please specify a folder containing the raw LLVM IR"
  assert os.path.exists(folder_ir), "Folder not found: " + folder_ir
  folder_seq = re.sub("_ir$", "_seq", folder_ir)
  if folder_seq:
    app.Log(1, "Preparing to write LLVM IR index sequences to %s", folder_seq)
    if not os.path.exists(folder_seq):
      os.makedirs(folder_seq)

  # Get sub-folders if there are any.
  listing = os.listdir(folder_ir + "/")
  folders_ir = []
  folders_seq = []
  found_subfolder = False
  for path in listing:
    if os.path.isdir(os.path.join(folder_ir, path)):
      folders_ir.append(os.path.join(folder_ir, path))
      folders_seq.append(os.path.join(folder_seq, path))
      found_subfolder = True
  if found_subfolder:
    app.Log(1, "Found %d subfolders", len(folders_ir))
  else:
    # No sub-folders: treat the input folder itself as the only one.
    app.Log(1, "No subfolders found in %s", folder_ir)
    folders_ir = [folder_ir]
    folders_seq = [folder_seq]

  # Loop over sub-folders, skipping any whose output folder already has
  # content (makes the transformation resumable).
  for i, raw_ir_folder in enumerate(folders_ir):
    seq_dir = folders_seq[i] + "/"
    if not os.path.exists(seq_dir) or not os.listdir(seq_dir):
      # Read data from folder.
      raw_data, file_names = read_data_files_from_folder(raw_ir_folder)
      seq_folder = folders_seq[i]
      if not os.path.exists(seq_folder):
        os.makedirs(seq_folder)
      # Write indexed sequence of statements to files. NOTE(review): assumes
      # read_data_files_from_folder returns parallel lists of equal length,
      # and that every file name has a 3-character extension to strip.
      for bytecode, file_name in zip(raw_data, file_names):
        result = vocab.EncodeLlvmBytecode(bytecode)
        base_name = file_name[:-3]
        file_name_csv = os.path.join(seq_folder, base_name + "_seq.csv")
        file_name_rec = os.path.join(seq_folder, base_name + "_seq.rec")
        # Write each encoded index both as a text line (.csv) and as a
        # packed little unsigned 32-bit int record (.rec).
        with open(file_name_csv, "w") as csv_file, \
             open(file_name_rec, "wb") as rec_file:
          for ind in result.encoded:
            csv_file.write(str(ind) + "\n")
            rec_file.write(struct.pack("I", int(ind)))
  return folder_seq
def EncodeGraph(
  graph: llvm_util.LlvmControlFlowGraph,
  vocab: inst2vec_vocabulary.VocabularyZipFile,
  session: tf.compat.v1.Session,
  embedding_lookup_op,
  embedding_lookup_input_ph,
) -> llvm_util.LlvmControlFlowGraph:
  """Encode inst2vec attributes on an LLVM control flow graph.

  For every node in the graph, this adds two keys to the data dictionary:
  'inst2vec_encoded' containing the index into the vocabulary of the node,
  and 'inst2vec' which contains the numpy embedding array.

  Args:
    graph: The graph to encode. Must have an 'llvm_bytecode' graph attribute
      and a 'text' attribute on every node.
    vocab: The vocabulary to encode with.
    session: The TensorFlow session used to run the embedding lookup.
    embedding_lookup_op: The op that performs the embedding lookup.
    embedding_lookup_input_ph: The placeholder fed with encoded indices.

  Returns:
    The graph, mutated in place.

  Raises:
    ValueError: If a node's bytecode encodes to more than one statement.
  """
  # Encode the entire file with debugging options set. We need to process
  # the entire file so that we can get the struct_dict, which we will need
  # when encoding individual nodes. This could be made faster by simply
  # calling `vocab.GetStructDict(graph.graph['llvm_bytecode'].split('\n'))`,
  # but the extra debug information is useful.
  result = vocab.EncodeLlvmBytecode(
    graph.graph["llvm_bytecode"],
    inst2vec_pb2.EncodeBytecodeOptions(
      set_bytecode_after_preprocessing=True,
      set_unknown_statements=True,
      set_struct_dict=True,
    ),
  )
  # if len(result.encoded) != graph.number_of_nodes():
  #   raise ValueError(
  #       f"Encoded bytecode file contains {len(result.encoded)} statements, "
  #       f"but full flow graph contains {graph.number_of_nodes()} nodes. The "
  #       "two should be equal")
  # Protocol buffer maps aren't true dicts and have differing semantics.
  struct_dict = dict(result.struct_dict)
  # Set debug info as global graph attributes.
  graph.graph["num_unknown_statements"] = len(result.unknown_statements)
  graph.graph["struct_dict"] = struct_dict
  graph.graph[
    "llvm_bytecode_preprocessed"
  ] = result.bytecode_after_preprocessing
  for _, data in graph.nodes(data=True):
    bytecode = data["text"]
    # Encode the node's bytecode using the struct dict we derived from the
    # entire file. Since this is a full-flow graph, each instruction's
    # bytecode is a single statement.
    encoded = vocab.EncodeLlvmBytecode(
      bytecode, struct_dict=struct_dict
    ).encoded
    if len(encoded) != 1:
      raise ValueError(
        f"Encoded line `{bytecode}` to {len(encoded)} statements"
      )
    data["inst2vec_encoded"] = encoded[0]
    # Lookup the encoded value in the embedding matrix.
    # TODO(cec): This is a very slow way of doing it. Better would be to
    # collect the encoded values into an array and perform the embedding
    # lookup once.
    sequences = np.array(encoded, dtype=np.int32).reshape((1, 1))
    embedding_vector = session.run(
      embedding_lookup_op, feed_dict={embedding_lookup_input_ph: sequences}
    )
    data["inst2vec"] = embedding_vector[0][0]
  return graph