def __init__(self, *args, **kwargs):
  """Build an OpenCL encoder by lexing every source in the devmap dataset.

  Eagerly lexes all OpenCL kernels from the PACT'17 device-mapping dataset
  and builds `self.id_to_encoded`, a map from IR-database bytecode ID to the
  encoded token sequence, plus `self._max_encoded_length`.

  Raises:
    OSError: If any OpenCL relpath from the dataset has no matching row in
      the IR database.
  """
  super(OpenClEncoder, self).__init__(*args, **kwargs)
  # We start with an empty vocabulary and build it from inputs.
  self.lexer = lexers.Lexer(
      type=lexers.LexerType.OPENCL,
      vocabulary={},
      max_encoded_length=100000,
  )

  # Map relpath -> src. The device name selects which platform's columns are
  # used when flattening the dataframe; sources are device-independent.
  df = make_devmap_dataset.MakeGpuDataFrame(
      opencl_device_mapping_dataset.OpenClDeviceMappingsDataset().df,
      "amd_tahiti_7970",
  )
  relpath_to_src = {
      row["relpath"]: row["program:opencl_src"] for _, row in df.iterrows()
  }

  # Map relpath -> bytecode ID.
  with self.ir_db.Session() as session:
    relpath_to_id = {
        row.relpath: row.id
        for row in session.query(
            ir_database.IntermediateRepresentation.id,
            ir_database.IntermediateRepresentation.relpath,
        ).filter(
            ir_database.IntermediateRepresentation.source_language
            == ir_database.SourceLanguage.OPENCL,
            # NOTE: SQLAlchemy requires `== True` here, not `is True`.
            ir_database.IntermediateRepresentation.compilation_succeeded
            == True,
            ir_database.IntermediateRepresentation.source
            == "pact17_opencl_devmap",
            ir_database.IntermediateRepresentation.relpath.in_(
                relpath_to_src.keys()
            ),
        )
    }

  not_found = set(relpath_to_src.keys()) - set(relpath_to_id.keys())
  if not_found:
    raise OSError(
        f"{humanize.Plural(len(not_found), 'OpenCL relpath')} not"
        " found in IR database"
    )

  # Encode the OpenCL sources. BUG FIX: this was previously a *set*
  # comprehension, which discarded the deterministic sorted order (and
  # contradicted the List annotation); a list comprehension preserves it.
  sorted_id_src_pairs: List[Tuple[int, str]] = [
      (relpath_to_id[relpath], relpath_to_src[relpath])
      for relpath in sorted(relpath_to_src.keys())
  ]
  sorted_encodeds: List[np.array] = self.lexer.Lex(
      [src for _, src in sorted_id_src_pairs]
  )
  self._max_encoded_length = max(
      len(encoded) for encoded in sorted_encodeds
  )

  # Map id -> encoded. `zip` is safe because Lex() returns one encoding per
  # input, in input order.
  self.id_to_encoded = {
      bytecode_id: encoded
      for (bytecode_id, _), encoded in zip(
          sorted_id_src_pairs, sorted_encodeds
      )
  }
def lexer(
    lexer_type: lexers.LexerType,
    initial_vocab: Dict[str, int],
    max_chunk_size: int,
) -> lexers.Lexer:
  """A test fixture which returns a lexer."""
  # Forward the fixture parameters straight through to the constructor.
  constructor_args = {
      "type": lexer_type,
      "initial_vocab": initial_vocab,
      "max_chunk_size": max_chunk_size,
  }
  return lexers.Lexer(**constructor_args)
def __init__(self, *args, **kwargs):
  """Initialize the encoder from the pre-computed LLVM vocabulary file."""
  super(LlvmEncoder, self).__init__(*args, **kwargs)
  # The vocabulary file stores both the token vocabulary used for encoding
  # LLVM bytecode and the longest encoded sequence length it produced.
  with open(LLVM_VOCAB) as f:
    vocab_file = json.load(f)
  self._max_encoded_length = vocab_file["max_encoded_length"]
  self.lexer = lexers.Lexer(
      type=lexers.LexerType.LLVM,
      initial_vocab=vocab_file["vocab"],
  )
def lexer(
    lexer_type: lexers.LexerType,
    vocabulary: Dict[str, int],
    max_encoded_length: int,
) -> lexers.Lexer:
  """A test fixture which returns a lexer."""
  # Collect the fixture arguments and hand them to the constructor as-is.
  constructor_args = dict(
      type=lexer_type,
      vocabulary=vocabulary,
      max_encoded_length=max_encoded_length,
  )
  return lexers.Lexer(**constructor_args)