def compile_meta_block(self, blocks):
    """Compile a group of update blocks into one generated function.

    The generated function is named ``meta_block<N>``, where ``N`` is the
    current value of ``self.meta_block_id`` (incremented on every call).
    Its body calls each entry of ``blocks`` in order through the names
    ``blk0``, ``blk1``, ... that are injected into its globals.

    Args:
        blocks: Sequence of callables; each must have a ``__name__``.

    Returns:
        The compiled ``meta_block<N>`` function.
    """
    meta_id = self.meta_block_id
    self.meta_block_id += 1

    # Create custom global dict for all blocks inside the meta block
    _globals = {f"blk{i}": b for i, b in enumerate(blocks)}

    blk_srcs = []
    for i, b in enumerate(blocks):
        if b in self.branchiness:
            # This is a normal update block
            blk_srcs.append(
                f"blk{i}() # [br {self.branchiness[b]}, loop {int(self.only_loop_at_top[b])}] {b.__name__}"
            )
        else:
            # This is an SCC block which has zero BR and is a loop
            blk_srcs.append(f"blk{i}() # {b.__name__}")

    # An empty body would be a SyntaxError in the generated source.
    if not blk_srcs:
        blk_srcs.append("pass")

    gen_src = f"def meta_block{meta_id}():\n "
    gen_src += "\n ".join(blk_srcs)

    # use custom_exec to compile the meta block
    _locals = {}
    custom_exec(py.code.Source(gen_src).compile(), _globals, _locals)
    ret = _locals[f'meta_block{meta_id}']
    if _DEBUG:
        print(gen_src)

    # We will use pypyjit.dont_trace_here to compile standalone traces for
    # each meta block. This is best effort: the module only exists on PyPy.
    # Catch Exception (not a bare except) so that KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        from pypyjit import dont_trace_here
        dont_trace_here(0, False, ret.__code__)
    except Exception:
        pass

    return ret
def mk_bits(nbits):
    """Return the ``Bits<nbits>`` class, generating it on first request.

    Classes are looked up in ``_bits_types``; a missing width is created
    by executing ``bits_template`` formatted with ``nbits``, which
    registers the new class in ``_bits_types``.
    """
    assert nbits > 0, "We don't allow Bits0"
    # assert nbits < 512, "We don't allow bitwidth to exceed 512."

    # Fast path: the class for this width already exists.
    if nbits in _bits_types:
        return _bits_types[nbits]

    custom_exec(
        compile(bits_template.format(nbits), filename=f"Bits{nbits}", mode="exec"),
        globals(),
        locals(),
    )
    return _bits_types[nbits]
def gen_wrapped_SCCblk(s, scc, src):
    """Execute ``src`` and return the ``generated_block`` it defines.

    The source is executed with ``s``, a tick function generated for
    ``scc``, ``deepcopy`` and ``UpblkCyclicError`` available as globals.
    """
    # TODO mamba?
    tick = SimpleTickPass.gen_tick_function(scc)

    env = dict(
        s=s,
        scc_tick_func=tick,
        deepcopy=deepcopy,
        UpblkCyclicError=UpblkCyclicError,
    )
    namespace = {}
    custom_exec(py.code.Source(src).compile(), env, namespace)
    return namespace['generated_block']
def gen_hook_func(top, x, ports, case_file):
    """Build and return a ``dump_case`` function for the given ports.

    The generated function prints one "`T(...);" line to ``case_file``
    containing ``'h<bits>`` for every name in ``ports`` (read from ``x``),
    but only once ``top.sim_cycle_count()`` exceeds 2.
    """
    # Each entry becomes a replacement field of the f-string in the
    # generated print call, hence the doubled braces.
    fields = ",".join(f"'h{{str(x.{p}.to_bits())}}" for p in ports)

    src = """
def dump_case():
  if top.sim_cycle_count() > 2: # skip the 2 cycles of reset
    print(f"`T({});", file=case_file, flush=True)
""".format(fields)

    env = {'top': top, 'x': x, 'case_file': case_file}
    namespace = {}
    custom_exec(py.code.Source(src).compile(), env, namespace)
    return namespace['dump_case']
def _create_fn(fn_name, args_lst, body_lst, _globals=None):
    """Generate, compile and return a function from source fragments.

    Args:
        fn_name: Name of the function to define.
        args_lst: Strings joined with ", " to form the parameter list.
        body_lst: Statements, one per line, forming the function body.
        _globals: Globals for the generated function; a fresh dict is
            used when this is None.

    Returns:
        The function object named ``fn_name``.
    """
    signature = ', '.join(args_lst)
    indented = [f' {statement}' for statement in body_lst]
    src = f'def {fn_name}({signature}):\n' + '\n'.join(indented)

    env = {} if _globals is None else _globals
    scope = {}
    custom_exec(py.code.Source(src).compile(), env, scope)
    return scope[fn_name]
def compile_scc(i):
    """Compile the i-th strongly connected component (SCC) into one callable.

    A single-block SCC is returned as that block. Otherwise the blocks are
    ordered with a BFS heuristic, and Python source for a
    ``wrapped_SCC_<id>`` function is generated and executed. That function
    snapshots the signals that connect blocks inside the SCC, runs the
    blocks, and repeats until none of the snapshotted signals changed
    (raising ``UpblkCyclicError`` after more than 100 iterations).

    Reads ``SCCs``, ``scc_pred``, ``onces``, ``E``, ``G``, ``G_T``,
    ``constraint_objs``, ``top`` and ``self`` from the enclosing scope and
    increments the enclosing ``scc_id``.

    Args:
        i: Index into ``SCCs``.

    Returns:
        The only block of a trivial SCC, or the generated wrapper function.

    Raises:
        UpblkCyclicError: If a block of the SCC is in ``onces``, or if no
            variables are found on the edges inside the SCC.
    """
    nonlocal scc_id

    scc = SCCs[i]

    # Trivial SCC: no loop needed, hand back the block itself.
    if len(scc) == 1:
        return list(scc)[0]

    for x in scc:
        if x in onces:
            raise UpblkCyclicError("update_once blocks are not allowed to appear in a cycle. \n - " + \
                "\n - ".join( [ f"{y.__name__} ({'@update_once' if y in onces else '@update'} " \
                                f"in 'top.{repr(top.get_update_block_host_component(y))[2:]}')" for y in scc] ))

    scc_id += 1
    if _DEBUG:
        print(f"{'='*100}\n SCC{scc_id}\n{'='*100}")

    # For each non-trivial SCC, we need to figure out an intra-SCC
    # linear schedule that minimizes the time to re-execute this SCC
    # due to value changes. A bad schedule may inefficiently execute
    # the SCC for many times, each of which changes a few signals.
    # The current algorithm iteratively finds the "entry block" of
    # the SCC and expands its adjacent blocks. The implementation is
    # to first find the actual entry point, and then BFS to expand the
    # footprint until all nodes are visited.

    tmp_schedule = []
    Q = deque()

    if scc_pred[i] is None:
        # We start bfs from the block that has the least number of input
        # edges in the SCC
        # NOTE(review): the code below selects max(InD), i.e. the block
        # with the MOST in-SCC incoming edges, which disagrees with the
        # comment above -- confirm which one is intended.
        InD = {v: 0 for v in scc}
        for (u, v) in E:  # u -> v
            if u in scc and v in scc:
                InD[v] += 1
        Q.append(max(InD, key=InD.get))

    else:
        # We start bfs with the blocks that are successors of the
        # predecessor scc in the previous SCC-level topological sort.
        pred = set(SCCs[scc_pred[i]])
        # Sort by names for a fixed outcome
        # NOTE(review): a block with several predecessors in `pred` is
        # appended once per such predecessor, so it can appear more than
        # once in Q and therefore in tmp_schedule -- confirm this is
        # acceptable.
        for x in sorted(scc, key=lambda x: x.__name__):
            for v in G_T[ x]:  # find reversed edges point back to pred SCC
                if v in pred:
                    Q.append(x)

    # Perform bfs to find a heuristic schedule
    visited = set(Q)
    while Q:
        u = Q.popleft()
        tmp_schedule.append(u)
        for v in G[u]:
            if v in scc and v not in visited:
                Q.append(v)
                visited.add(v)

    variables = set()
    for (u, v) in E:
        # Collect all variables that trigger other blocks in the SCC
        if u in scc and v in scc:
            variables.update(constraint_objs[(u, v)])

    if len(variables) == 0:
        raise UpblkCyclicError("There is a cyclic dependency without involving variables."
                               "Probably a loop that involves blocks that should be update_once:\n{}"\
                               .format(", ".join( [ x.__name__ for x in scc] )))

    # generate a loop for scc
    # Shunning: we just simply loop over the whole SCC block
    # TODO performance optimizations using Mamba techniques within a SCC block
    #
    # Template fields: {0} SCC id, {1} snapshot statements, {3} block
    # calls, {2} change checks (each "continue"s the loop when a value
    # differs), {4} names of the blocks for the error message.
    template = """
from copy import deepcopy
def wrapped_SCC_{0}():
  N = 0
  while True:
    N += 1
    if N > 100:
      raise UpblkCyclicError("Combinational loop detected at runtime in {{{4}}} after 100 iters!")
    {1}
    {3}
    {2}
    # print( "SCC block{0} is executed", N, "times" )
    break
generated_block = wrapped_SCC_{0}
"""

    # clean up non-top variables if top is there. For slices of Bits
    # we directly use the top level wide Bits since Bits clone is
    # rpython code
    final_variables = set()

    for x in sorted(variables, key=repr):
        w = x.get_top_level_signal()
        if w is x:
            final_variables.add(x)
            continue

        # w is not x
        if issubclass(w._dsl.Type, Bits):
            if w not in final_variables:
                final_variables.add(w)
        elif is_bitstruct_class(w._dsl.Type):
            if w not in final_variables:
                final_variables.add(x)
        else:
            final_variables.add(x)

    # also group them by common ancestor to reduce byte code
    # TODO use longest-common-prefix (LCP) algorithms ...
    final_var_host = defaultdict(list)
    for x in final_variables:
        final_var_host[x.get_host_component()].append(x)

    # Then, we generate the Python code that saves variables at the
    # beginning of each SCC iteration and the code that checks if the
    # values of those variables have changed
    copy_srcs = []
    check_srcs = []
    var_id = 0
    for host, var_list in final_var_host.items():
        hostlen = len(repr(host))
        copy_srcs.append(f"host = {host!r}")
        check_srcs.append(f"host = {host!r}")
        sub_check_srcs = []
        for var in var_list:
            var_id += 1
            # Path of the variable relative to its host (skips "<host>.")
            subname = repr(var)[hostlen + 1:]
            if issubclass(var._dsl.Type, Bits):
                copy_srcs.append(f"t{var_id}=host.{subname}.clone()")
            elif is_bitstruct_class(var._dsl.Type):
                copy_srcs.append(f"t{var_id}=host.{subname}.clone()")
            else:
                copy_srcs.append(f"t{var_id}=deepcopy(host.{subname})")
            sub_check_srcs.append(f"host.{subname} != t{var_id}")
        check_srcs.append(
            f"if { ' or '.join(sub_check_srcs)}: continue")

    # Divide all blks into meta blocks
    # Branchiness factor is the bound of branchiness in a meta block.
    branchiness_factor = 20
    branchy_block_factor = 6

    num_blks = 0  # sanity check
    cur_meta, cur_br, cur_count = [], 0, 0
    scc_schedule = []

    _globals = {'s': top, 'UpblkCyclicError': UpblkCyclicError}
    blk_srcs = []

    # If there are fewer than 10 blocks, we directly unroll them
    if len(tmp_schedule) < 10:
        blk_srcs = []
        for i, b in enumerate(tmp_schedule):
            blk_srcs.append(
                f"blk{i}() # [br {self.branchiness[b]}, loop {int(self.only_loop_at_top[b])}] {b.__name__}"
            )
            _globals[f"blk{i}"] = b  # put it into the block's closure

    else:
        for i, blk in enumerate(tmp_schedule):
            # Same here. If an update block only has top-level loop, br = 0
            br = 0 if self.only_loop_at_top[blk] else self.branchiness[
                blk]
            if cur_br == 0:
                cur_meta.append(blk)
                cur_br += br
                cur_count += (br > 0)
                if cur_br >= branchiness_factor or cur_count >= branchy_block_factor:
                    num_blks += len(cur_meta)
                    scc_schedule.append(cur_meta)
                    cur_meta, cur_br, cur_count = [], 0, 0  # clear
            else:
                if br == 0:
                    # If no branchy block available, directly start a new metablock
                    num_blks += len(cur_meta)
                    scc_schedule.append(cur_meta)
                    cur_meta, cur_br, cur_count = [blk], br, (br > 0)
                else:
                    cur_meta.append(blk)
                    cur_br += br
                    cur_count += (br > 0)
                    if cur_br + br >= branchiness_factor or cur_count + 1 >= branchy_block_factor:
                        num_blks += len(cur_meta)
                        scc_schedule.append(cur_meta)
                        cur_meta, cur_br, cur_count = [], 0, 0  # clear

        # Flush the meta block that was still being filled
        if cur_meta:
            num_blks += len(cur_meta)
            scc_schedule.append(cur_meta)

        assert num_blks == len(tmp_schedule), f"Some blocks are missing during trace breaking of SCC "\
                                              f"({num_blks} compiled, {len(tmp_schedule)} total)"

        blk_srcs = []
        if len(scc_schedule) == 1:
            for i, b in enumerate(scc_schedule[-1]):
                blk_srcs.append(
                    f"blk{i}() # [br {self.branchiness[b]}, loop {int(self.only_loop_at_top[b])}] {b.__name__}"
                )
                _globals[f"blk{i}"] = b
        else:
            # TODO we might turn all meta blocks before the last one into meta
            # blocks, and directly fold the last block into the main loop
            # for i, meta in enumerate( scc_schedule[:-1] ):
            #   b = self.compile_meta_block( meta )
            #   blk_srcs.append( f"{b.__name__}()" )
            #   _globals[ b.__name__ ] = b
            # for i, b in enumerate( scc_schedule[-1] ):
            #   blk_srcs.append( f"blk_of_last_meta{i}() # [br {self.branchiness[b]}, loop {int(self.only_loop_at_top[b])}] {b.__name__}" )
            #   _globals[ f"blk_of_last_meta{i}" ] = b
            for i, meta in enumerate(scc_schedule):
                b = self.compile_meta_block(meta)
                blk_srcs.append(f"{b.__name__}()")
                _globals[b.__name__] = b

    # The join separators below carry the indentation of the {1}/{2}/{3}
    # lines in the template.
    scc_block_src = template.format(
        scc_id, "; ".join(copy_srcs), "\n    ".join(check_srcs),
        '\n    '.join(blk_srcs), ", ".join([x.__name__ for x
                                            in scc]))

    if _DEBUG:
        print(scc_block_src, "\n", "=" * 100)

    _locals = {}
    custom_exec(
        py.code.Source(scc_block_src).compile(), _globals, _locals)
    return _locals['generated_block']
def schedule_posedge_flip(self, top):
    """Generate the function that flips all double-buffered signals.

    Stores a one-element list in ``top._sched.schedule_posedge_flip``
    holding either a no-op (no signal needs double buffering) or a
    generated ``double_buffer`` function that calls ``_flip()`` on every
    signal whose ``_dsl.needs_double_buffer`` is set.

    Raises:
        Exception: If ``top`` has no ``_sched`` attribute.
    """
    if not hasattr(top, "_sched"):
        raise Exception("Please create top._sched pass metadata namespace first!")

    # To reduce the time to compile the code and the amount of bytecode, I
    # use a heuristic to group signals that belong to the same host:
    #   s.x.y.z._flip()
    #   s.x.y.zz._flip()
    # becomes
    #   x = s.x.y
    #   x.z._flip()
    #   x.zz._flip()

    by_level = sorted(
        top._dsl.all_signals,
        key=lambda sig: sig.get_host_component().get_component_level(),
    )
    hostobj_signals = defaultdict(list)
    for sig in reversed(by_level):
        if sig._dsl.needs_double_buffer:
            hostobj_signals[sig.get_host_component()].append(sig)

    # Keep pushing lone signals up to the parent of their host until a
    # full pass moves nothing.
    while True:
        regrouped = defaultdict(list)
        moved = False
        for host, sigs in hostobj_signals.items():
            if len(sigs) > 1 or host is top:
                regrouped[host].extend(sigs)
            else:
                regrouped[host.get_parent_object()].append(sigs[0])
                moved = True
        hostobj_signals = regrouped
        if not moved:
            break

    # Statements of the generated double_buffer() body
    strs = []
    for host, sigs in hostobj_signals.items():
        if len(sigs) == 1:
            strs.append(f"    {sigs[0]!r}._flip()")
        elif host is top:
            strs.extend(f"    {z!r}._flip()" for z in sorted(sigs, key=repr))
        else:
            repr_x = repr(host)
            pos = len(repr_x) + 1
            strs.append(f"    x = {repr_x}")
            strs.extend(
                f"    x.{repr(z)[pos:]}._flip()" for z in sorted(sigs, key=repr)
            )

    if not strs:
        def no_double_buffer():
            pass
        top._sched.schedule_posedge_flip = [no_double_buffer]
        return

    lines = [
        'def compile_double_buffer( s ):',
        '  def double_buffer():',
        *strs,
        '  return double_buffer',
    ]

    # Shunning: The reason why we replace py.code.Source with exec(compile()) + linecache
    # is because py.code.Source takes a full source code and divides it into
    # a list of lines by newline character which scales very very poorly
    # when the source code is huge. For some designs with 10K+ flip-flops
    # the performance overhead becomes huge.
    l = locals()
    custom_exec(compile('\n'.join(lines), filename='ff_flips', mode='exec'), globals(), l)
    linecache.cache['ff_flips'] = (1, None, lines, 'ff_flips')
    top._sched.schedule_posedge_flip = [l['compile_double_buffer'](top)]
def compile_net_blk(_globals, src, writer):
    """Compile the source of a net block and return the object it defines.

    The source is compiled under a pseudo-filename derived from ``writer``
    and registered in the line cache under that name.

    Args:
        _globals: Globals dict the source is executed with.
        src: Python source that defines the net block.
        writer: Object whose ``repr`` identifies the net in the filename.

    Returns:
        The first object bound in the executed source's local namespace.
    """
    _locals = {}
    # Pseudo-filename; the closing parenthesis was missing originally.
    fname = f"Net (writer is {writer!r})"
    custom_exec(compile(src, filename=fname, mode="exec"), _globals, _locals)
    # Line-cache entries hold lines with their line endings, as read from
    # a real file, so keep the newline characters.
    line_cache[fname] = (len(src), None, src.splitlines(keepends=True), fname)
    return list(_locals.values())[0]
def schedule_posedge_flip(self, top):
    """Generate the function that flips all double-buffered signals.

    Stores a one-element list in ``top._sched.schedule_posedge_flip``
    holding either a no-op (no signal needs double buffering) or a
    generated ``double_buffer`` function that calls ``_flip()`` on every
    signal whose ``_dsl.needs_double_buffer`` is set.

    Raises:
        Exception: If ``top`` has no ``_sched`` attribute.
    """
    if not hasattr(top, "_sched"):
        raise Exception(
            "Please create top._sched pass metadata namespace first!")

    # To reduce the time to compile the code and the amount of bytecode, I
    # use a heuristic to group signals that belong to the same host:
    #   s.x.y.z._flip()
    #   s.x.y.zz._flip()
    # becomes
    #   x = s.x.y
    #   x.z._flip()
    #   x.zz._flip()

    def host_level(sig):
        return sig.get_host_component().get_component_level()

    hostobj_signals = defaultdict(list)
    for sig in reversed(sorted(top._dsl.all_signals, key=host_level)):
        if sig._dsl.needs_double_buffer:
            hostobj_signals[sig.get_host_component()].append(sig)

    # Hosts left with a single signal hand it to their parent; repeat
    # until one pass makes no such move.
    settled = False
    while not settled:
        settled = True
        grouped = defaultdict(list)
        for host, members in hostobj_signals.items():
            if host is not top and len(members) <= 1:
                grouped[host.get_parent_object()].append(members[0])
                settled = False
            else:
                grouped[host].extend(members)
        hostobj_signals = grouped

    # Statements of the generated double_buffer() body
    strs = []
    for host, members in hostobj_signals.items():
        if len(members) == 1:
            strs.append(f"{members[0]!r}._flip()")
            continue
        ordered = sorted(members, key=repr)
        if host is top:
            for z in ordered:
                strs.append(f"{z!r}._flip()")
        else:
            pos = len(repr(host)) + 1
            strs.append(f"x = {host!r}")
            for z in ordered:
                strs.append(f"x.{repr(z)[pos:]}._flip()")

    if not strs:
        def no_double_buffer():
            pass
        top._sched.schedule_posedge_flip = [no_double_buffer]
        return

    # The join separator carries the indentation of the {} line below.
    src = """
def compile_double_buffer( s ):
  def double_buffer():
    {}
  return double_buffer
""".format("\n    ".join(strs))

    import py
    # print(src)
    l = locals()
    custom_exec(py.code.Source(src).compile(), globals(), l)
    top._sched.schedule_posedge_flip = [l['compile_double_buffer'](top)]
def _create_assign_lambda(s, o, lamb):
    """Turn ``o //= lambda: <expr>`` into an update block ``o @= <expr>``.

    The source of ``lamb`` is parsed, the statement is rebuilt as an
    ``@=`` assignment to ``o`` inside a generated function named
    ``_lambda__<signal name>``, and that function is compiled inside a
    wrapper that re-creates the lambda's free variables in its closure.
    The block is registered through ``ComponentLevel1._update`` and
    ``s._cache_func_meta``.

    Args:
        s: Component that hosts the generated update block.
        o: Signal being assigned; must be a ``Signal``.
        lamb: Zero-argument lambda providing the right-hand side.

    Returns:
        The generated update block function.

    Raises:
        AssertionError: If ``o`` is not a ``Signal``, the source is not a
            single ``//=`` statement, or the lambda takes arguments.
    """
    import sys  # only needed for the AST version switch below

    assert isinstance(o, Signal), \
        "You can only assign(//=) a lambda function to a Wire/InPort/OutPort."

    srcs, line = inspect.getsourcelines(lamb)

    src = compiled_re.sub(r'\2', ''.join(srcs)).lstrip(' ')
    root = ast.parse(src)
    assert isinstance(root, ast.Module) and len(root.body) == 1, \
        "We only support single-statement lambda."

    root = root.body[0]
    assert isinstance(root, ast.AugAssign) and isinstance(root.op, ast.FloorDiv)

    # lhs, rhs = root.target, root.value
    # Shunning: here we need to use ast from repr(o), because root.target
    # can be "m.in_" in some cases where we actually know what m is but the
    # source code still captures "m"
    lhs, rhs = ast.parse(f"s{repr(o)[len(repr(s)):]}").body[0].value, root.value
    lhs.ctx = ast.Store()

    # We expect the lambda to have no argument:
    # {'args': [], 'vararg': None, 'kwonlyargs': [], 'kw_defaults': [], 'kwarg': None, 'defaults': []}
    assert isinstance(rhs, ast.Lambda) and not rhs.args.args and rhs.args.vararg is None, \
        "The lambda shouldn't contain any argument."

    rhs = rhs.body

    # Compose a new and valid function based on the lambda's lhs and rhs
    # Note that we don't need to add those source code of closure var
    # assignment to linecache. To get the matching line number in the
    # error message, we set the line number of update block

    # Shunning: bugfix:
    blk_name = "_lambda__{}".format(
        repr(o).replace(".", "_").replace("[", "_").replace("]", "_").replace(":", "_"))

    # `posonlyargs` (on ast.arguments) and `type_ignores` (on ast.Module)
    # are required by compile() on Python 3.8-3.12. Older interpreters
    # simply store them as extra attributes, so passing them is harmless.
    lambda_upblk = ast.FunctionDef(
        name=blk_name,
        args=ast.arguments(posonlyargs=[], args=[], vararg=None, kwonlyargs=[],
                           kw_defaults=[], kwarg=None, defaults=[]),
        body=[
            ast.AugAssign(target=lhs, op=ast.MatMult(), value=rhs,
                          lineno=2, col_offset=6)
        ],
        decorator_list=[],
        returns=None,
        lineno=1,
        col_offset=4,
    )
    lambda_upblk_module = ast.Module(body=[lambda_upblk], type_ignores=[])

    # Manually wrap the lambda upblk with a closure function that adds the
    # desired variables to the closure of `_lambda__*`
    # We construct AST for the following function to add free variables in the
    # closure of the lambda function to the closure of the generated lambda
    # update block.
    #
    # def closure( lambda_closure ):
    #   <FreeVarName1> = lambda_closure[<Idx1>].cell_contents
    #   <FreeVarName2> = lambda_closure[<Idx2>].cell_contents
    #   ...
    #   <FreeVarNameN> = lambda_closure[<IdxN>].cell_contents
    #   def _lambda__<lambda_blk_name>():
    #     # the assignment statement appears here
    #   return _lambda__<lambda_blk_name>

    freevars = lamb.__code__.co_freevars

    closure_assigns = []
    for idx, var in enumerate(freevars):
        # ast.Num is deprecated in favour of ast.Constant.
        index_node = ast.Constant(value=idx, lineno=1 + idx,
                                  col_offset=19 + len(var))
        # Since Python 3.9 the index expression is the slice itself;
        # earlier versions need it wrapped in ast.Index.
        if sys.version_info < (3, 9):
            index_node = ast.Index(value=index_node)

        closure_assigns.append(
            ast.Assign(
                targets=[
                    ast.Name(id=var, ctx=ast.Store(), lineno=1 + idx, col_offset=2)
                ],
                value=ast.Attribute(
                    value=ast.Subscript(
                        value=ast.Name(
                            id='lambda_closure',
                            ctx=ast.Load(),
                            lineno=1 + idx,
                            col_offset=5 + len(var),
                        ),
                        slice=index_node,
                        ctx=ast.Load(),
                        lineno=1 + idx,
                        col_offset=5 + len(var),
                    ),
                    attr='cell_contents',
                    ctx=ast.Load(),
                    lineno=1 + idx,
                    col_offset=5 + len(var),
                ),
                lineno=1 + idx,
                col_offset=2,
            ))

    new_root = ast.Module(body=[
        ast.FunctionDef(
            name="closure",
            args=ast.arguments(posonlyargs=[], args=[
                ast.arg(arg="lambda_closure", annotation=None,
                        lineno=1, col_offset=12)
            ], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None,
                               defaults=[]),
            body=closure_assigns + [lambda_upblk] + [
                ast.Return(
                    value=ast.Name(
                        id=blk_name,
                        ctx=ast.Load(),
                        lineno=4 + len(freevars),
                        col_offset=9,
                    ),
                    lineno=4 + len(freevars),
                    col_offset=2,
                )
            ],
            decorator_list=[],
            returns=None,
            lineno=1,
            col_offset=0,
        )
    ], type_ignores=[])

    # In Python 3 we need to supply a dict as local to get the newly
    # compiled function from closure.
    # Then `closure(lamb.__closure__)` returns the lambda update block with
    # the correct free variables in its closure.

    dict_local = {}
    custom_exec(compile(new_root, blk_name, "exec"), lamb.__globals__, dict_local)
    blk = dict_local['closure'](lamb.__closure__)

    # Add the source code to linecache for the compiled function

    new_src = "def {}():\n {}\n".format(blk_name, src.replace("//=", "@="))
    linecache.cache[blk_name] = (len(new_src), None, new_src.splitlines(), blk_name)

    ComponentLevel1._update(s, blk)

    # This caching here does no caching because the block name contains
    # the signal name intentionally to avoid conflicts. With //= it is
    # more possible than normal update block to have conflicts:
    # if param == 1:  s.out //= s.in_ + 1
    # else:           s.out //= s.out + 100
    # Here these two blocks will implicitly have the same name but they
    # have different contents based on different param.
    # So the cache call here is just to reuse the existing interface to
    # register the AST/src of the generated block for elaborate or passes
    # to use.
    s._cache_func_meta(blk, is_update_ff=False,
                       given=("".join(srcs), lambda_upblk_module, line,
                              inspect.getsourcefile(lamb)))
    return blk
# print("[default w/o Mamba] Use Python Bits")

# The action of a __slots__ declaration is limited to the class where it is defined.
# As a result, subclasses will have a __dict__ unless they also define __slots__.

# Source of one fixed-width Bits subclass; {0} is the bitwidth. Executing
# it registers the class in _bits_types and binds Bits<N> and b<N>.
bits_template = """
class Bits{0}(Bits):
  __slots__ = ( "_nbits", "_uint", "_next" )
  nbits = {0}
  def __init__( s, v=0, *, trunc_int=False ):
    return super().__init__( {0}, v, trunc_int )
_bits_types[{0}] = b{0} = Bits{0}
"""

# Widths that are generated eagerly at import time.
_bitwidths = [*range(1, 256), 384, 512]
_bits_types = {}

# Compile all eager widths as one source file.
custom_exec(
    compile(
        "".join(bits_template.format(width) for width in _bitwidths),
        filename="bits_import.py",
        mode="exec",
    ),
    globals(),
    locals(),
)


def mk_bits(nbits):
    """Return the ``Bits<nbits>`` class, generating it on first request."""
    assert nbits > 0, "We don't allow Bits0"
    # assert nbits < 512, "We don't allow bitwidth to exceed 512."

    # Fast path: the class for this width already exists.
    if nbits in _bits_types:
        return _bits_types[nbits]

    custom_exec(
        compile(bits_template.format(nbits), filename=f"Bits{nbits}", mode="exec"),
        globals(),
        locals(),
    )
    return _bits_types[nbits]
def lock_in_simulation():
    """Replace every Signal reachable from ``top`` with its value object.

    Steps performed on the enclosing ``top``:

    1. Walk the hierarchy and swap each ``Signal`` attribute or list
       element for ``signal.default_value()``.
    2. For every value net, make all top-level signals of the net share
       one "residence" value object.
    3. Store the mapping in ``top._sim.signal_object_mapping`` and set
       ``top._sim.locked_simulation``.
    4. Generate ``top._sim.check_top_level_inports``, which asserts that
       the value objects of top's input ports were not rebound.

    Raises:
        Exception: Any error raised while creating a default value is
            re-raised as the same type with the signal name appended.
    """
    top._check_called_at_elaborate_top("lock_in_simulation")

    # Basically we want to avoid @= between elements in the same net since
    # we now use @=.
    # - First pass creates whole bunch of signals

    # signal -> (container, key or index, container_is_list, value object)
    signal_object_mapping = {}

    Q = [(top, top)]
    while Q:
        current_obj, host = Q.pop()
        if isinstance(current_obj, list):
            for i, obj in enumerate(current_obj):
                if isinstance(obj, Signal):
                    try:
                        value = obj.default_value()
                        if obj._dsl.needs_double_buffer:
                            value <<= value
                    except Exception as e:
                        raise type(e)(str(e) + f' happens at {obj!r}') from e
                    current_obj[i] = value
                    signal_object_mapping[obj] = (current_obj, i, True, value)

                elif isinstance(obj, Component):
                    Q.append((obj, obj))
                elif isinstance(obj, (Interface, list)):
                    Q.append((obj, host))

        elif isinstance(current_obj, NamedObject):
            for i, obj in current_obj.__dict__.items():
                if i[0] == '_':
                    continue

                if isinstance(obj, Signal):
                    try:
                        value = obj.default_value()
                        if obj._dsl.needs_double_buffer:
                            value <<= value
                    except Exception as e:
                        raise type(e)(str(e) + f' happens at {obj!r}') from e
                    # Rebinding an existing key is safe while iterating
                    setattr(current_obj, i, value)
                    signal_object_mapping[obj] = (current_obj, i, False, value)

                elif isinstance(obj, Component):
                    Q.append((obj, obj))
                elif isinstance(obj, (Interface, list)):
                    Q.append((obj, host))

    # Swap all Signal objects with actual data
    nets = top.get_all_value_nets()

    # First step is to consolidate all non-slice signals in the same net
    # by pointing them to the same object
    # TODO optimize for bitstruct fields. Essentially only sliced signals
    # should be excluded.
    for writer, signals in nets:
        residence = None

        # Find the residence value
        if isinstance(writer, Const) or writer.is_top_level_signal():
            residence = writer
        else:
            for x in signals:
                if x.is_top_level_signal():
                    residence = x
                    break

        if residence is None:
            continue  # whole net is slice

        if isinstance(residence, Const):
            residence_value = residence._dsl.const
        else:
            residence_value = signal_object_mapping[residence][-1]

        # Replace top-level signals in the net with residence value
        for x in signals:
            if x is not residence and x.is_top_level_signal():
                # swap old value with new residence value
                current_obj, i, is_list, value = signal_object_mapping[x]
                signal_object_mapping[x] = (current_obj, i, is_list, residence_value)

                if is_list:
                    current_obj[i] = residence_value
                else:
                    setattr(current_obj, i, residence_value)

    top._sim.signal_object_mapping = signal_object_mapping
    top._sim.locked_simulation = True

    # Add the function that checks if the Bits objects of
    # top-level input ports are modified. If so, it's mostly because
    # the top-level ports are assigned with = instead of @=.
    inports = []
    objs = []
    for x in top._dsl.all_signals:
        if x.is_input_value_port() and x.is_top_level_signal() and x.get_host_component() is top:
            inports.append(x)
            objs.append(signal_object_mapping[x][-1])

    checks = [
        f"assert {x} is obj{i}, 'Please use @= to assign top level InPort top.{repr(x)[2:]}'"
        for i, x in enumerate(inports)
    ]
    # Without any input port the generated function would have an empty
    # body, which is a SyntaxError; emit a no-op instead.
    body = "\n  ".join(checks) if checks else "pass"

    src = """
def check_top_level_inports():
  {}
""".format(body)

    _locals = {}
    _globals = {f"obj{i}": x for i, x in enumerate(objs)}
    _globals['s'] = top
    custom_exec(py.code.Source(src).compile(), _globals, _locals)
    top._sim.check_top_level_inports = _locals['check_top_level_inports']