Пример #1
def check_schedule( top, schedule, V, E, in_degree ):

  if len(schedule) != len(V):
    V_leftovers = {  v for v in V if in_degree[v]  }
    E_leftovers = {  (x,y) for (x,y) in E
                         if x in V_leftovers and y in V_leftovers  }
    dump_dag( top, V_leftovers, E_leftovers )

    raise UpblkCyclicError( """
Update blocks have cyclic dependencies.
* Please consult update dependency graph for details.
Пример #2
        def compile_scc(i):
            nonlocal scc_id

            scc = SCCs[i]

            if len(scc) == 1:
                return list(scc)[0]

            for x in scc:
                if x in onces:
                    raise UpblkCyclicError("update_once blocks are not allowed to appear in a cycle. \n - " + \
                                    "\n - ".join( [
                                      f"{y.__name__} ({'@update_once' if y in onces else '@update'} " \
                                      f"in 'top.{repr(top.get_update_block_host_component(y))[2:]}')"
                                      for y in scc] ))

            scc_id += 1
            if _DEBUG: print(f"{'='*100}\n SCC{scc_id}\n{'='*100}")

            # For each non-trivial SCC, we need to figure out an intra-SCC
            # linear schedule that minimizes the time to re-execute this SCC
            # due to value changes. A bad schedule may inefficiently execute
            # the SCC many times, each of which changes only a few signals.
            # The current algorithm iteratively finds the "entry block" of
            # the SCC and expands its adjacent blocks. The implementation
            # first finds the actual entry point, and then uses BFS to expand
            # the footprint until all nodes are visited.

            tmp_schedule = []
            Q = deque()

            if scc_pred[i] is None:
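                # We start BFS from the block that has the least number of input
                # edges in the SCC.
                # NOTE(review): the selection below uses max(), which picks the
                # block with the MOST in-SCC input edges -- confirm which is intended.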
                InD = {v: 0 for v in scc}
                for (u, v) in E:  # u -> v
                    if u in scc and v in scc:
                        InD[v] += 1
                Q.append(max(InD, key=InD.get))

                # We start bfs with the blocks that are successors of the
                # predecessor scc in the previous SCC-level topological sort.
                pred = set(SCCs[scc_pred[i]])
                # Sort by names for a fixed outcome
                for x in sorted(scc, key=lambda x: x.__name__):
                    for v in G_T[
                            x]:  # find reversed edges point back to pred SCC
                        if v in pred:

            # Perform bfs to find a heuristic schedule
            visited = set(Q)
            while Q:
                u = Q.popleft()
                for v in G[u]:
                    if v in scc and v not in visited:

            variables = set()
            for (u, v) in E:
                # Collect all variables that triggers other blocks in the SCC
                if u in scc and v in scc:
                    variables.update(constraint_objs[(u, v)])

            if len(variables) == 0:
                raise UpblkCyclicError("There is a cyclic dependency without involving variables."
                                "Probably a loop that involves blocks that should be update_once:\n{}"\
                                .format(", ".join( [ x.__name__ for x in scc] )))

            # generate a loop for scc
            # Shunning: we just simply loop over the whole SCC block
            # TODO performance optimizations using Mamba techniques within a SCC block

            template = """
from copy import deepcopy
def wrapped_SCC_{0}():
  N = 0
  while True:
    N += 1
    if N > 100:
      raise UpblkCyclicError("Combinational loop detected at runtime in {{{4}}} after 100 iters!")
    # print( "SCC block{0} is executed", N, "times" )
generated_block = wrapped_SCC_{0}

            # clean up non-top variables if top is there. For slices of Bits
            # we directly use the top level wide Bits since Bits clone is
            # rpython code

            final_variables = set()

            for x in sorted(variables, key=repr):
                w = x.get_top_level_signal()
                if w is x:

                # w is not x
                if issubclass(w._dsl.Type, Bits):
                    if w not in final_variables:
                elif is_bitstruct_class(w._dsl.Type):
                    if w not in final_variables:

            # also group them by common ancestor to reduce byte code
            # TODO use longest-common-prefix (LCP) algorithms ...

            final_var_host = defaultdict(list)
            for x in final_variables:

            # Then, we generate the Python code that saves variables at the
            # beginning of each SCC iteration and the code that checks if the
            # values of those variables have changed
            copy_srcs = []
            check_srcs = []

            var_id = 0
            for host, var_list in final_var_host.items():
                hostlen = len(repr(host))

                copy_srcs.append(f"host = {host!r}")
                check_srcs.append(f"host = {host!r}")

                sub_check_srcs = []

                for var in var_list:
                    var_id += 1
                    subname = repr(var)[hostlen + 1:]
                    if issubclass(var._dsl.Type, Bits):
                    elif is_bitstruct_class(var._dsl.Type):

                    sub_check_srcs.append(f"host.{subname} != t{var_id}")

                    f"if { ' or '.join(sub_check_srcs)}: continue")

            # Divide all blks into meta blocks
            # Branchiness factor is the bound of branchiness in a meta block.
            branchiness_factor = 20
            branchy_block_factor = 6

            num_blks = 0  # sanity check
            cur_meta, cur_br, cur_count = [], 0, 0
            scc_schedule = []

            _globals = {'s': top, 'UpblkCyclicError': UpblkCyclicError}
            blk_srcs = []

            # If there are fewer than 10 blocks, we directly unroll them
            if len(tmp_schedule) < 10:
                blk_srcs = []
                for i, b in enumerate(tmp_schedule):
                        f"blk{i}() # [br {self.branchiness[b]}, loop {int(self.only_loop_at_top[b])}] {b.__name__}"
                    _globals[f"blk{i}"] = b  # put it into the block's closure

                for i, blk in enumerate(tmp_schedule):
                    # Same here. If an update block only has top-level loop, br = 0
                    br = 0 if self.only_loop_at_top[blk] else self.branchiness[
                    if cur_br == 0:
                        cur_br += br
                        cur_count += (br > 0)
                        if cur_br >= branchiness_factor or cur_count >= branchy_block_factor:
                            num_blks += len(cur_meta)
                            cur_meta, cur_br, cur_count = [], 0, 0  # clear
                        if br == 0:
                            # If no branchy block available, directly start a new metablock
                            num_blks += len(cur_meta)
                            cur_meta, cur_br, cur_count = [blk], br, (br > 0)
                            cur_br += br
                            cur_count += (br > 0)

                            if cur_br + br >= branchiness_factor or cur_count + 1 >= branchy_block_factor:
                                num_blks += len(cur_meta)
                                cur_meta, cur_br, cur_count = [], 0, 0  # clear

                if cur_meta:
                    num_blks += len(cur_meta)

                assert num_blks == len(tmp_schedule), f"Some blocks are missing during trace breaking of SCC "\
                                                      f"({num_blks} compiled, {len(tmp_schedule)} total)"

                blk_srcs = []

                if len(scc_schedule) == 1:
                    for i, b in enumerate(scc_schedule[-1]):
                            f"blk{i}() # [br {self.branchiness[b]}, loop {int(self.only_loop_at_top[b])}] {b.__name__}"
                        _globals[f"blk{i}"] = b

                    # TODO we might turn all meta blocks before the last one into meta
                    # blocks, and directly fold the last block into the main loop
                    # for i, meta in enumerate( scc_schedule[:-1] ):
                    # b = self.compile_meta_block( meta )
                    # blk_srcs.append( f"{b.__name__}()" )
                    # _globals[ b.__name__ ] = b

                    # for i, b in enumerate( scc_schedule[-1] ):
                    # blk_srcs.append( f"blk_of_last_meta{i}() # [br {self.branchiness[b]}, loop {int(self.only_loop_at_top[b])}] {b.__name__}" )
                    # _globals[ f"blk_of_last_meta{i}" ] = b

                    for i, meta in enumerate(scc_schedule):
                        b = self.compile_meta_block(meta)
                        _globals[b.__name__] = b

            scc_block_src = template.format(
                scc_id, "; ".join(copy_srcs), "\n    ".join(check_srcs),
                '\n    '.join(blk_srcs), ", ".join([x.__name__ for x in scc]))

            if _DEBUG: print(scc_block_src, "\n", "=" * 100)

            _locals = {}
                py.code.Source(scc_block_src).compile(), _globals, _locals)
            return _locals['generated_block']
Пример #3
def simple_sim_pass(s, seed=0xdeadbeef):
    assert isinstance(s, ComponentLevel1)

    if not hasattr(s._dsl, "all_U_U_constraints"):
        raise NotElaboratedError()

    placeholders = [
        x for x in s._dsl.all_named_objects if isinstance(x, Placeholder)

    if placeholders:
        raise LeftoverPlaceholderError(placeholders)

    all_upblks = set(s._dsl.all_upblks)
    expl_constraints = set(s._dsl.all_U_U_constraints)

    gen_upblk_reads = {}
    gen_upblk_writes = {}

    if isinstance(s, ComponentLevel2):
        all_update_ff = set(s._dsl.all_update_ff)

        if isinstance(s, ComponentLevel3):
            nets = s.get_all_value_nets()

            for writer, signals in nets:
                if len(signals) == 1: continue
                readers = [x for x in signals if x is not writer]

                fanout = len(readers)

                upblk_name = f"{writer!r}__{fanout}" \
                                .replace( ".", "_" ).replace( ":", "_" ) \
                                .replace( "[", "_" ).replace( "]", "" ) \
                                .replace( "(", "_" ).replace( ")", "" ) \
                                .replace( ",", "_" )

                rstrs = [f"{x!r} @= _w" for x in readers
                         ]  # THIS IS SLOW, NOW WE CAN HAVE BETTER MECHANISM
                _globals = {'s': s}

                if isinstance(writer, Const) and type(
                        writer._dsl.const) is not int:
                    types = get_bitstruct_inst_all_classes(writer._dsl.const)

                    for t in types:
                        if t.__name__ in _globals:
                            assert t is _globals[
                                __name__], "Cannot handle two subfields with the same struct name but different structs"
                        _globals[t.__name__] = t

                src = f"""
        def {upblk_name}():
          _w = {writer!r}
          {"; ".join(rstrs)}
                _locals = {}
                exec(py.code.Source(src).compile(), _globals, _locals)

                _recent_blk = _locals[upblk_name]

                # Collect read/writer metadata, directly insert them into _all_X

                gen_upblk_reads[_recent_blk] = [writer]
                gen_upblk_writes[_recent_blk] = readers

        # s is ComponentLevel2

        # Explicit constraint
        # Schedule U1 before U2 when U1 == WR(x) < RD(x) == U2: combinational
        # Explicitly, one should define these to invert the implicit constraint:
        # - RD(x) < U when U == WR(x) --> RD(x) ( == U') < U == WR(x)
        # - WR(x) > U when U == RD(x) --> RD(x) == U < WR(x) ( == U')
        # constraint RD(x) < U1 & U2 reads  x --> U2 == RD(x) <  U1
        # constraint RD(x) > U1 & U2 reads  x --> U1 <  RD(x) == U2 # impl
        # constraint WR(x) < U1 & U2 writes x --> U2 == WR(x) <  U1 # impl
        # constraint WR(x) > U1 & U2 writes x --> U1 <  WR(x) == U2
        # Doesn't work for nested data struct and slice:

        read_upblks = defaultdict(set)
        write_upblks = defaultdict(set)

        for data in [s._dsl.all_upblk_reads, gen_upblk_reads]:
            for blk, reads in data.items():
                for rd in reads:

        for data in [s._dsl.all_upblk_writes, gen_upblk_writes]:
            for blk, writes in data.items():
                for wr in writes:

        for typ in ['rd', 'wr']:  # deduplicate code
            if typ == 'rd':
                constraints = s._dsl.all_RD_U_constraints
                equal_blks = read_upblks
                constraints = s._dsl.all_WR_U_constraints
                equal_blks = write_upblks

            # enumerate variable objects
            for obj, constrained_blks in constraints.items():

                # enumerate upblks that has a constraint with x
                for (sign, co_blk) in constrained_blks:

                    for eq_blk in equal_blks[
                            obj]:  # blocks that are U == RD(x)
                        if co_blk != eq_blk:
                            if sign == 1:  # RD/WR(x) < U is 1, RD/WR(x) > U is -1
                                # eq_blk == RD/WR(x) < co_blk
                                expl_constraints.add((eq_blk, co_blk))
                                # co_blk < RD/WR(x) == eq_blk
                                expl_constraints.add((co_blk, eq_blk))

        # Implicit constraint
        # Synthesize total constraints between two upblks that read/write to
        # the "same variable" (we also handle the read/write of a recursively
        # nested field/slice)
        # Implicitly, WR(x) < RD(x), so when U1 writes X and U2 reads x
        # - U1 == WR(x) & U2 == RD(x) --> U1 == WR(x) < RD(x) == U2

        impl_constraints = set()

        # Collect all objs that write the variable whose id is "read"
        # 1) RD A.b.b     - WR A.b.b, A.b, A
        # 2) RD A.b[1:10] - WR A.b[1:10], A.b, A
        # 3) RD A.b[1:10] - WR A.b[0:5], A.b[6], A.b[8:11]

        for obj, rd_blks in read_upblks.items():
            writers = []

            # Check parents. Cover 1) and 2)
            x = obj
            while x.is_signal():
                if x in write_upblks:
                x = x.get_parent_object()

            # Check the sibling slices. Cover 3)
            if obj.is_signal():
                for x in obj.get_sibling_slices():
                    if x.slice_overlap(obj) and x in write_upblks:

            # Add all constraints
            for writer in writers:
                for wr_blk in write_upblks[writer]:
                    if wr_blk not in all_update_ff:
                        for rd_blk in rd_blks:
                            if wr_blk != rd_blk:
                                if rd_blk not in all_update_ff:
                                        (wr_blk, rd_blk))  # wr < rd default

        # Collect all objs that read the variable whose id is "write"
        # 1) WR A.b.b.b, A.b.b, A.b, A (detect 2-writer conflict)
        # 2) WR A.b.b.b   - RD A.b.b, A.b, A
        # 3) WR A.b[1:10] - RD A.b[1:10], A,b, A
        # 4) WR A.b[1:10], A.b[0:5], A.b[6] (detect 2-writer conflict)
        # "WR A.b[1:10] - RD A.b[0:5], A.b[6], A.b[8:11]" has been discovered

        for obj, wr_blks in write_upblks.items():
            readers = []

            # Check parents. Cover 2) and 3). 1) and 4) should be detected in elaboration
            x = obj
            while x.is_signal():
                if x in read_upblks:
                x = x.get_parent_object()

            # Add all constraints
            for wr_blk in wr_blks:
                if wr_blk not in all_update_ff:
                    for reader in readers:
                        for rd_blk in read_upblks[reader]:
                            if wr_blk != rd_blk:
                                if rd_blk not in all_update_ff:
                                        (wr_blk, rd_blk))  # wr < rd default

        all_constraints = {*expl_constraints}
        for (x, y) in impl_constraints:
            if (y, x) not in expl_constraints:  # no conflicting expl
                all_constraints.add((x, y))
        all_constraints = {*expl_constraints}

    # Process method constraints
    # I assume method don't call other methods here

    # Do bfs to find out all potential total constraints associated with
    # each method, direction conflicts, and incomplete constraints

    verbose = False

    if isinstance(s, ComponentLevel4):
        method_blks = defaultdict(set)

        if isinstance(s, ComponentLevel5):
            for writer, net in s._dsl.all_method_nets:
                for member in net:
                    if member is not writer:
                        assert member.method is None
                        member.method = writer.method

        # Collect each CalleePort/method is called in which update block
        # We use bounded method of CalleePort to identify each call
        for blk, calls in s._dsl.all_upblk_calls.items():
            if verbose: print("--", blk, calls)
            for call in calls:
                if isinstance(call, MethodPort):
                elif isinstance(call, (NonBlockingIfc, BlockingIfc)):

        # Put all M-related constraints into predecessor and successor dicts
        pred = defaultdict(set)
        succ = defaultdict(set)

        # We also pre-process M(x) == M(y) constraints into per-method
        # equivalence sets
        equiv = defaultdict(set)

        for (x, y, is_equal) in s._dsl.all_M_constraints:
            if verbose: print((x, y, is_equal))

            if isinstance(x, MethodPort):
                xx = x.method

            # We allow the user to call the interface directly in a non-blocking
            # interface, so if they do call it, we use the actual method within
            # the method field
            elif isinstance(x, (NonBlockingIfc, BlockingIfc)):
                xx = x.method.method

                xx = x

            if isinstance(y, MethodPort):
                yy = y.method

            elif isinstance(y, (NonBlockingIfc, BlockingIfc)):
                yy = y.method.method

                yy = y


            if is_equal:  # M(x) == M(y)

        for method, assoc_blks in method_blks.items():
            visited = {(method, 0)}
            Q = [(method, 0)]  # -1: pred, 0: don't know, 1: succ

            if verbose: print()
            while Q:
                (u, w) = Q.pop()
                if verbose: print((u, w))

                if u in equiv:
                    for v in equiv[u]:
                        if (v, w) not in visited:
                            visited.add((v, w))
                            Q.append((v, w))

                if w <= 0:
                    for v in pred[u]:

                        if v in all_upblks:
                            # Find total constraint (v < blk) by v < method_u < method_u'=blk
                            # INVALID if we have explicit constraint (blk < method_u)

                            for blk in assoc_blks:
                                if blk not in pred[u]:
                                    if v != blk:
                                        if verbose:
                                            print("w<=0, v is blk".center(10),
                                                  v, blk)
                                        if verbose:                                            print(v.__name__.center(25)," < ", \
                                        all_constraints.add((v, blk))

                            if v in method_blks:
                                # TODO Now I'm leaving incomplete dependency chain because I didn't close the circuit loop.
                                # E.g. I do port.wr() somewhere in __main__ to write to a port.

                                # Find total constraint (vb < blk) by vb=method_v < method_u=blk
                                # INVALID if we have explicit constraint (blk < method_v) or (method_u < vb)

                                v_blks = method_blks[v]
                                for vb in v_blks:
                                    if vb not in succ[u]:
                                        for blk in assoc_blks:
                                            if blk not in pred[v]:
                                                if vb != blk:
                                                    if verbose:
                                                            "w<=0, v is method"
                                                            .center(10), v,
                                                    if verbose:                                                        print(vb.__name__.center(25)," < ", \
                                                        (vb, blk))

                            if (v, -1) not in visited:
                                visited.add((v, -1))
                                     -1))  # ? < v < u < ... < method < blk_id

                if w >= 0:
                    for v in succ[u]:

                        if v in all_upblks:
                            # Find total constraint (blk < v) by blk=method_u' < method_u < v
                            # INVALID if we have explicit constraint (method_u < blk)

                            for blk in assoc_blks:
                                if blk not in succ[u]:
                                    if v != blk:
                                        if verbose:
                                            print("w>=0, v is blk".center(10),
                                                  blk, v)
                                        if verbose:                                            print(blk.__name__.center(25)," < ", \
                                        all_constraints.add((blk, v))

                            if v in method_blks:
                                # assert v in method_blks, "Incomplete elaboration, something is wrong! %s" % hex(v)
                                # TODO Now I'm leaving incomplete dependency chain because I didn't close the circuit loop.
                                # E.g. I do port.wr() somewhere in __main__ to write to a port.

                                # Find total constraint (blk < vb) by blk=method_u < method_v=vb
                                # INVALID if we have explicit constraint (vb < method_u) or (method_v < blk)

                                v_blks = method_blks[v]
                                for vb in v_blks:
                                    if not vb in pred[u]:
                                        for blk in assoc_blks:
                                            if not blk in succ[v]:
                                                if vb != blk:
                                                    if verbose:
                                                            "w>=0, v is method"
                                                            .center(10), blk,
                                                    if verbose:                                                        print(blk.__name__.center(25)," < ", \
                                                        (blk, vb))

                            if (v, 1) not in visited:
                                visited.add((v, 1))
                                     1))  # blk_id < method < ... < u < v < ?

    def make_double_buffer_func(s):

        strs = [
            f"{repr(x)}._flip()" for x in s._dsl.all_signals
            if x._dsl.needs_double_buffer

        if not strs:

            def no_double_buffer():

            return no_double_buffer

        src = """
    def double_buffer():
    """.format("\n      ".join(strs))
        local = locals()
        exec(py.code.Source(src).compile(), local)
        return local['double_buffer']

    # Construct the graph for update blocks

    vs = all_upblks
    if isinstance(s, ComponentLevel2):
        vs -= all_update_ff

    es = defaultdict(list)
    InD = {v: 0 for v in vs}

    for (u, v) in list(all_constraints):  # u -> v, always
        InD[v] += 1

    # Perform topological sort for a serial schedule.

    serial_schedule = []
    Q = [v for v in vs if not InD[v]]
    while Q:
        #  print Q
        u = Q.pop()
        for v in es[u]:
            InD[v] -= 1
            if not InD[v]:

    if len(serial_schedule) != len(vs):
        raise UpblkCyclicError(
            'Update blocks have cyclic dependencies.'
            '* Please consult update dependency graph for details.')

    if isinstance(s, ComponentLevel2):
        final_serial_schedule = list(all_update_ff)
        final_serial_schedule = serial_schedule

    assert final_serial_schedule, "No update block found in the model"

    if verbose:
        from graphviz import Digraph
        dot = Digraph()
        dot.graph_attr["rank"] = "same"
        dot.graph_attr["ratio"] = "compress"
        dot.graph_attr["margin"] = "0.1"

        for x in vs:
            dot.node(x.__name__, shape="box")

        for (x, y) in all_constraints:
            dot.edge(x.__name__, y.__name__)

        dot.render("/tmp/upblk-dag.gv", view=True)

    def tick_normal():
        for blk in final_serial_schedule:

    s.tick = tick_normal
    s._dsl.schedule = final_serial_schedule

    # Clean up Signals

    def cleanup_signals(m):
        """Replace Signal objects held directly by ``m`` with default values.

        ``m`` is handled when it is a list (each Signal element is replaced
        in place) or a NamedObject (each Signal stored under a string
        attribute name that does not start with ``_`` is replaced through
        ``setattr``). Any other ``m`` is left untouched.

        NOTE(review): no recursion into non-Signal children and no call site
        are visible in this view -- confirm against the full file.
        """
        if isinstance(m, list):
            for i, o in enumerate(m):
                if isinstance(o, Signal):
                    # Store a default value in the slot, then apply ``<<=`` to
                    # that slot with a second, freshly built default value.
                    # NOTE(review): these two statements read like alternative
                    # code paths whose branching was lost -- confirm.
                    m[i] = o.default_value()
                    m[i] <<= o.default_value()

        elif isinstance(m, NamedObject):
            # Only existing attributes are reassigned, so the set of keys in
            # m.__dict__ is not changed by the setattr call below.
            for name, obj in m.__dict__.items():
                # Skip non-string keys and names starting with an underscore.
                if isinstance(name, str) and name[0] != '_':
                    if isinstance(obj, Signal):
                        # Build the default value, apply ``<<=`` with another
                        # default value, then rebind the attribute to it.
                        value = obj.default_value()
                        value <<= obj.default_value()
                        setattr(m, name, value)


    def create_reset(top):
        """Build and return a zero-argument ``reset`` closure bound to ``top``."""
        def reset():
            # Assign Bits1(1) and then Bits1(0) to ``top.reset``.
            # NOTE(review): nothing runs between the two assignments in this
            # view, so the first value is overwritten immediately -- confirm
            # whether intermediate simulation steps are missing here.
            top.reset = Bits1(1)
            top.reset = Bits1(0)

        return reset

    s.sim_reset = create_reset(s)
Пример #4
  def schedule_intra_cycle( self, top ):

    # Construct the intra-cycle graph based on normal update blocks

    V   = top._dag.final_upblks - top.get_all_update_ff()

    G   = { v: [] for v in V }
    G_T = { v: [] for v in V } # transpose graph

    E = set()
    for (u, v) in top._dag.all_constraints: # u -> v
      if u in V and v in V:
        G  [u].append( v )
        G_T[v].append( u )
        E.add( (u, v) )

    if 'MAMBA_DAG' in os.environ:
      dump_dag( top, V, E )

    # Compute SCC using Kosaraju's algorithm

    SCCs, G_new = kosaraju_scc( G, G_T )

    # Perform topological sort on SCCs

    InD = { i: 0 for i in range(len(SCCs)) }
    for u, vs in G_new.items():
      for v in vs:
        InD[ v ] += 1

    scc_pred = {}
    scc_schedule = []

    Q = deque( [ i for i in range(len(SCCs)) if not InD[i] ] )
    for x in Q:
      scc_pred[ x ] = None

    while Q:
      u = Q.pop()
      scc_schedule.append( u )
      for v in G_new[u]:
        InD[v] -= 1
        if not InD[v]:
          Q.append( v )
          scc_pred[ v ] = u

    assert len(scc_schedule) == len(SCCs)

    # Now we generate super blocks for each SCC and produce final schedule

    constraint_objs = top._dag.constraint_objs
    onces = top.get_all_update_once()

    # Put the graph schedule to _sched
    top._sched.update_schedule = schedule = []

    scc_id = 0
    for i in scc_schedule:
      scc = SCCs[i]
      if len(scc) == 1:
        schedule.append( list(scc)[0] )

        # For each non-trivial SCC, we need to figure out an intra-SCC
        # linear schedule that minimizes the time to re-execute this SCC
        # due to value changes. A bad schedule may inefficiently execute
        # the SCC many times, each of which changes only a few signals.
        # The current algorithm iteratively finds the "entry block" of
        # the SCC and expands its adjacent blocks. The implementation
        # first finds the actual entry point, and then uses BFS to expand
        # the footprint until all nodes are visited.

        # check update_once first
        for x in scc:
          if x in onces:
            raise UpblkCyclicError("update_once blocks are not allowed to appear in a cycle. \n - " + \
                            "\n - ".join( [
                              f"{y.__name__} ({'@update_once' if y in onces else '@update'} " \
                              f"in 'top.{repr(top.get_update_block_host_component(y))[2:]}')"
                              for y in scc] ))

        tmp_schedule = []
        Q = deque()

        if scc_pred[i] is None:
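          # We start BFS from the block that has the least number of input
          # edges in the SCC.
          # NOTE(review): the selection below uses max(), which picks the
          # block with the MOST in-SCC input edges -- confirm which is intended.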
          InD = { v: 0 for v in scc }
          for (u, v) in E: # u -> v
            if u in scc and v in scc:
              InD[ v ] += 1
          Q.append( max(InD, key=InD.get) )

          # We start bfs with the blocks that are successors of the
          # predecessor scc in the previous SCC-level topological sort.
          pred = set( SCCs[ scc_pred[i] ] )
          # Sort by names for a fixed outcome
          for x in sorted( scc, key = lambda x: x.__name__ ):
            for v in G_T[x]: # find reversed edges point back to pred SCC
              if v in pred:
                Q.append( x )

        # Perform bfs to find a heuristic schedule
        visited = set(Q)
        while Q:
          u = Q.popleft()
          tmp_schedule.append( u )
          for v in G[u]:
            if v in scc and v not in visited:
              Q.append( v )
              visited.add( v )

        scc_id += 1
        variables = set()
        for (u, v) in E:
          # Collect all variables that triggers other blocks in the SCC
          if u in scc and v in scc:
            variables.update( constraint_objs[ (u, v) ] )

        if len(variables) == 0:
          raise UpblkCyclicError("There is a cyclic dependency without involving variables."
                          "Probably a loop that involves blocks that should be update_once:\n{}"\
                          .format(", ".join( [ x.__name__ for x in scc] )))

        # generate a loop for scc
        # Shunning: we just simply loop over the whole SCC block
        # TODO performance optimizations using Mamba techniques within a SCC block

        def gen_wrapped_SCCblk( s, scc, src ):
          """Compile ``src`` and return what it binds to ``generated_block``.

          The source is executed with a globals dict exposing ``s``, a tick
          function built from ``scc`` by ``SimpleTickPass.gen_tick_function``
          (as ``scc_tick_func``), ``deepcopy`` and ``UpblkCyclicError``.
          Names defined by the executed source land in a separate, initially
          empty locals dict, from which ``generated_block`` is read.
          """

          # TODO mamba?
          tick_func = SimpleTickPass.gen_tick_function( scc )

          exec_globals = {
            's':                s,
            'scc_tick_func':    tick_func,
            'deepcopy':         deepcopy,
            'UpblkCyclicError': UpblkCyclicError,
          }
          exec_locals = {}

          compiled_src = py.code.Source( src ).compile()
          custom_exec( compiled_src, exec_globals, exec_locals )

          return exec_locals[ 'generated_block' ]

        template = """
def wrapped_SCC_{0}():
  N = 0
  while True:
    N += 1
    if N > 100:
      raise UpblkCyclicError("Combinational loop detected at runtime in {{{3}}} after 100 iters!")
    # print( "SCC block{0} is executed", num_iters, "times" )
generated_block = wrapped_SCC_{0}

        copy_srcs  = []
        check_srcs = []
        # print_srcs = []

        # clean up non-top variables if top is there. remove slices

        final_variables = set()

        for x in sorted( variables, key=repr ):
          w = x.get_top_level_signal()
          if w is x:
            final_variables.add( x )

          # w is not x
          if issubclass( w._dsl.Type, Bits ):
            if w not in final_variables:
              final_variables.add( w )
          elif is_bitstruct_class( w._dsl.Type ):
            if w not in final_variables:
              final_variables.add( x )
            final_variables.add( x )

        # group them by host component so that we create less bytecode

        final_var_host = defaultdict(list)
        for x in final_variables:
          final_var_host[ x.get_host_component() ].append( x )

        # create a block of copy/check code for each host component. Need
        # to allocate global var_id across different host components.

        var_id = 0
        for host, var_list in final_var_host.items():

          copy_srcs .append( f"host={host!r}" )
          check_srcs.append( f"host={host!r}" )

          sub_check_srcs = []

          hostlen = len(repr(host))
          for var in var_list:
            var_id += 1
            subname = repr(var)[hostlen+1:]
            if issubclass( var._dsl.Type, Bits ):     copy_srcs.append( f"t{var_id}=host.{subname}.clone()" )
            elif is_bitstruct_class( var._dsl.Type ): copy_srcs.append( f"t{var_id}=host.{subname}.clone()" )
            else:                                     copy_srcs.append( f"t{var_id}=deepcopy(host.{subname})" )

            sub_check_srcs.append( f"host.{subname} != t{var_id}" )

          check_srcs.append( f"if { ' or '.join(sub_check_srcs)}: continue" )

        scc_block_src = template.format( scc_id, "; ".join( copy_srcs ), "\n    ".join( check_srcs ),
                                         ", ".join( [ x.__name__ for x in scc] ) )

        # print(scc_block_src)
        schedule.append( gen_wrapped_SCCblk( top, tmp_schedule, scc_block_src ) )