예제 #1
0
    def elaborate(self, platform: Platform) -> Module:
        """Build the trigger logic and return it as a ``Module``.

        The module holds ``self.ntriggers`` trigger registers. The one indexed
        by ``tselect`` is made visible (and writable) through ``tdata1`` and
        ``tdata2``. Every trigger is compared against ``self.x_pc`` or
        ``self.x_bus_addr``; when at least one of them hits, either
        ``self.haltreq`` or ``self.trap`` is driven.

        ``platform`` is not used by this method.
        """
        m = Module()

        # Per-trigger storage, shaped like the tdata1 / tdata2 read records.
        triggers      = [Record.like(self.tdata1.read) for _ in range(self.ntriggers)]
        triggers_data = [Record.like(self.tdata2.read) for _ in range(self.ntriggers)]

        # The type field is hard-wired, so it cannot be changed by writes.
        for t in triggers:
            m.d.comb += t.type.eq(TriggerType.MATCH)  # support only address/data match

        # handle writes to tselect
        # Writes selecting an index >= ntriggers are dropped; tselect keeps its value.
        with m.If(self.tselect.we):
            with m.If(self.tselect.write < self.ntriggers):  # no more than ntriggers
                m.d.sync += self.tselect.read.eq(self.tselect.write)

        # select the trigger
        with m.Switch(self.tselect.read):
            for idx, (trigger, trigger_data) in enumerate(zip(triggers, triggers_data)):
                with m.Case(idx):
                    m.d.comb += [
                        self.tdata1.read.eq(trigger),      # trigger visible @tdata1
                        self.tdata2.read.eq(trigger_data)  # data visible @tdata2
                    ]
                    # handle writes to tdata1
                    with m.If(self.tdata1.we):
                        # 'i' reinterprets the written data bits as mcontrol fields;
                        # 'o' receives only the fields copied below. Fields of 'o'
                        # that are not listed are never driven in this block.
                        mcontrol = Record([('i', mcontrol_layout), ('o', mcontrol_layout)])
                        m.d.comb += [
                            mcontrol.i.eq(self.tdata1.write.data),  # casting
                            mcontrol.o.execute.eq(mcontrol.i.execute),
                            mcontrol.o.store.eq(mcontrol.i.store),
                            mcontrol.o.load.eq(mcontrol.i.load),
                            mcontrol.o.m.eq(mcontrol.i.m),
                            mcontrol.o.u.eq(mcontrol.i.u),
                            mcontrol.o.action.eq(mcontrol.i.action)
                        ]
                        m.d.sync += [
                            trigger.dmode.eq(self.tdata1.write.dmode),
                            trigger.data.eq(mcontrol.o)
                        ]
                    # handle writes to tdata2
                    with m.If(self.tdata2.we):
                        m.d.sync += trigger_data.data.eq(self.tdata2.write)

        # trigger logic
        hit    = Signal()                # at least one trigger hit
        halt   = Signal()                # at least one trigger has action set
        hit_v  = Signal(self.ntriggers)  # per-trigger hit bits
        halt_v = Signal(self.ntriggers)  # per-trigger action bits

        for idx, (trigger, trigger_data) in enumerate(zip(triggers, triggers_data)):
            with m.Switch(trigger.type):
                with m.Case(TriggerType.MATCH):
                    match    = Signal()
                    mcontrol = Record(mcontrol_layout)
                    # Reinterprets the whole trigger record as mcontrol fields.
                    # NOTE(review): this relies on the tdata1 record and
                    # mcontrol_layout lining up bit-for-bit - confirm.
                    m.d.comb += mcontrol.eq(trigger)  # casting, lol

                    # Only the first enabled kind is evaluated: execute has
                    # priority over store, and store over load.
                    with m.If(mcontrol.execute):
                        m.d.comb += match.eq(self.x_valid & (trigger_data == self.x_pc))
                    with m.Elif(mcontrol.store):
                        m.d.comb += match.eq(self.x_valid & self.x_store & (trigger_data == self.x_bus_addr))
                    with m.Elif(mcontrol.load):
                        m.d.comb += match.eq(self.x_valid & self.x_load & (trigger_data == self.x_bus_addr))

                    if self.enable_user_mode:
                        # check the current priv mode, and check the priv enable mode
                        priv_m = self.privmode == PrivMode.Machine
                        priv_u = self.privmode == PrivMode.User
                        hit_tmp = match & ((mcontrol.m & priv_m) | (mcontrol.u & priv_u))
                    else:
                        # Without user mode only the 'm' enable bit qualifies a match.
                        hit_tmp = match & mcontrol.m

                    # NOTE(review): halt_v is not qualified by hit_tmp, so a
                    # trigger that did not hit still contributes its action bit
                    # to 'halt' below - confirm this is intended.
                    m.d.comb += [
                        hit_v[idx].eq(hit_tmp),
                        halt_v[idx].eq(mcontrol.action)
                    ]

        # request signals: halt/exception
        # OR-reduce the per-trigger vectors into single request bits.
        m.d.comb += [
            hit.eq(reduce(or_, hit_v, 0)),
            halt.eq(reduce(or_, halt_v, 0))
        ]
        with m.If(hit):
            with m.If(halt):  # halt = mcontrol.action
                # NOTE(review): dmode is read from the trigger currently selected
                # by tselect, not necessarily from the trigger that hit - confirm.
                m.d.comb += self.haltreq.eq(self.tdata1.read.dmode)  # enter debug mode only if dmode = 1
            with m.Else():
                m.d.comb += self.trap.eq(1)  # generate exception

        return m
예제 #2
0
    def elaborate(self, platform: Platform) -> Module:
        """Build the cache logic and return it as a ``Module``.

        The method creates, for each of ``self.nways`` ways, a valid-bit
        vector, a tag memory and a data memory; a hit encoder over the ways;
        a replacement (LRU) bit per line when there is more than one way; a
        two-state FSM (``READ``/``REFILL``) that fetches a line over the
        ``bus_*`` signals on a read miss; and, when ``self.enable_write`` is
        false, snoop logic driven by ``self.dcache_snoop`` that invalidates
        matching lines and cancels an ongoing refill.

        ``platform`` is not used by this method.
        """
        m = Module()

        snoop_addr = Record(self.pc_layout)
        snoop_valid = Signal()

        # -------------------------------------------------------------------------
        # Performance counter
        # TODO: connect to CSR's performance counter
        with m.If(~self.s1_stall & self.s1_valid & self.s1_access):
            m.d.sync += self.access_cnt.eq(self.access_cnt + 1)
        # Misses are only counted while bus_valid is low; presumably this keeps
        # a single miss from being counted on every cycle of its refill.
        with m.If(self.s2_valid & self.s2_miss & ~self.bus_valid
                  & self.s2_access):
            m.d.sync += self.miss_cnt.eq(self.miss_cnt + 1)
        # -------------------------------------------------------------------------

        # Per-way view: the line read from memory plus the control bits
        # computed for that way.
        way_layout = [('data', 32 * self.nwords),
                      ('tag', self.s1_address.tag.shape()), ('valid', 1),
                      ('sel_lru', 1), ('snoop_hit', 1)]
        if self.enable_write:
            way_layout.append(('sel_we', 1))

        ways = Array(
            Record(way_layout, name='way_idx{}'.format(_way))
            for _way in range(self.nways))
        # Holds the word offset at which the refill ends (set when a refill starts).
        fill_cnt = Signal.like(self.s1_address.offset)

        # Check hit/miss
        way_hit = m.submodules.way_hit = Encoder(self.nways)
        for idx, way in enumerate(ways):
            m.d.comb += way_hit.i[idx].eq((way.tag == self.s2_address.tag)
                                          & way.valid)

        # The encoder's 'n' output is used as the miss flag.
        m.d.comb += self.s2_miss.eq(way_hit.n)
        if self.enable_write:
            # Assuming there is a hit, mark the way that hit as the one to be written.
            m.d.comb += ways[way_hit.o].sel_we.eq(self.s2_we & self.s2_valid)

        # set the LRU
        if self.nways == 1:
            # One way: LRU is useless
            lru = Const(0)  # self.nlines
        else:
            # LRU is a vector with one bit per line; each bit indicates the way to
            # replace. Since NWAY is at most 2, each LRU entry is a single bit.
            lru = Signal(self.nlines)
            _lru = lru.bit_select(self.s2_address.line, 1)
            write_ended = self.bus_valid & self.bus_ack & self.bus_last  # err ^ ack == 1
            # A hit on the way currently marked for replacement flips the bit,
            # as does the end of a refill.
            access_hit = ~self.s2_miss & self.s2_valid & (way_hit.o == _lru)
            with m.If(write_ended | access_hit):
                m.d.sync += _lru.eq(~_lru)

        # read data from the cache
        m.d.comb += self.s2_rdata.eq(ways[way_hit.o].data.word_select(
            self.s2_address.offset, 32))

        # Internal Snoop
        snoop_use_cache = Signal()
        snoop_tag_match = Signal()
        snoop_line_match = Signal()
        snoop_cancel_refill = Signal()
        if not self.enable_write:
            # Address bits at and above bits_range identify the cached region.
            bits_range = log2_int(self.end_addr - self.start_addr,
                                  need_pow2=False)

            m.d.comb += [
                snoop_addr.eq(self.dcache_snoop.addr),  # aux
                snoop_valid.eq(self.dcache_snoop.we & self.dcache_snoop.valid
                               & self.dcache_snoop.ack),
                snoop_use_cache.eq(snoop_addr[bits_range:] == (
                    self.start_addr >> bits_range)),
                snoop_tag_match.eq(snoop_addr.tag == self.s2_address.tag),
                snoop_line_match.eq(snoop_addr.line == self.s2_address.line),
                # A snooped write to the same tag and line as the stage-2 address
                # cancels the refill (see the REFILL state below).
                snoop_cancel_refill.eq(snoop_use_cache & snoop_valid
                                       & snoop_line_match & snoop_tag_match),
            ]
        else:
            m.d.comb += snoop_cancel_refill.eq(0)

        with m.FSM():
            with m.State('READ'):
                # Start a refill on a valid read that misses.
                with m.If(self.s2_re & self.s2_miss & self.s2_valid):
                    m.d.sync += [
                        self.bus_addr.eq(self.s2_address),
                        self.bus_valid.eq(1),
                        # The refill starts at the missed word and ends at the
                        # word just before it (offset - 1).
                        fill_cnt.eq(self.s2_address.offset - 1)
                    ]
                    m.next = 'REFILL'
            with m.State('REFILL'):
                m.d.comb += self.bus_last.eq(fill_cnt == self.bus_addr.offset)
                with m.If(self.bus_ack):
                    m.d.sync += self.bus_addr.offset.eq(self.bus_addr.offset +
                                                        1)
                with m.If(self.bus_ack & self.bus_last | self.bus_err):
                    m.d.sync += self.bus_valid.eq(0)
                # Leave REFILL once bus_valid has dropped, or abort on a flush or
                # a snoop cancel.
                with m.If(~self.bus_valid | self.s1_flush
                          | snoop_cancel_refill):
                    m.next = 'READ'
                    m.d.sync += self.bus_valid.eq(0)

        # mark the way to use (replace)
        m.d.comb += ways[lru.bit_select(self.s2_address.line,
                                        1)].sel_lru.eq(self.bus_valid)

        # generate for N ways
        for way in ways:
            # create the memory structures for valid, tag and data.
            valid = Signal(self.nlines)  # Valid bits

            tag_m = Memory(width=len(way.tag), depth=self.nlines)  # tag memory
            tag_rp = tag_m.read_port()
            # Second read port on the tag memory; it is only driven in the
            # read-only (snoop) branch below.
            snoop_rp = tag_m.read_port()
            tag_wp = tag_m.write_port()
            m.submodules += tag_rp, tag_wp, snoop_rp

            data_m = Memory(width=len(way.data),
                            depth=self.nlines)  # data memory
            data_rp = data_m.read_port()
            data_wp = data_m.write_port(
                granularity=32
            )  # means that only whole 32-bit words can be written.
            m.submodules += data_rp, data_wp

            # handle valid
            # Priority: flush, then refill done, then refill error, then evict.
            with m.If(self.s1_flush & self.s1_valid):  # flush
                m.d.sync += valid.eq(0)
            with m.Elif(way.sel_lru & self.bus_last
                        & self.bus_ack):  # refill ok
                m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(1)
            with m.Elif(way.sel_lru & self.bus_err):  # refill error
                m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(0)
            with m.Elif(self.s2_evict & self.s2_valid
                        & (way.tag == self.s2_address.tag)):  # evict
                m.d.sync += valid.bit_select(self.s2_address.line, 1).eq(0)

            # assignments
            # While stage 1 is stalled the memories are re-read at the stage-2
            # line instead of the stage-1 line.
            m.d.comb += [
                tag_rp.addr.eq(
                    Mux(self.s1_stall, self.s2_address.line,
                        self.s1_address.line)),
                tag_wp.addr.eq(self.bus_addr.line),
                tag_wp.data.eq(self.bus_addr.tag),
                # The tag is written once, on the last acknowledged refill word.
                tag_wp.en.eq(way.sel_lru & self.bus_ack & self.bus_last),
                data_rp.addr.eq(
                    Mux(self.s1_stall, self.s2_address.line,
                        self.s1_address.line)),
                way.data.eq(data_rp.data),
                way.tag.eq(tag_rp.data),
                way.valid.eq(valid.bit_select(self.s2_address.line, 1))
            ]

            # update cache: CPU or Refill
            # The write port is multiplexed because the memory can only have one
            # write port.
            if self.enable_write:
                update_addr = Signal(len(data_wp.addr))
                update_data = Signal(len(data_wp.data))
                update_we = Signal(len(data_wp.en))
                aux_wdata = Signal(32)

                # A refill in progress owns the write port; otherwise the CPU does.
                with m.If(self.bus_valid):
                    m.d.comb += [
                        update_addr.eq(self.bus_addr.line),
                        # The word is replicated across the line; the single
                        # enable bit set below picks the word actually written.
                        update_data.eq(Repl(self.bus_data, self.nwords)),
                        update_we.bit_select(self.bus_addr.offset,
                                             1).eq(way.sel_lru & self.bus_ack),
                    ]
                with m.Else():
                    m.d.comb += [
                        update_addr.eq(self.s2_address.line),
                        update_data.eq(Repl(aux_wdata, self.nwords)),
                        # CPU writes only update the cache on a hit.
                        update_we.bit_select(self.s2_address.offset,
                                             1).eq(way.sel_we & ~self.s2_miss)
                    ]
                m.d.comb += [
                    # Aux data: the write port has no byte granularity, so for a CPU
                    # write the replacement word is built here: each byte comes from
                    # s2_wdata when its s2_sel bit is set, else from the current s2_rdata.
                    aux_wdata.eq(
                        Cat(
                            Mux(self.s2_sel[0],
                                self.s2_wdata.word_select(0, 8),
                                self.s2_rdata.word_select(0, 8)),
                            Mux(self.s2_sel[1],
                                self.s2_wdata.word_select(1, 8),
                                self.s2_rdata.word_select(1, 8)),
                            Mux(self.s2_sel[2],
                                self.s2_wdata.word_select(2, 8),
                                self.s2_rdata.word_select(2, 8)),
                            Mux(self.s2_sel[3],
                                self.s2_wdata.word_select(3, 8),
                                self.s2_rdata.word_select(3, 8)))),
                    #
                    data_wp.addr.eq(update_addr),
                    data_wp.data.eq(update_data),
                    data_wp.en.eq(update_we),
                ]
            else:
                # Read-only cache: the write port is used by the refill only.
                m.d.comb += [
                    data_wp.addr.eq(self.bus_addr.line),
                    data_wp.data.eq(Repl(self.bus_data, self.nwords)),
                    data_wp.en.bit_select(self.bus_addr.offset,
                                          1).eq(way.sel_lru & self.bus_ack),
                ]

                # --------------------------------------------------------------
                # internal snoop
                # for FENCE.i instruction
                _match_snoop = Signal()

                m.d.comb += [
                    snoop_rp.addr.eq(snoop_addr.line),  # read tag memory
                    _match_snoop.eq(snoop_rp.data == snoop_addr.tag),
                    way.snoop_hit.eq(snoop_use_cache & snoop_valid
                                     & _match_snoop
                                     & valid.bit_select(snoop_addr.line, 1)),
                ]
                # check whether the snoop matches a write from this core;
                # if so, invalidate the snooped line in this way.
                with m.If(way.snoop_hit):
                    m.d.sync += valid.bit_select(snoop_addr.line, 1).eq(0)
                # --------------------------------------------------------------

        return m
예제 #3
0
파일: lsu.py 프로젝트: ImArcangel/Bellatrix
    def elaborate(self, platform):
        """Build the load/store unit and return it as a ``Module``.

        Three bus masters share ``self.dport`` through an ``Arbiter``: a write
        buffer (FIFO) that drains cached stores to the bus, a write-enabled
        data ``Cache`` that refills on read misses, and a "bare" port used for
        accesses outside the cached address range.

        ``platform`` is not used by this method.
        """
        m = Module()

        # One write-buffer entry: target address, data word and byte enables.
        wbuffer_layout = [("addr", 32), ("data", 32), ("sel", 4)]

        wbuffer_din = Record(wbuffer_layout)
        wbuffer_dout = Record(wbuffer_layout)

        dcache = m.submodules.dcache = Cache(nlines=self.nlines,
                                             nwords=self.nwords,
                                             nways=self.nways,
                                             start_addr=self.start_addr,
                                             end_addr=self.end_addr,
                                             enable_write=True)
        arbiter = m.submodules.arbiter = Arbiter()
        wbuffer = m.submodules.wbuffer = SyncFIFOBuffered(
            width=len(wbuffer_din), depth=self.nwords)

        # NOTE(review): presumably a lower number means higher priority in
        # Arbiter.add_port - confirm against the Arbiter implementation.
        wbuffer_port = arbiter.add_port(priority=0)
        cache_port = arbiter.add_port(priority=1)
        bare_port = arbiter.add_port(priority=2)

        x_use_cache = Signal()
        m_use_cache = Signal()
        m_data_w = Signal(32)
        m_byte_sel = Signal(4)

        # An address is cached when its bits at and above bits_range match
        # those of start_addr.
        bits_range = log2_int(self.end_addr - self.start_addr, need_pow2=False)
        m.d.comb += x_use_cache.eq(
            (self.x_addr[bits_range:] == (self.start_addr >> bits_range)))

        # Register the X-stage values for use by the M-stage logic below.
        with m.If(~self.x_stall):
            m.d.sync += [
                m_use_cache.eq(x_use_cache),
                m_data_w.eq(self.x_data_w),
                m_byte_sel.eq(self.x_byte_sel)
            ]
        m.d.comb += arbiter.bus.connect(self.dport)

        # --------------------------------------------------
        # write buffer IO
        m.d.comb += [
            # input
            wbuffer.w_data.eq(wbuffer_din),
            # Push every valid, non-stalled store that targets the cached range.
            wbuffer.w_en.eq(x_use_cache & self.x_store & self.x_valid
                            & ~self.x_stall),
            wbuffer_din.addr.eq(self.x_addr),
            wbuffer_din.data.eq(self.x_data_w),
            wbuffer_din.sel.eq(self.x_byte_sel),
            # output
            wbuffer_dout.eq(wbuffer.r_data),
        ]

        # drive the arbiter port
        with m.If(wbuffer_port.cyc):
            with m.If(wbuffer_port.ack | wbuffer_port.err):
                # The current write finished: pop its entry and drop stb.
                m.d.comb += wbuffer.r_en.eq(1)
                m.d.sync += wbuffer_port.stb.eq(0)
                # level == 1 while popping: this was the last stored entry, so
                # the bus cycle is ended as well.
                with m.If(wbuffer.level == 1):  # Buffer will be empty
                    m.d.sync += [wbuffer_port.cyc.eq(0), wbuffer_port.we.eq(0)]
            with m.Elif(~wbuffer_port.stb):
                # Cycle still open and no strobe pending: issue the next entry.
                m.d.sync += [
                    wbuffer_port.stb.eq(1),
                    wbuffer_port.addr.eq(wbuffer_dout.addr),
                    wbuffer_port.dat_w.eq(wbuffer_dout.data),
                    wbuffer_port.sel.eq(wbuffer_dout.sel)
                ]
        with m.Elif(wbuffer.r_rdy):
            # Bus idle and the buffer has data: open a write cycle with the
            # entry at the head of the buffer.
            m.d.sync += [
                wbuffer_port.cyc.eq(1),
                wbuffer_port.stb.eq(1),
                wbuffer_port.we.eq(1),
                wbuffer_port.addr.eq(wbuffer_dout.addr),
                wbuffer_port.dat_w.eq(wbuffer_dout.data),
                wbuffer_port.sel.eq(wbuffer_dout.sel)
            ]
            m.d.comb += wbuffer.r_en.eq(0)
        m.d.comb += [
            wbuffer_port.cti.eq(CycleType.CLASSIC),
            wbuffer_port.bte.eq(0)
        ]

        # --------------------------------------------------
        # connect IO: cache
        # Stage 1 of the cache follows the X stage, stage 2 follows the M stage.
        m.d.comb += [
            dcache.s1_address.eq(self.x_addr),
            dcache.s1_flush.eq(0),
            dcache.s1_valid.eq(self.x_valid),
            dcache.s1_stall.eq(self.x_stall),
            dcache.s2_address.eq(self.m_addr),
            dcache.s2_evict.eq(0),  # Evict is not used. Remove maybe?
            dcache.s2_valid.eq(self.m_valid),
            dcache.s2_re.eq(self.m_load),
            dcache.s2_wdata.eq(m_data_w),
            dcache.s2_sel.eq(m_byte_sel),
            dcache.s2_we.eq(self.m_store)
        ]

        # connect cache to arbiter
        # The cache port only reads (refills); write fields are tied to zero.
        m.d.comb += [
            cache_port.addr.eq(dcache.bus_addr),
            cache_port.dat_w.eq(0),
            cache_port.sel.eq(0),
            cache_port.we.eq(0),
            cache_port.cyc.eq(dcache.bus_valid),
            cache_port.stb.eq(dcache.bus_valid),
            cache_port.cti.eq(
                Mux(dcache.bus_last, CycleType.END, CycleType.INCREMENT)),
            cache_port.bte.eq(log2_int(self.nwords) - 1),
            dcache.bus_data.eq(cache_port.dat_r),
            dcache.bus_ack.eq(cache_port.ack),
            dcache.bus_err.eq(cache_port.err)
        ]

        # --------------------------------------------------
        # bare port
        rdata = Signal.like(bare_port.dat_r)  # data latched when a bare access ends
        op = Signal()                         # X stage holds a load or a store

        m.d.comb += op.eq(self.x_load | self.x_store)

        # transaction logic
        with m.If(bare_port.cyc):
            # End the access on ack or err, or when the M stage is no longer valid.
            with m.If(bare_port.ack | bare_port.err | ~self.m_valid):
                m.d.sync += [
                    rdata.eq(bare_port.dat_r),
                    bare_port.we.eq(0),
                    bare_port.cyc.eq(0),
                    bare_port.stb.eq(0)
                ]
        # Start a new access for a valid, non-stalled, uncached load/store.
        with m.Elif(op & self.x_valid & ~self.x_stall & ~x_use_cache):
            m.d.sync += [
                bare_port.addr.eq(self.x_addr),
                bare_port.dat_w.eq(self.x_data_w),
                bare_port.sel.eq(self.x_byte_sel),
                bare_port.we.eq(self.x_store),
                bare_port.cyc.eq(1),
                bare_port.stb.eq(1)
            ]
        m.d.comb += [bare_port.cti.eq(CycleType.CLASSIC), bare_port.bte.eq(0)]

        # --------------------------------------------------
        # extra logic
        # x_busy: a fence.i waits until the write buffer has nothing left to
        # read; a cached store waits while the buffer cannot accept a write; an
        # uncached access waits while the bare port is busy.
        with m.If(self.x_fence_i):
            m.d.comb += self.x_busy.eq(wbuffer.r_rdy)
        with m.Elif(x_use_cache):
            m.d.comb += self.x_busy.eq(self.x_store & ~wbuffer.w_rdy)
        with m.Else():
            m.d.comb += self.x_busy.eq(bare_port.cyc)

        # m_busy / m_load_data: taken from the cache for cached accesses,
        # forced to zero after a bus error, otherwise taken from the bare port.
        with m.If(m_use_cache):
            m.d.comb += [
                self.m_busy.eq(dcache.s2_re & dcache.s2_miss),
                self.m_load_data.eq(dcache.s2_rdata)
            ]
        with m.Elif(self.m_load_error | self.m_store_error):
            m.d.comb += [self.m_busy.eq(0), self.m_load_data.eq(0)]
        with m.Else():
            m.d.comb += [
                self.m_busy.eq(bare_port.cyc),
                self.m_load_data.eq(rdata)
            ]

        # --------------------------------------------------
        # exceptions
        # A bus error on dport is recorded as a load or store error depending
        # on 'we', together with the faulting address; the flags are cleared
        # when the M stage is not stalled.
        with m.If(self.dport.cyc & self.dport.err):
            m.d.sync += [
                self.m_load_error.eq(~self.dport.we),
                self.m_store_error.eq(self.dport.we),
                self.m_badaddr.eq(self.dport.addr)
            ]
        with m.Elif(~self.m_stall):
            m.d.sync += [self.m_load_error.eq(0), self.m_store_error.eq(0)]

        return m
예제 #4
0
    def elaborate(self, platform):
        """Build the branch predictor and return it as a ``Module``.

        Two memories are indexed by the low bits of the PC: ``btb`` stores a
        target address with a tag and a valid bit, and ``bht`` stores a 2-bit
        prediction state. A prediction is made for ``self.f_pc`` and both
        memories are written when ``self.m_update`` is asserted.

        ``platform`` is not used by this method.

        Raises:
            ValueError: if the configured predictor size is zero or not a
                power of two.
        """
        m = Module()
        size = self.configuration.getOption('predictor', 'size')
        if size == 0 or (size & (size - 1)):
            raise ValueError(f'size must be a power of 2: {size}')

        _bits_index = log2_int(size)
        _bits_tag = 32 - _bits_index
        _btb_width = 1 + 32 + _bits_tag  # valid + data + tag
        _btb_depth = 1 << _bits_index

        # Field order of one btb entry, lowest bits first.
        _btb_layout = [('target', 32), ('tag', _bits_tag), ('valid', 1)]

        # Split of a PC into memory index (low bits) and tag (remaining bits).
        _pc_layout = [('index', _bits_index), ('tag', _bits_tag)]

        btb = Memory(width=_btb_width, depth=_btb_depth)
        btb_rp = btb.read_port()
        btb_wp = btb.write_port()

        bht = Memory(width=2, depth=_btb_depth)
        bht_rp = bht.read_port()
        bht_wp = bht.write_port()

        m.submodules += btb_rp, btb_wp
        m.submodules += bht_rp, bht_wp

        btb_r = Record(_btb_layout)  # btb entry read for the prediction
        a_pc = Record(_pc_layout)
        f_pc = Record(_pc_layout)
        m_pc = Record(_pc_layout)
        hit = Signal()
        pstate_next = Signal(2)      # prediction state written back on update

        m.d.comb += [
            # While the A stage is stalled, re-read the entry for f_pc.
            btb_rp.addr.eq(Mux(self.a_stall, f_pc.index, a_pc.index)),
            bht_rp.addr.eq(Mux(self.a_stall, f_pc.index, a_pc.index)),
            btb_r.eq(btb_rp.data),
            #
            a_pc.eq(self.a_pc),
            f_pc.eq(self.f_pc),
            hit.eq(btb_r.valid & (btb_r.tag == f_pc.tag)),
            # Predict "taken" on a btb hit whose state has its upper bit set.
            self.f_prediction.eq(hit & bht_rp.data[1]),
            self.f_prediction_state.eq(bht_rp.data),
            self.f_prediction_pc.eq(btb_r.target)
        ]

        # update
        m.d.comb += [
            btb_wp.addr.eq(m_pc.index),
            # Same field order as _btb_layout: target, tag, valid = 1.
            # (assumes self.m_target_pc is 32 bits wide - TODO confirm)
            btb_wp.data.eq(Cat(self.m_target_pc, m_pc.tag, 1)),
            btb_wp.en.eq(self.m_update),
            bht_wp.addr.eq(m_pc.index),
            bht_wp.data.eq(pstate_next),
            bht_wp.en.eq(self.m_update),
            m_pc.eq(self.m_pc),
            # Default value; overridden by the Switch below.
            pstate_next.eq(0)
        ]

        # Next-state table. The selector is {take_jmp_branch, prediction_state},
        # with the state in the two low bits (assuming m_prediction_state is
        # 2 bits wide). The state moves one step up when the branch was taken
        # and one step down otherwise, saturating at 0b00 and 0b11.
        with m.Switch(Cat(self.m_prediction_state, self.m_take_jmp_branch)):
            with m.Case(0b000, 0b001):
                m.d.comb += pstate_next.eq(0b00)
            with m.Case(0b010, 0b100):
                m.d.comb += pstate_next.eq(0b01)
            with m.Case(0b011, 0b101):
                m.d.comb += pstate_next.eq(0b10)
            with m.Case(0b110, 0b111):
                m.d.comb += pstate_next.eq(0b11)

        return m
예제 #5
0
class Cache(Elaboratable):
    """Cache with one or two ways, refilled over a simple ``bus_*`` interface.

    Addresses are presented in two stages (``s1_*`` then ``s2_*``). A valid
    read in stage 2 that misses starts a line refill on the bus. When
    ``enable_write`` is true, stage-2 writes that hit update the stored line.
    """

    def __init__(self, nlines, nwords, nways, start_addr=0, end_addr=2**32, enable_write=True):
        """Validate the geometry and create the interface signals.

        Args:
            nlines: number of lines per way; must be a non-zero power of two.
            nwords: number of 32-bit words per line; must be 4, 8 or 16.
            nways: number of ways; must be 1 or 2.
            start_addr: start of the cached address range.
            end_addr: end of the cached address range; together with
                ``start_addr`` it determines how many address bits are used.
            enable_write: when true, the ``s2_wdata``/``s2_sel``/``s2_we``
                write interface is created.

        Raises:
            ValueError: if ``nlines``, ``nwords`` or ``nways`` is not one of
                the accepted values.
        """
        if nlines == 0 or (nlines & (nlines - 1)):
            raise ValueError(f'nlines must be a power of 2: {nlines}')
        if nwords not in (4, 8, 16):
            raise ValueError(f'nwords must be 4, 8 or 16: {nwords}')
        if nways not in (1, 2):
            raise ValueError(f'nways must be 1 or 2: {nways}')

        self.enable_write = enable_write
        self.nlines       = nlines
        self.nwords       = nwords
        self.nways        = nways
        offset_bits       = log2_int(nwords)
        line_bits         = log2_int(nlines)
        addr_bits         = log2_int(end_addr - start_addr, need_pow2=False)
        tag_bits          = addr_bits - line_bits - offset_bits - 2  # -2 because word line.
        # Bits left over so that the address record is 32 bits wide in total.
        extra_bits        = 32 - tag_bits - line_bits - offset_bits - 2

        # Address fields, lowest bits first.
        pc_layout = [
            ('byte',   2),
            ('offset', offset_bits),
            ('line',   line_bits),
            ('tag',    tag_bits)
        ]
        if (extra_bits != 0):
            pc_layout.append(('unused', extra_bits))

        # Stage 1 interface.
        self.s1_address = Record(pc_layout)
        self.s1_flush   = Signal()
        self.s1_valid   = Signal()
        self.s1_stall   = Signal()
        # Stage 2 interface.
        self.s2_address = Record(pc_layout)
        self.s2_evict   = Signal()
        self.s2_valid   = Signal()
        self.s2_miss    = Signal()
        self.s2_rdata   = Signal(32)
        self.s2_re      = Signal()
        if enable_write:
            self.s2_wdata = Signal(32)
            self.s2_sel   = Signal(4)
            self.s2_we    = Signal()

        # Refill bus interface.
        self.bus_addr  = Record(pc_layout)
        self.bus_valid = Signal()
        self.bus_last  = Signal()
        self.bus_data  = Signal(32)
        self.bus_ack   = Signal()
        self.bus_err   = Signal()

    def elaborate(self, platform):
        """Build the cache logic and return it as a ``Module``.

        Creates per-way valid bits, tag memory and data memory, the hit
        encoder, the replacement bit per line (two ways only) and the
        ``READ``/``REFILL`` state machine. ``platform`` is not used.
        """
        m = Module()

        # Per-way view: the line read from memory plus the control bits
        # computed for that way.
        way_layout = [
            ('data',     32 * self.nwords),
            ('tag',      self.s1_address.tag.shape()),
            ('valid',    1),
            ('sel_lru',  1)
        ]
        if self.enable_write:
            way_layout.append(('sel_we',   1))

        ways     = Array(Record(way_layout) for _way in range(self.nways))
        # Holds the word offset at which the refill ends (set when a refill starts).
        fill_cnt = Signal.like(self.s1_address.offset)
        # set the LRU
        if self.nways == 1:
            # Single way: there is nothing to choose between.
            lru = Const(0)  # self.nlines
        else:
            # One replacement bit per line; it flips when a refill completes.
            lru = Signal(self.nlines)
            with m.If(self.bus_valid & self.bus_ack & self.bus_last):  # err ^ ack == 1
                _lru = lru.bit_select(self.s2_address.line, 1)
                m.d.sync += lru.bit_select(self.s2_address.line, 1).eq(~_lru)

        # hit/miss
        way_hit = m.submodules.way_hit = Encoder(self.nways)
        for idx, way in enumerate(ways):
            m.d.comb += way_hit.i[idx].eq((way.tag == self.s2_address.tag) & way.valid)

        # The encoder's 'n' output is used as the miss flag.
        m.d.comb += self.s2_miss.eq(way_hit.n)
        if self.enable_write:
            # Mark the way selected by the hit encoder as the write target.
            m.d.comb += ways[way_hit.o].sel_we.eq(self.s2_we & self.s2_valid)

        # read data
        m.d.comb += self.s2_rdata.eq(ways[way_hit.o].data.word_select(self.s2_address.offset, 32))

        with m.FSM():
            with m.State('READ'):
                # Start a refill on a valid read that misses.
                with m.If(self.s2_re & self.s2_miss & self.s2_valid):
                    m.d.sync += [
                        self.bus_addr.eq(self.s2_address),  # WARNING extra_bits
                        self.bus_valid.eq(1),
                        # The refill starts at the missed word and ends at the
                        # word just before it (offset - 1).
                        fill_cnt.eq(self.s2_address.offset - 1)
                    ]
                    m.next = 'REFILL'
            with m.State('REFILL'):
                m.d.comb += self.bus_last.eq(fill_cnt == self.bus_addr.offset)
                with m.If(self.bus_ack):
                    m.d.sync += self.bus_addr.offset.eq(self.bus_addr.offset + 1)
                with m.If(self.bus_ack & self.bus_last | self.bus_err):
                    m.d.sync += self.bus_valid.eq(0)
                with m.If(~self.bus_valid | self.s1_flush):
                    # in case of flush, abort ongoing refill.
                    m.next = 'READ'
                    m.d.sync += self.bus_valid.eq(0)

        # mark the way to use (replace)
        m.d.comb += ways[lru.bit_select(self.s2_address.line, 1)].sel_lru.eq(self.bus_valid)

        # generate for N ways
        for way in ways:
            # create the memory structures for valid, tag and data.
            valid = Signal(self.nlines)

            tag_m  = Memory(width=len(way.tag), depth=self.nlines)
            tag_rp = tag_m.read_port()
            tag_wp = tag_m.write_port()
            m.submodules += tag_rp, tag_wp

            data_m  = Memory(width=len(way.data), depth=self.nlines)
            data_rp = data_m.read_port()
            # granularity=32: one write-enable bit per 32-bit word of the line.
            data_wp = data_m.write_port(granularity=32)
            m.submodules += data_rp, data_wp

            # handle valid
            # Priority: flush, then refill done, then refill error, then evict.
            with m.If(self.s1_flush & self.s1_valid):  # flush
                m.d.sync += valid.eq(0)
            with m.Elif(way.sel_lru & self.bus_last & self.bus_ack):  # refill ok
                m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(1)
            with m.Elif(way.sel_lru & self.bus_err):  # refill error
                m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(0)
            with m.Elif(self.s2_evict & self.s2_valid & (way.tag == self.s2_address.tag)):  # evict
                m.d.sync += valid.bit_select(self.s2_address.line, 1).eq(0)

            # assignments
            # While stage 1 is stalled the memories are re-read at the stage-2
            # line instead of the stage-1 line.
            m.d.comb += [
                tag_rp.addr.eq(Mux(self.s1_stall, self.s2_address.line, self.s1_address.line)),
                tag_wp.addr.eq(self.bus_addr.line),
                tag_wp.data.eq(self.bus_addr.tag),
                # The tag is written once, on the last acknowledged refill word.
                tag_wp.en.eq(way.sel_lru & self.bus_ack & self.bus_last),

                data_rp.addr.eq(Mux(self.s1_stall, self.s2_address.line, self.s1_address.line)),

                way.data.eq(data_rp.data),
                way.tag.eq(tag_rp.data),
                way.valid.eq(valid.bit_select(self.s2_address.line, 1))
            ]

            # update cache: CPU or Refill
            # The single data write port is shared between the two sources.
            if self.enable_write:
                update_addr = Signal(len(data_wp.addr))
                update_data = Signal(len(data_wp.data))
                update_we   = Signal(len(data_wp.en))
                aux_wdata   = Signal(32)

                # A refill in progress owns the write port; otherwise the CPU does.
                with m.If(self.bus_valid):
                    m.d.comb += [
                        update_addr.eq(self.bus_addr.line),
                        # The word is replicated across the line; the single
                        # enable bit set below picks the word actually written.
                        update_data.eq(Repl(self.bus_data, self.nwords)),
                        update_we.bit_select(self.bus_addr.offset, 1).eq(way.sel_lru & self.bus_ack),
                    ]
                with m.Else():
                    m.d.comb += [
                        update_addr.eq(self.s2_address.line),
                        update_data.eq(Repl(aux_wdata, self.nwords)),
                        # CPU writes only update the cache on a hit.
                        update_we.bit_select(self.s2_address.offset, 1).eq(way.sel_we & ~self.s2_miss)
                    ]
                m.d.comb += [
                    # Build the word to store for a CPU write: each byte comes from
                    # s2_wdata when its s2_sel bit is set, else from the current s2_rdata.
                    aux_wdata.eq(Cat(
                        Mux(self.s2_sel[0], self.s2_wdata.word_select(0, 8), self.s2_rdata.word_select(0, 8)),
                        Mux(self.s2_sel[1], self.s2_wdata.word_select(1, 8), self.s2_rdata.word_select(1, 8)),
                        Mux(self.s2_sel[2], self.s2_wdata.word_select(2, 8), self.s2_rdata.word_select(2, 8)),
                        Mux(self.s2_sel[3], self.s2_wdata.word_select(3, 8), self.s2_rdata.word_select(3, 8))
                    )),
                    #
                    data_wp.addr.eq(update_addr),
                    data_wp.data.eq(update_data),
                    data_wp.en.eq(update_we),
                ]
            else:
                # Read-only cache: the write port is used by the refill only.
                m.d.comb += [
                    data_wp.addr.eq(self.bus_addr.line),
                    data_wp.data.eq(Repl(self.bus_data, self.nwords)),
                    data_wp.en.bit_select(self.bus_addr.offset, 1).eq(way.sel_lru & self.bus_ack),
                ]

        return m