예제 #1
0
def InputImageRAM(cirb, circuit, nextNodeInput, imgSrc, pxPerClock, parallelism=None):
    """Create a memory sized for an image and wire its write and read sides.

    The write side is driven by ``circuit.input_wdata`` / ``circuit.input_wen``
    and the read data is connected to ``nextNodeInput``. When ``parallelism``
    is given, an extra counter of ``int(pxPerClock / parallelism)`` states sits
    between ``circuit.input_ren`` and the read-address counter, so the read
    address only advances on that counter's carry-out.

    Returns the memory instance.
    """
    image = loadImage(imgSrc, pxPerClock)
    row_count = image.numRows
    ram = CoreirMem(cirb, row_count, image.bitsPerRow)
    throttled = parallelism is not None

    # address counter for the write port; steps once per write enable
    write_addr = SizedCounterModM(row_count, has_ce=True)
    if throttled:
        # holds each row on the read port for several read-enabled clocks
        read_gate = SizedCounterModM(int(pxPerClock / parallelism), has_ce=True, cout=True)
        # the gate's count value is unused, so it is sent to a terminator
        gate_sink = Term(cirb, len(read_gate.O))
    read_addr = SizedCounterModM(row_count, has_ce=True)

    wire(write_addr.O, ram.waddr)
    wire(circuit.input_wdata, ram.wdata)
    wire(read_addr.O, ram.raddr)
    connectArraysAndArraysofArrays(ram.rdata, nextNodeInput,
                                   pxPerClock, image.bitsPerPixel)
    if not throttled:
        wire(circuit.input_ren, read_addr.CE)
    else:
        wire(circuit.input_ren, read_gate.CE)
        wire(read_gate.COUT, read_addr.CE)
        wire(read_gate.O, gate_sink.I)
    wire(circuit.input_wen, ram.wen)
    wire(circuit.input_wen, write_addr.CE)

    return ram
예제 #2
0
        def definition(fifo):
            """Build the FIFO from a RAM, read/write address counters and an occupancy counter.

            ``ready_up`` is high while the occupancy counter is not at its
            maximum, ``valid_down`` while it is not zero. Both are additionally
            gated by ``CE`` when ``has_ce`` is set.
            """
            # one RAM entry per clock of each of the n elements, optionally
            # with one spare entry
            depth = n * time_per_element
            if expand_max_by_one_clk:
                depth += 1

            read_addr = SizedCounterModM(depth,
                                         has_ce=True,
                                         has_reset=has_reset)
            write_addr = SizedCounterModM(depth,
                                          has_ce=True,
                                          has_reset=has_reset)
            # occupancy must be able to express both "empty" (0) and "all
            # entries written" (depth), hence depth + 1 states
            occupancy = CeilFloorUpDownCounter(depth + 1,
                                               has_ce=has_ce,
                                               has_reset=has_reset)

            # can take input unless every entry is occupied
            is_full = Decode(depth, occupancy.O.N)(occupancy.O)
            can_accept = ~is_full
            # can emit output unless nothing is stored
            is_empty = Decode(0, occupancy.O.N)(occupancy.O)
            can_emit = ~is_empty

            # with a clock enable, handshake signals are only asserted while CE is high
            if has_ce:
                can_accept = can_accept & bit(fifo.CE)
                can_emit = can_emit & bit(fifo.CE)
                wire(occupancy.CE, fifo.CE)

            popping = can_emit & fifo.ready_down
            pushing = can_accept & fifo.valid_up
            wire(read_addr.CE, popping)
            wire(write_addr.CE, pushing)
            wire(occupancy.U, pushing)
            wire(occupancy.D, popping)

            if has_reset:
                for counter in (read_addr, write_addr, occupancy):
                    wire(fifo.RESET, counter.RESET)

            storage = DefineRAMAnyType(T, depth)()
            wire(storage.WADDR, write_addr.O)
            wire(storage.RADDR, read_addr.O)
            wire(storage.WDATA, fifo.I)
            wire(storage.RDATA, fifo.O)
            wire(storage.WE, pushing)

            wire(can_emit, fifo.valid_down)
            wire(can_accept, fifo.ready_up)
예제 #3
0
        def definition(downsampleSequential):
            """Pass the input straight to the output and mark it valid only for element ``idx`` of every ``n``.

            A mod-``n`` counter tracks the current element; when
            ``time_per_element > 1`` a second counter tracks the clock within
            the element and the element counter steps on its last clock.
            """
            ds = downsampleSequential
            multi_clock = time_per_element > 1

            elem_counter = SizedCounterModM(n,
                                            has_ce=True,
                                            has_reset=has_reset)
            at_kept_element = Decode(idx, elem_counter.O.N)(elem_counter.O)

            # advance when upstream offers data and either this element is
            # dropped or downstream can take the kept element
            running = ds.valid_up & \
                      ((at_kept_element & ds.ready_down) | (~at_kept_element))
            # can accept input whenever a kept element could be forwarded or
            # the element is dropped anyway
            accept = (at_kept_element & ds.ready_down) | (~at_kept_element)
            # output is meaningful only for the kept element with valid input
            emit = at_kept_element & ds.valid_up

            if has_ce:
                running = running & bit(ds.CE)
                accept = accept & bit(ds.CE)
                emit = emit & bit(ds.CE)

            if multi_clock:
                clock_in_elem = SizedCounterModM(time_per_element,
                                                 has_ce=True,
                                                 has_reset=has_reset)
                last_clock_of_elem = Decode(time_per_element - 1,
                                            clock_in_elem.O.N)(clock_in_elem.O)

                wire(clock_in_elem.CE, running)
                wire(elem_counter.CE, running & last_clock_of_elem)
            else:
                wire(elem_counter.CE, running)

            if has_reset:
                if multi_clock:
                    wire(clock_in_elem.RESET, ds.RESET)
                wire(elem_counter.RESET, ds.RESET)

            wire(ds.I, ds.O)
            wire(emit, ds.valid_down)
            wire(accept, ds.ready_up)
예제 #4
0
def OutputImageRAM(cirb, circuit, prevNodeOutput, writeValidSignal, imgSrc, pxPerClock):
    """Create a memory sized for an image that captures ``prevNodeOutput``.

    Writes happen whenever ``writeValidSignal`` is high; the stored rows are
    read back through ``circuit.output_rdata``, with the read address stepping
    on ``circuit.output_ren``.

    Returns the memory instance.
    """
    image = loadImage(imgSrc, pxPerClock)
    row_count = image.numRows
    ram = CoreirMem(cirb, row_count, image.bitsPerRow)

    # one address counter per port, each stepping on its own enable
    write_addr = SizedCounterModM(row_count, has_ce=True)
    read_addr = SizedCounterModM(row_count, has_ce=True)

    # write port
    wire(write_addr.O, ram.waddr)
    connectArraysAndArraysofArrays(ram.wdata, prevNodeOutput,
                                   pxPerClock, image.bitsPerPixel)
    # read port
    wire(read_addr.O, ram.raddr)
    wire(ram.rdata, circuit.output_rdata)
    wire(circuit.output_ren, read_addr.CE)
    # the same signal enables the write and advances the write address
    wire(writeValidSignal, ram.wen)
    wire(writeValidSignal, write_addr.CE)
    return ram
 def definition(cls):
     """Drive ``cls.valid`` high once a counter has reached ``num_clocks_delay``.

     The counter is only enabled while its value is below
     ``num_clocks_delay`` (and, with ``has_ce``, while ``CE`` is high), so it
     stops at that value.
     """
     delay_counter = SizedCounterModM(num_clocks_delay + 1,
                                      has_ce=True,
                                      has_reset=has_reset)
     delay_target = DefineCoreirConst(len(delay_counter.O),
                                      num_clocks_delay)()

     # keep counting only until the target is reached
     keep_counting = delay_counter.O < delay_target.O
     if has_ce:
         keep_counting = enable(bit(cls.CE) & keep_counting)
     wire(keep_counting, delay_counter.CE)

     wire(delay_counter.O == delay_target.O, cls.valid)
     if has_reset:
         wire(cls.RESET, delay_counter.RESET)
def test_sized_counter_modm():
    """Build a circuit exposing a mod-3 counter with CE tied high and create a simulator for it."""
    port_decls = ['O', Out(Array[2, Bit])] + ClockInterface(False, False)
    coreir_namespaces = [
        "aetherlinglib", "commonlib", "mantle", "coreir", "global"
    ]

    testcircuit = DefineCircuit('sized_counter_modm_test', *port_decls)

    mod3_counter = SizedCounterModM(3, has_ce=True)
    # always enabled
    wire(1, mod3_counter.CE)
    wire(testcircuit.O, mod3_counter.O)
    EndCircuit()

    # To dump the circuit for debugging:
    # save_CoreIR_json(cirb, testcircuit, "sized_counter_modm.json")
    sim = CoreIRSimulator(testcircuit,
                          testcircuit.CLK,
                          namespaces=coreir_namespaces)
예제 #7
0
        def definition(partition):
            """Emit ``subsetSize`` elements of the input array per clock, selected by a counter.

            The input is flattened to bits, ``subsetSize`` muxes each pick one
            element out of every ``subsetSize``-th input, and the selected bits
            are converted back to ``partition.elementType``.
            """
            # how many inputs each mux chooses between; also the counter period
            inputs_per_mux = int(arrayType.N / subsetSize)

            to_bits = MapParallel(arrayType.N, Dehydrate(partition.elementType))
            # one mux per output element emitted each clock
            selectors = MapParallel(subsetSize, CommonlibMuxN(inputs_per_mux,
                                              len(to_bits.out[0])))
            from_bits = MapParallel(subsetSize, Hydrate(partition.elementType))
            subset_idx = SizedCounterModM(inputs_per_mux, has_ce=has_ce)

            wire(partition.I, to_bits.I)
            for lane in range(subsetSize):
                # mux `lane` sees inputs lane, lane + subsetSize, lane + 2*subsetSize, ...
                # so on each clock it outputs element `lane` of the current subset
                wire(to_bits.out[lane::subsetSize], selectors.I[lane].data)
                wire(subset_idx.O, selectors.I[lane].sel)
            wire(selectors.out, from_bits.I)
            wire(from_bits.out, partition.O)
            if has_ce:
                wire(partition.CE, subset_idx.CE)
def test_multiple_sipo_and_counter():
    """Build a circuit holding a mapped SIPO and a mod-3 counter, then create a simulator for it."""
    port_decls = [
        'I', In(Bit),
        'O_sipo', Out(Array[4, Bit]),
        'O_counter', Out(Array[2, Bit]),
    ] + ClockInterface(False, False)
    coreir_namespaces = [
        "aetherlinglib", "commonlib", "mantle", "coreir", "global"
    ]

    testcircuit = DefineCircuit('multiple_sipo_and_counter_test', *port_decls)

    # a single SIPO wrapped in MapParallel, with CE tied high
    sipo_map = MapParallel(1, SIPO(4, 0, has_ce=True))
    wire(1, sipo_map.CE[0])
    wire(testcircuit.I, sipo_map.I[0])
    wire(testcircuit.O_sipo, sipo_map.O[0])

    # a free-running mod-3 counter next to it
    mod3_counter = SizedCounterModM(3, has_ce=True)
    wire(1, mod3_counter.CE)
    wire(testcircuit.O_counter, mod3_counter.O)
    EndCircuit()

    # To dump the circuit for debugging:
    # save_CoreIR_json(cirb, testcircuit, "multiple_sipo_and_counter.json")
    sim = CoreIRSimulator(testcircuit,
                          testcircuit.CLK,
                          namespaces=coreir_namespaces)
예제 #9
0
        def definition(cls):
            """Wire an AnyDimensionalLineBuffer to this circuit's ports.

            When ``stride_rows <= rows_of_pixels_per_clock`` the line buffer's
            2D window output is flattened row-major onto ``cls.O`` and its
            valid is passed through. Otherwise the windows are routed through a
            DelayedBuffer, which then drives ``cls.O`` and ``cls.valid``; with
            ``add_debug_interface`` the undelayed windows and the buffer's
            internal signals are also exposed.
            """
            lb = AnyDimensionalLineBuffer(
                pixel_type,
                [rows_of_pixels_per_clock, pixels_per_row_per_clock],
                [window_rows, window_cols], [image_rows, image_cols],
                [stride_rows, stride_cols], [origin_rows, origin_cols])
            wire(cls.I, lb.I)
            if stride_rows <= rows_of_pixels_per_clock:
                # direct path: flatten lb.O[row][col] into cls.O in row-major order
                for row_of_windows in range(cls.rows_of_windows_per_clock):
                    for window_per_row in range(cls.windows_per_row_per_clock):
                        wire(
                            cls.O[row_of_windows *
                                  cls.windows_per_row_per_clock +
                                  window_per_row],
                            lb.O[row_of_windows][window_per_row])
                wire(cls.valid, lb.valid)

            else:
                if add_debug_interface:
                    # expose the line buffer output before it enters the delayed buffer
                    for row_of_windows in range(cls.rows_of_windows_per_clock):
                        for window_per_row in range(
                                cls.windows_per_row_per_clock):
                            wire(
                                cls.undelayedO[row_of_windows *
                                               cls.windows_per_row_per_clock +
                                               window_per_row],
                                lb.O[row_of_windows][window_per_row])
                db = DelayedBuffer(Array[window_rows, Array[window_cols,
                                                            pixel_type]],
                                   image_cols // stride_cols,
                                   max(pixels_per_row_per_clock // stride_cols,
                                       1),
                                   cls.time_per_buffered_cycle,
                                   add_debug_interface=add_debug_interface)
                # feed the flattened (row-major) windows into the delayed buffer
                for row_of_windows in range(cls.rows_of_windows_per_clock):
                    for window_per_row in range(cls.windows_per_row_per_clock):
                        wire(
                            db.I[row_of_windows * cls.windows_per_row_per_clock
                                 + window_per_row],
                            lb.O[row_of_windows][window_per_row])
                wire(lb.valid, db.WE)
                wire(db.valid, cls.valid)
                wire(db.O, cls.O)

                # A mod-2 counter that is enabled only while it reads 0 and
                # lb.valid is high: it steps once, the first time the line
                # buffer is valid, and then holds. Its low bit therefore
                # records that the line buffer has been valid at least once
                # (presumably the counter starts at 0 - confirm).
                first_valid_counter = SizedCounterModM(2, has_ce=True)
                zero_const = DefineCoreirConst(1, 0)()
                wire(lb.valid & (zero_const.O == first_valid_counter.O),
                     first_valid_counter.CE)
                # Delay CE by one register before giving it to the delayed
                # buffer, because the line buffer output reaches the delayed
                # buffer one clock later.
                # NOTE(review): the original comment about ready-valid chains
                # ends mid-sentence ("... don't have to wait until").
                delayed_ce_for_db_valid = DefineRegisterAnyType(Bit)()
                wire(bit(cls.CE), delayed_ce_for_db_valid.I)
                #ce_or_last_valid = bit(cls.CE) | (lb.valid & ~last_clock_lb_valid.O)
                # OR lb.valid with the counter bit: on the first valid clock
                # lb.valid is already 1 while the counter still reads 0
                wire((lb.valid | first_valid_counter.O[0])
                     & delayed_ce_for_db_valid.O, db.CE)
                if add_debug_interface:
                    # mirror the delayed buffer's control and RAM signals on debug ports
                    wire((lb.valid | first_valid_counter.O[0])
                         & delayed_ce_for_db_valid.O, cls.dbCE)
                    wire(lb.valid, cls.dbWE)
                    wire(db.WDATA, cls.WDATA)
                    wire(db.RDATA, cls.RDATA)
                    wire(db.WADDR, cls.WADDR)
                    wire(db.RADDR, cls.RADDR)
                    wire(db.RAMWE, cls.RAMWE)

            # common to both paths: the line buffer shares this circuit's CE,
            # and ready is tied high
            wire(cls.CE, lb.CE)
            wire(cls.ready, 1)
예제 #10
0
        def definition(cls):
            """Build a banked buffer: ``k`` RAMs of ``n // k`` entries, written in parallel and read out through a mux.

            Inputs are written to all banks at a shared address whenever
            ``WE & CE`` is high. On the read side, counters derived from
            ``total_emitting_period`` choose the shared read address and which
            group of ``cls.out_per_clock`` banks is forwarded to ``cls.O``.
            Reading can be held off for ``initial_emitting_delay`` clocks; when
            that delay is 0 the input is forwarded directly while the
            emitting-period counter reads 0.
            """
            rams = DefineNativeMapParallel(k, DefineRAMAnyType(t, n // k))()

            # each clock WE is set, write to the RAMs and increment the address
            writing_location_per_bank = SizedCounterModM(n // k, has_ce=True)
            wire(cls.I, rams.WDATA)
            ramEnableWire = cls.WE & bit(cls.CE)
            if add_debug_interface:
                wire(cls.I, cls.WDATA)
                wire(ramEnableWire, cls.RAMWE)
            for i in range(k):
                # all banks share one write address and one write enable
                wire(writing_location_per_bank.O, rams.WADDR[i])
                if add_debug_interface:
                    wire(writing_location_per_bank.O, cls.WADDR[i])
                wire(ramEnableWire, rams.WE[i])
            wire(cls.WE & bit(cls.CE), writing_location_per_bank.CE)

            # read-side enable: CE, optionally held off until the initial delay has elapsed
            if initial_emitting_delay > 0:
                initial_delay_counter = InitialDelayCounter(
                    initial_emitting_delay)
                ce_with_delay = bit(cls.CE) & initial_delay_counter.valid
            else:
                ce_with_delay = bit(cls.CE)

            # the bank ram counter tracks which group of entries in all the banked rams RADDR should be set to
            ticks_per_element = total_emitting_period // (n //
                                                          cls.out_per_clock)
            ticks_per_row_of_elements = ticks_per_element * (k //
                                                             cls.out_per_clock)
            # this completes a cycle every time the current_element_per_bank_ram increments by 1
            if n // k == 1:
                # a single entry per bank: the read address is a constant 0
                current_element_per_banked_ram_counter = DefineCoreirConst(
                    1, 0)()
            elif ticks_per_row_of_elements == 1:
                # a new row every enabled clock: the address counter runs directly off the enable
                current_element_per_banked_ram_counter = SizedCounterModM(
                    n // k, has_ce=True)
                wire(ce_with_delay, current_element_per_banked_ram_counter.CE)
            else:
                # a row lasts several clocks: count ticks and step the address
                # when the tick counter is at its last value
                bank_ram_tick_counter = SizedCounterModM(
                    ticks_per_row_of_elements, has_ce=True)
                ticks_per_row_of_elements_const = DefineCoreirConst(
                    len(bank_ram_tick_counter.O),
                    ticks_per_row_of_elements - 1)()

                current_element_per_banked_ram_counter = SizedCounterModM(
                    n // k, has_ce=True)
                wire(ce_with_delay, bank_ram_tick_counter.CE)
                wire(
                    ce_with_delay & (bank_ram_tick_counter.O
                                     == ticks_per_row_of_elements_const.O),
                    current_element_per_banked_ram_counter.CE)

            for i in range(k):
                # all banks share one read address
                wire(current_element_per_banked_ram_counter.O, rams.RADDR[i])
                if add_debug_interface:
                    wire(current_element_per_banked_ram_counter.O,
                         cls.RADDR[i])

            # the mux bank selector counter tracks which of the banks to read from right now

            # divide the number of ticks per row by the number of mux outputs per row
            # (k // cls.out_per_clock) to get ticks per mux output
            outputs_per_row = k // cls.out_per_clock
            ticks_per_mux_output = ticks_per_row_of_elements // outputs_per_row
            if ticks_per_mux_output == 1:
                # every tick is a new mux output: a constant 0 stands in for the counter
                ticks_per_mux_counter = DefineCoreirConst(1, 0)()
            else:
                ticks_per_mux_counter = SizedCounterModM(ticks_per_mux_output,
                                                         has_ce=True)
                wire(ce_with_delay, ticks_per_mux_counter.CE)

            # this counter completes a cycle once for every mux output
            if outputs_per_row == 1:
                # only one group of banks: the select is a constant 0
                mux_bank_selector_counter = DefineCoreirConst(1, 0)()
            elif ticks_per_mux_output == 1:
                mux_bank_selector_counter = SizedCounterModM(outputs_per_row,
                                                             has_ce=True)
                wire(ce_with_delay, mux_bank_selector_counter.CE)
            else:
                # step the bank selector when the per-output tick counter is at its last value
                ticks_per_mux_output_const = DefineCoreirConst(
                    len(ticks_per_mux_counter.O), ticks_per_mux_output - 1)()

                mux_bank_selector_counter = SizedCounterModM(outputs_per_row,
                                                             has_ce=True)
                wire(
                    ce_with_delay &
                    (ticks_per_mux_counter.O == ticks_per_mux_output_const.O),
                    mux_bank_selector_counter.CE)

            ram_bank_selector = MuxAnyType(Array[cls.out_per_clock, t],
                                           k // cls.out_per_clock)
            for i in range(k):
                # bank i feeds mux input group i // out_per_clock, slot i % out_per_clock
                wire(
                    rams.RDATA[i], ram_bank_selector.data[
                        i // cls.out_per_clock][i % cls.out_per_clock])
                if add_debug_interface:
                    wire(rams.RDATA[i], cls.RDATA[i])
            wire(mux_bank_selector_counter.O, ram_bank_selector.sel)

            # if not delaying,
            # remove the latency of the RAMs by emitting the first input on the first clock immediately
            if initial_emitting_delay == 0:
                first_input_or_rams = MuxAnyType(Array[cls.out_per_clock, t],
                                                 2)
                wire(cls.I[0:cls.out_per_clock], first_input_or_rams.data[1])
                wire(ram_bank_selector.out, first_input_or_rams.data[0])

                # emit input directly only on first clock
                # a counter that tracks the current clock in the total emitting period
                point_in_emitting_period = SizedCounterModM(
                    total_emitting_period, has_ce=True)
                zero_const = DefineCoreirConst(len(point_in_emitting_period.O),
                                               0)()

                # don't delay this if delaying output, as then don't want input, want whatever ram says.
                wire(cls.CE, point_in_emitting_period.CE)

                # select the direct input while the period counter reads 0
                wire(point_in_emitting_period.O == zero_const.O,
                     first_input_or_rams.sel[0])

                wire(first_input_or_rams.out, cls.O)
            else:
                wire(ram_bank_selector.out, cls.O)

            # valid on first enabled clock where on new output of mux
            # (zero_const is rebound here, sized to the per-output tick counter)
            zero_const = DefineCoreirConst(len(ticks_per_mux_counter.O), 0)()

            wire(
                bit(ce_with_delay) & (ticks_per_mux_counter.O == zero_const.O),
                cls.valid)
예제 #11
0
        def definition(TSBankGenerator):
            """Generate the bank and address for each of the ``ni`` lanes.

            For every lane this wires
            ``((flat_idx + flat_idx / lcm(no, ni)) % ni)`` to
            ``TSBankGenerator.bank[lane]`` and the low bits of a time counter
            to ``TSBankGenerator.addr[lane]``. The flat index of lane 0 comes
            from a counter stepping by ``ni``; other lanes add a constant
            offset to it. Counters advance once per element, i.e. on the last
            clock of an element when ``time_per_element > 1``.
            """
            flat_idx_width = getRAMAddrWidth(no * ni)
            multi_clock_elements = time_per_element > 1

            # next element each time_per_element clock
            if multi_clock_elements:
                index_in_cur_element = SizedCounterModM(time_per_element,
                                                        has_ce=has_ce,
                                                        has_reset=has_reset)
                next_element = Decode(time_per_element - 1,
                                      index_in_cur_element.O.N)(
                                          index_in_cur_element.O)
            else:
                # advance every clock; take bit 0 of the constant so that
                # next_element is a single bit in both branches
                next_element = DefineCoreirConst(1, 1)().O[0]

            # each element of the SSeq is a separate vector lane
            # (the counter is used as an instance below, so it must not be
            # called again after construction)
            first_lane_flat_idx = SizedCounterModM((no + io) * ni,
                                                   incr=ni,
                                                   has_ce=True,
                                                   has_reset=has_reset)
            time_counter = SizedCounterModM(no + io,
                                            has_ce=True,
                                            has_reset=has_reset)
            wire(next_element, first_lane_flat_idx.CE)
            wire(next_element, time_counter.CE)

            # index_in_cur_element only exists when elements span several clocks
            # NOTE(review): when time_per_element == 1, has_ce has no effect on
            # the counters here - confirm that is intended.
            if multi_clock_elements:
                if has_ce:
                    wire(TSBankGenerator.CE, index_in_cur_element.CE)
                if has_reset:
                    wire(TSBankGenerator.RESET, index_in_cur_element.RESET)
            if has_reset:
                wire(TSBankGenerator.RESET, first_lane_flat_idx.RESET)
                wire(TSBankGenerator.RESET, time_counter.RESET)

            lane_flat_idxs = [first_lane_flat_idx.O]

            # compute the current flat_idx for each lane
            # NOTE(review): the per-lane offset i * no and the counter's width
            # versus flat_idx_width are kept as in the original - verify.
            for i in range(1, ni):
                cur_lane_flat_idx_adder = DefineAdd(flat_idx_width)()
                wire(cur_lane_flat_idx_adder.I0, first_lane_flat_idx.O)
                wire(cur_lane_flat_idx_adder.I1,
                     DefineCoreirConst(flat_idx_width, i * no)().O)

                lane_flat_idxs.append(cur_lane_flat_idx_adder.O)

            lane_flat_div_lcms = []
            # compute flat_idx / lcm_dim for each lane
            for i in range(ni):
                cur_lane_lcm_div = DefineUDiv(flat_idx_width)()
                # divide this lane's own flat index (entries are already output ports)
                wire(cur_lane_lcm_div.I0, lane_flat_idxs[i])
                # constant is (width, value), matching the other constants here
                wire(cur_lane_lcm_div.I1,
                     DefineCoreirConst(flat_idx_width, lcm(no, ni))().O)

                # keep the quotient, not the lane-index adder output
                lane_flat_div_lcms.append(cur_lane_lcm_div.O)

            # compute ((flat_idx % sseq_dim) + (flat_idx / lcm_dim)) % sseq_dim for each lane
            # note that s_ts == flat_idx % sseq_dim
            # only need to mod sseq_dim at end as that is same as also doing it flat_idx before addition
            for i in range(ni):
                pre_mod_add = DefineAdd(flat_idx_width)()
                wire(pre_mod_add.I0, lane_flat_idxs[i])
                wire(pre_mod_add.I1, lane_flat_div_lcms[i])

                bank_mod = DefineUMod(flat_idx_width)()
                wire(bank_mod.I0, pre_mod_add.O)
                # the modulus goes to the second operand
                wire(bank_mod.I1, DefineCoreirConst(flat_idx_width, ni)().O)

                wire(TSBankGenerator.bank[i],
                     bank_mod.O[0:TSBankGenerator.bank_width])

            # compute t for each lane addr
            for i in range(ni):
                wire(TSBankGenerator.addr[i],
                     time_counter.O[0:TSBankGenerator.addr_width])
예제 #12
0
        def definition(upsampleSequential):
            """Repeat each input element ``n`` times on the output with ready-valid handshaking.

            While the element counter reads 0 the input is forwarded directly
            to the output and also stored; for the remaining repetitions the
            stored value is emitted. The store is a RAM of ``time_per_element``
            entries when an element spans several clocks, otherwise a single
            register.
            """
            # the counter of the current element of output sequence, when hits 0, load the next input to upsample
            element_idx_counter = SizedCounterModM(n,
                                                   has_ce=True,
                                                   has_reset=has_reset)
            is_first_element = Decode(0, element_idx_counter.O.N)(
                element_idx_counter.O)
            if has_reset:
                wire(upsampleSequential.RESET, element_idx_counter.RESET)

            # enabled means run the circuit
            # do this when downstream is ready, so have something to communicate with,
            # and when in first element and upstream is valid or have data to repeat
            enabled = upsampleSequential.ready_down & \
                      ((is_first_element & upsampleSequential.valid_up) | (~is_first_element))
            # ready means can accept input when get valid from upstream
            # do this when in first element and downstream ready to accept
            ready = is_first_element & upsampleSequential.ready_down
            # valid means can emit downstream
            # valid when in first element and upstream valid or repeating old data
            valid = (is_first_element
                     & upsampleSequential.valid_up) | (~is_first_element)

            # only assert these signals when CE is high or no CE
            if has_ce:
                enabled = enabled & bit(upsampleSequential.CE)
                ready = ready & bit(upsampleSequential.CE)
                valid = valid & bit(upsampleSequential.CE)

            if time_per_element > 1:
                # multi-clock elements: store every clock of the element in a RAM
                value_store = DefineRAMAnyType(T, time_per_element)()
                value_store_input = value_store.WDATA
                value_store_output = value_store.RDATA

                time_per_element_counter = SizedCounterModM(
                    time_per_element, has_ce=True, has_reset=has_reset)
                go_to_next_element = Decode(time_per_element - 1,
                                            time_per_element_counter.O.N)(
                                                time_per_element_counter.O)

                # the element counter steps only on the last clock of an element
                wire(time_per_element_counter.CE, enabled)
                wire(element_idx_counter.CE, enabled & go_to_next_element)
                wire(value_store.WE, is_first_element & enabled)
                # location in current element is where to read and write.
                # will write on first iteration through element, read on later iterations
                wire(time_per_element_counter.O, value_store.WADDR)
                wire(time_per_element_counter.O, value_store.RADDR)

                if has_reset:
                    wire(time_per_element_counter.RESET,
                         upsampleSequential.RESET)

            else:
                # single-clock elements: a register with CE is enough
                value_store = DefineRegisterAnyType(T, has_ce=True)()
                value_store_input = value_store.I
                value_store_output = value_store.O

                wire(element_idx_counter.CE, enabled)
                wire(value_store.CE, is_first_element & enabled)

            output_selector = DefineMuxAnyType(T, 2)()

            wire(upsampleSequential.I, value_store_input)

            # on first element, send the input directly out. otherwise, use the stored value
            wire(is_first_element, output_selector.sel[0])
            wire(value_store_output, output_selector.data[0])
            wire(upsampleSequential.I, output_selector.data[1])
            wire(output_selector.out, upsampleSequential.O)

            wire(valid, upsampleSequential.valid_down)
            wire(ready, upsampleSequential.ready_up)
예제 #13
0
        def definition(serializer):
            """Emit the ``n`` elements of the input array one after another with ready-valid handshaking.

            While the element counter reads 0 the whole input is captured in
            ``n`` parallel stores and ``serializer.I[0]`` is forwarded directly;
            afterwards a mux indexed by the element counter picks the stored
            element to emit. Each store is a RAM of ``time_per_element``
            entries when an element spans several clocks, otherwise a register.
            """
            # the counter of the current element of output sequence, when hits 0, load the next input to serialize
            element_idx_counter = SizedCounterModM(n,
                                                   has_ce=True,
                                                   has_reset=has_reset)
            is_first_element = Decode(0, element_idx_counter.O.N)(
                element_idx_counter.O)

            # enabled means run the circuit
            # do this when downstream is ready, so have something to communicate with,
            # and when in first element and upstream is valid or have serialized data to emit
            enabled = serializer.ready_down & \
                      ((is_first_element & serializer.valid_up) | (~is_first_element))
            # ready means can accept input when get valid from upstream
            # do this when in first element and downstream ready to accept
            ready = is_first_element & serializer.ready_down
            # valid means can emit downstream
            # valid when in first element and upstream valid or have serialized data to emit
            valid = (is_first_element
                     & serializer.valid_up) | (~is_first_element)
            # only assert these signals when CE is high or no CE
            if has_ce:
                enabled = enabled & bit(serializer.CE)
                ready = ready & bit(serializer.CE)
                valid = valid & bit(serializer.CE)

            if has_reset:
                wire(serializer.RESET, element_idx_counter.RESET)

            # if each element takes multiple clocks, need a ram so can write all them and read them over multiple clocks
            if time_per_element > 1:
                value_store = DefineNativeMapParallel(
                    n, DefineRAMAnyType(T, time_per_element))()
                value_store_input = value_store.WDATA
                value_store_output = value_store.RDATA

                time_per_element_counter = SizedCounterModM(
                    time_per_element, has_ce=True, has_reset=has_reset)
                go_to_next_element = Decode(time_per_element - 1,
                                            time_per_element_counter.O.N)(
                                                time_per_element_counter.O)

                # the element counter steps only on the last clock of an element
                wire(time_per_element_counter.CE, enabled)
                wire(element_idx_counter.CE, enabled & go_to_next_element)
                for input_idx in range(n):
                    wire(value_store.WE[input_idx], is_first_element & enabled)
                    # location in current element is where to read and write.
                    # will write on first iteration through element, read on later iterations
                    wire(time_per_element_counter.O,
                         value_store.WADDR[input_idx])
                    wire(time_per_element_counter.O,
                         value_store.RADDR[input_idx])

                if has_reset:
                    wire(time_per_element_counter.RESET, serializer.RESET)

            else:
                # single-clock elements: one register with CE per input element
                value_store = DefineNativeMapParallel(
                    n, DefineRegisterAnyType(T, has_ce=True))()
                value_store_input = value_store.I
                value_store_output = value_store.O

                wire(element_idx_counter.CE, enabled)
                for input_idx in range(n):
                    wire(value_store.CE[input_idx], is_first_element & enabled)

            wire(serializer.I, value_store_input)

            # to serialize, go through all different rams/registers in value store
            # and select the output from the ith one, where i is current output element
            value_store_output_selector = DefineMuxAnyType(T, n)()
            wire(value_store_output, value_store_output_selector.data)
            wire(element_idx_counter.O, value_store_output_selector.sel)

            # on first element, send the input directly out. otherwise, use the stored value
            first_element_output_selector = DefineMuxAnyType(T, 2)()
            wire(is_first_element, first_element_output_selector.sel[0])
            # NOTE(review): the mux instance itself (not its .out port) is
            # passed to wire here - presumably resolved to its single output; confirm.
            wire(value_store_output_selector,
                 first_element_output_selector.data[0])
            wire(serializer.I[0], first_element_output_selector.data[1])
            wire(first_element_output_selector.out, serializer.O)

            wire(valid, serializer.valid_down)
            wire(ready, serializer.ready_up)
예제 #14
0
        def definition(deserializer):
            """Wire up the body of the deserializer circuit.

            Elements arrive one at a time on ``I``. The first ``n - 1`` are
            captured in a value store (RAMs when an element spans several
            clocks, registers otherwise) and the nth is passed straight to
            ``O[n - 1]``, so the whole sequence is presented on ``O`` during
            the period in which the nth input is received.
            """
            # tracks which element of the sequence is currently arriving;
            # the assembled sequence is emitted while it sits at n - 1
            slot_counter = SizedCounterModM(n,
                                            has_ce=True,
                                            has_reset=has_reset)
            on_final_slot = Decode(n - 1, slot_counter.O.N)(slot_counter.O)

            # enabled: advance the circuit. Requires valid input from upstream
            # and, if this period emits the sequence, a ready downstream.
            downstream_takes_output = on_final_slot & deserializer.ready_down
            nothing_to_emit = ~on_final_slot
            enabled = deserializer.valid_up & (downstream_takes_output
                                               | nothing_to_emit)

            # ready: can accept an input whenever upstream offers one. Same
            # condition as enabled, minus the dependence on valid_up.
            ready_while_emitting = on_final_slot & deserializer.ready_down
            ready = ready_while_emitting | ~on_final_slot

            # valid: the output is meaningful only on the final element and
            # only if upstream is supplying that element right now
            valid = on_final_slot & deserializer.valid_up

            if has_ce:
                # clock enable gates all three handshake signals
                enabled, ready, valid = [
                    signal & bit(deserializer.CE)
                    for signal in (enabled, ready, valid)
                ]

            if has_reset:
                wire(deserializer.RESET, slot_counter.RESET)

            # only n - 1 stores are needed because the nth input bypasses
            # storage and goes directly to the output
            store_count = n - 1

            if time_per_element > 1:
                # an element spread over several clocks needs a RAM per slot so
                # each clock's piece can be written and later read back
                value_store = DefineNativeMapParallel(
                    store_count, DefineRAMAnyType(T, time_per_element))()
                store_inputs = value_store.WDATA
                store_outputs = value_store.RDATA
                store_enables = value_store.WE

                clock_in_element = SizedCounterModM(time_per_element,
                                                    has_ce=True,
                                                    has_reset=has_reset)
                element_finished = Decode(time_per_element - 1,
                                          clock_in_element.O.N)(
                                              clock_in_element.O)

                wire(clock_in_element.CE, enabled)
                # move on to the next slot only once the current element's
                # last clock has been consumed
                wire(slot_counter.CE, enabled & element_finished)
                for store_idx in range(store_count):
                    # the position inside the current element is both the
                    # write and the read address of every RAM
                    wire(clock_in_element.O, value_store.WADDR[store_idx])
                    wire(clock_in_element.O, value_store.RADDR[store_idx])

                if has_reset:
                    wire(clock_in_element.RESET, deserializer.RESET)

            else:
                # single-clock elements fit in one register per slot
                value_store = DefineNativeMapParallel(
                    store_count, DefineRegisterAnyType(T, has_ce=True))()
                store_inputs = value_store.I
                store_outputs = value_store.O
                store_enables = value_store.CE
                wire(slot_counter.CE, enabled)

            for slot in range(store_count):
                # broadcast the input to every store; the per-slot enable
                # below decides which one actually captures it
                wire(deserializer.I, store_inputs[slot])
                slot_is_current = Decode(slot, slot_counter.O.N)(
                    slot_counter.O)
                wire(enabled & slot_is_current, store_enables[slot])
                wire(store_outputs[slot], deserializer.O[slot])

            # the final element bypasses the store entirely
            wire(deserializer.I, deserializer.O[n - 1])

            wire(valid, deserializer.valid_down)
            wire(ready, deserializer.ready_up)
예제 #15
0
        def definition(cls):
            """Wire up the shift registers, window outputs and valid signal.

            Builds ``pixel_per_clock`` parallel SIPO shift registers of depth
            ``image_size // pixel_per_clock``, connects taps of those
            registers to ``cls.O[window][index_in_window]`` for each of the
            ``cls.windows_per_active_clock`` windows, optionally exposes the
            last tap of every register on ``cls.next_row``, terminates every
            tap that is left unused, and drives ``cls.valid`` from a counter
            that saturates once enough inputs have been shifted in (combined
            with a stride counter when ``stride > pixel_per_clock``).
            """

            shift_register = MapParallel(
                pixel_per_clock,
                SIPOAnyType(image_size // pixel_per_clock,
                            pixel_type,
                            0,
                            has_ce=True))

            # reverse the pixels per clock. Since greater index_in_shift_register
            # mean earlier inputted pixels, also want greater current_shift_register
            # to mean earlier inputted pixels. This accomplishes that by making
            # pixels earlier each clock go to higher number shift register
            if first_row:
                wire(cls.I[::-1], shift_register.I)
            else:
                # don't need to reverse if not first row as prior rows have already done reversing
                wire(cls.I, shift_register.I)

            # every shift register shares the circuit's clock enable
            for i in range(pixel_per_clock):
                wire(cls.CE, shift_register.CE[i])

            # these two variables provide a 2D coordinate system for the SIPOs.
            # the inner dimension is current_shift_register
            # the outer dimension is index_in_shift_register
            # greater values in current_shift_register are inputs from older clocks
            # greater values in index_in_shift_register are inputs from lower index
            # values in the inputs in a single clock (due to above cls.I, type_to_bits reversing)
            # the index_in_shift_register is reversed so that bigger number always
            # means lower indexed value in the input image. For example, if asking
            # for location 0 with a 2 px per clock, 3 window width, then the
            # 2D location is index_in_shift_register = 1, current_shift_register = 1
            # and walking back in 2D space as increasing 1D location.
            # NOTE: both variables are shared, mutable state: the two helper
            # closures below read and rebind them via ``nonlocal``.
            current_shift_register = 0
            index_in_shift_register = 0

            # since current_shift_register and index_in_shift_register form a
            # 2D shape where current_shift_registers is inner dimension and
            # index_in_shift_register is outer, get_shift_register_location_in_1D_coordinates
            # and set_shift_register_location_using_1D_coordinates  convert between
            # 2D coordinates in the SIPOs and 1D coordinates in the 1D image

            # To do the reversing of 1D coordinates, need to find the oldest pixel that should be output,
            # ignoring origin as origin doesn't impact this computation.
            # This is done by finding the number of relevant pixels for outputting and adjusting it
            # so that it aligns with the number of pixels per clock cycle.
            # That coordinates position is treated as a 0 in the reverse coordinates
            # and requested coordinates (going in the opposite direction) are reversed
            # and adjusted to fit the new coordinate system by subtracting their values
            # from the 0's value in the original, forward coordinate system.

            # need to be able to handle situations with swizzling. Swizzling is
            # where a pixel inputted this clock is not used until next clock.
            # This is handled by wiring up in reverse order. If a pixel is inputted
            # in a clock but not used, it will have a high 1D location as it will be
            # one of the first registers in the first index_in_shift_register.
            # The swizzled pixel's large 1D location ensures it isn't wired directly
            # to an output

            # get needed pixels (ignoring origin as that can be garbage)
            # to determine number of clock cycles needed to satisfy input
            if cls.windows_per_active_clock == 1:
                needed_pixels = window_width
            else:
                needed_pixels = window_width + stride * (
                    cls.windows_per_active_clock - 1)

            # get the maximum 1D coordinate when aligning needed pixels to the number
            # of pixels per clock, i.e. round needed_pixels up to a multiple of
            # pixel_per_clock
            if needed_pixels % pixel_per_clock == 0:
                oldest_needed_pixel_forward_1D_coordinates = needed_pixels
            else:
                oldest_needed_pixel_forward_1D_coordinates = ceil(needed_pixels / pixel_per_clock) * \
                                                             pixel_per_clock

            # adjust by 1 for 0 indexing
            oldest_needed_pixel_forward_1D_coordinates -= 1

            def get_shift_register_location_in_1D_coordinates() -> int:
                """Return the 1D location of the current 2D shift-register position."""
                return oldest_needed_pixel_forward_1D_coordinates - \
                       (index_in_shift_register * pixel_per_clock +
                        current_shift_register)

            def set_shift_register_location_using_1D_coordinates(
                    location: int) -> None:
                """Move the shared 2D position to the given 1D location.

                Updates ``index_in_shift_register`` and
                ``current_shift_register`` in the enclosing scope; returns
                nothing.
                """
                nonlocal current_shift_register, index_in_shift_register
                location_reversed_indexing = oldest_needed_pixel_forward_1D_coordinates - location
                index_in_shift_register = location_reversed_indexing // pixel_per_clock
                current_shift_register = location_reversed_indexing % pixel_per_clock

            # (index_in_shift_register, current_shift_register) pairs that have
            # been wired to an output; everything else gets terminated below
            used_coordinates = set()

            for current_window_index in range(cls.windows_per_active_clock):
                # stride is handled by wiring if there are multiple windows emitted per clock,
                # aka if stride is less than number of pixels per clock.
                # In this case, multiple windows are emitted but they must be overlapped
                # less than normal
                strideMultiplier = stride if stride < pixel_per_clock else 1
                set_shift_register_location_using_1D_coordinates(
                    strideMultiplier * current_window_index +
                    # handle origin across multiple clocks by changing valid, but within a single clock
                    # need to adjust where the windows start
                    # need neg conversion twice due to issues taking mod of negative number
                    ((origin * -1) % pixel_per_clock * -1))
                for index_in_window in range(window_width):
                    wire(
                        shift_register.O[current_shift_register]
                        [index_in_shift_register],
                        cls.O[current_window_index][index_in_window])

                    used_coordinates.add(
                        (index_in_shift_register, current_shift_register))

                    # step the shared position forward by one 1D location
                    set_shift_register_location_using_1D_coordinates(
                        get_shift_register_location_in_1D_coordinates() + 1)

            # if not last row, have output ports for ends of all shift_registers so next
            # 1D can accept them
            if not last_row:
                index_in_shift_register = image_size // pixel_per_clock - 1
                # this loop rebinds the shared current_shift_register variable;
                # the helper closures are not called again after this point
                for current_shift_register in range(pixel_per_clock):
                    wire(
                        shift_register.O[current_shift_register]
                        [index_in_shift_register],
                        cls.next_row[current_shift_register])
                    used_coordinates.add(
                        (index_in_shift_register, current_shift_register))

            # wire up all non-used coordinates to terms
            for sr in range(pixel_per_clock):
                for sr_index in range(image_size // pixel_per_clock):
                    if (sr_index, sr) in used_coordinates:
                        continue
                    term = TermAnyType(pixel_type)
                    wire(shift_register.O[sr][sr_index], term.I)

            # valid when the maximum coordinate used (minus origin, as origin can in
            # invalid space when emitting) gets data
            # add 1 here as coordinates are 0 indexed, and the denominator of this
            # fraction is the last register accessed
            # would add 1 outside fraction as it takes 1 clock for data
            # to get through registers but won't as 0 indexed
            valid_counter_max_value = ceil(
                (oldest_needed_pixel_forward_1D_coordinates + 1 + origin) /
                pixel_per_clock)

            # add 1 as sizedcounter counts to 1 less than the provided max
            valid_counter = SizedCounterModM(valid_counter_max_value + 1,
                                             has_ce=True)

            valid_counter_max_instance = DefineCoreirConst(
                len(valid_counter.O), valid_counter_max_value)()

            # the counter only advances while CE is high and it is still below
            # its maximum, so it stops at valid_counter_max_value
            wire(
                enable(
                    bit(cls.CE)
                    & (valid_counter.O < valid_counter_max_instance.O)),
                valid_counter.CE)

            # if stride is greater than pixels_per_clock, then need a stride counter as
            # not active every clock. Invalid clocks create striding in this case
            if stride > pixel_per_clock:

                stride_counter = SizedCounterModM(stride // pixel_per_clock,
                                                  has_ce=True)
                stride_counter_0 = DefineCoreirConst(len(stride_counter.O),
                                                     0)()

                # valid only on clocks where the stride counter is at 0 and
                # the valid counter has reached its maximum
                wire(
                    enable((stride_counter.O == stride_counter_0.O) &
                           (valid_counter.O == valid_counter_max_instance.O)),
                    cls.valid)

                # only increment stride if trying to emit data this clock cycle
                # NOTE(review): this enable does not include cls.CE - confirm
                # that is intended
                wire(valid_counter.O == valid_counter_max_instance.O,
                     stride_counter.CE)

            else:
                # no stride counter needed: valid as soon as the valid counter
                # has reached its maximum
                wire((valid_counter.O == valid_counter_max_instance.O),
                     cls.valid)
예제 #16
0
        def definition(STBankGenerator):
            """Wire up the per-lane bank and address generators.

            A counter holds the flat index of lane 0 and advances once per
            element. Lane ``i``'s flat index is lane 0's plus ``i * ni``. For
            every lane the bank output is
            ``(flat_idx + flat_idx / lcm(no, ni)) % no`` and the address
            output is a flat index divided by ``no``; both are truncated to
            the widths of the ``bank`` / ``addr`` ports, with any excess high
            bits terminated.
            """
            flat_idx_width = getRAMAddrWidth(no * ni)

            # ``advance_lane`` is the single bit that steps the lane-0 flat
            # index counter: high once per element.
            if time_per_element > 1:
                # elements span several clocks: count the clocks inside the
                # current element and advance on the last one
                index_in_cur_element = SizedCounterModM(time_per_element,
                                                        has_ce=has_ce,
                                                        has_reset=has_reset)
                # calling the Decode instance wires its input and yields its
                # single output bit, so no ``.O[0]`` indexing is applied here
                advance_lane = Decode(time_per_element - 1,
                                      index_in_cur_element.O.N)(
                                          index_in_cur_element.O)
                # the in-element counter only exists in this branch, so its
                # CE / RESET must be wired here and nowhere else
                if has_ce:
                    wire(STBankGenerator.CE, index_in_cur_element.CE)
                if has_reset:
                    wire(STBankGenerator.RESET, index_in_cur_element.RESET)
            elif has_ce:
                # one clock per element: every enabled clock is a new element
                advance_lane = STBankGenerator.CE
            else:
                # one clock per element and no CE: advance every clock
                advance_lane = DefineCoreirConst(1, 1)().O[0]

            # each element of the SSeq is a separate vector lane
            first_lane_flat_idx = DefineCounterModM(ni + ii,
                                                    flat_idx_width,
                                                    cout=False,
                                                    has_ce=True,
                                                    has_reset=has_reset)()
            wire(advance_lane, first_lane_flat_idx.CE)
            if has_reset:
                wire(STBankGenerator.RESET, first_lane_flat_idx.RESET)

            lane_flat_idxs = [first_lane_flat_idx.O]

            # compute the current flat_idx for each lane
            for i in range(1, no):
                cur_lane_flat_idx_adder = DefineAdd(flat_idx_width)()
                wire(cur_lane_flat_idx_adder.I0, first_lane_flat_idx.O)
                wire(cur_lane_flat_idx_adder.I1,
                     DefineCoreirConst(flat_idx_width, i * ni)().O)

                lane_flat_idxs.append(cur_lane_flat_idx_adder.O)

            lane_flat_div_lcms = []
            lcm_dim = DefineCoreirConst(flat_idx_width, lcm(no, ni))()
            # compute flat_idx / lcm_dim for each lane
            for i in range(no):
                cur_lane_lcm_div = DefineUDiv(flat_idx_width)()
                wire(cur_lane_lcm_div.I0, lane_flat_idxs[i])
                wire(cur_lane_lcm_div.I1, lcm_dim.O)

                lane_flat_div_lcms.append(cur_lane_lcm_div.O)

            # compute ((flat_idx % sseq_dim) + (flat_idx / lcm_dim)) % sseq_dim for each lane
            # only need to mod sseq_dim at end as that is same as also doing it flat_idx before addition
            for i in range(no):
                pre_mod_add = DefineAdd(flat_idx_width)()
                wire(pre_mod_add.I0, lane_flat_idxs[i])
                wire(pre_mod_add.I1, lane_flat_div_lcms[i])

                bank_mod = DefineUMod(flat_idx_width)()
                wire(bank_mod.I0, pre_mod_add.O)
                wire(bank_mod.I1, DefineCoreirConst(flat_idx_width, no)().O)

                wire(STBankGenerator.bank[i],
                     bank_mod.O[0:STBankGenerator.bank_width])
                # terminate the high bits that do not fit in the bank port
                if len(bank_mod.O) > STBankGenerator.bank_width:
                    bits_to_term = len(bank_mod.O) - STBankGenerator.bank_width
                    term = TermAnyType(Array[bits_to_term, Bit])
                    wire(bank_mod.O[STBankGenerator.bank_width:], term.I)

            # compute flat_idx / sseq_dim for each lane addr
            for i in range(no):
                flat_idx_sseq_dim_div = DefineUDiv(flat_idx_width)()
                # NOTE(review): every lane's address is derived from lane 0's
                # flat index, not lane_flat_idxs[i] - confirm this is intended
                wire(flat_idx_sseq_dim_div.I0, lane_flat_idxs[0])
                wire(flat_idx_sseq_dim_div.I1,
                     DefineCoreirConst(flat_idx_width, no)().O)

                wire(STBankGenerator.addr[i],
                     flat_idx_sseq_dim_div.O[0:STBankGenerator.addr_width])
                # terminate the high bits that do not fit in the addr port;
                # size the term from this divider's own output width
                if len(flat_idx_sseq_dim_div.O) > STBankGenerator.addr_width:
                    bits_to_term = (len(flat_idx_sseq_dim_div.O) -
                                    STBankGenerator.addr_width)
                    term = TermAnyType(Array[bits_to_term, Bit])
                    wire(flat_idx_sseq_dim_div.O[STBankGenerator.addr_width:],
                         term.I)
예제 #17
0
        def definition(cls):
            # the counter of the current element of output sequence, when hits 0, load the next input to serialize
            element_idx_counter = SizedCounterModM(n + i_,
                                                   has_ce=True,
                                                   has_reset=has_reset)
            if element_idx_counter.O.N == math.ceil(math.log(n, 2)):
                element_idx_out = element_idx_counter.O
            else:
                used_bits_length = (math.ceil(math.log(n, 2)))
                unused_bits_length = element_idx_counter.O.N - used_bits_length
                element_idx_out = element_idx_counter.O[:used_bits_length]
                term = DefineTermAnyType(Array[unused_bits_length, Bit])()
                wire(element_idx_counter.O[used_bits_length:], term.I)
            is_first_element = Decode(0, element_idx_out.N)(element_idx_out)

            enabled = cls.valid_up
            valid_reg = DefineRegister(1)()
            wire(cls.valid_up, valid_reg.I[0])
            wire(valid_reg.O[0], cls.valid_down)

            # if each element takes multiple clocks, need a ram so can write all them and read them over multiple clocks
            if is_nested(T) and T.time() > 1:
                value_store = [
                    DefineRAMAnyType(T.magma_repr(), T.time())()
                    for _ in range(n - 1)
                ]
                value_store_input = [ram.WDATA for ram in value_store]
                value_store_output = [ram.RDATA for ram in value_store]

                time_per_element_counter = SizedCounterModM(
                    T.time(), has_ce=True, has_reset=has_reset)
                go_to_next_element = Decode(T.time() - 1,
                                            time_per_element_counter.O.N)(
                                                time_per_element_counter.O)

                wire(time_per_element_counter.CE, enabled)
                wire(element_idx_counter.CE, enabled & go_to_next_element)
                for input_idx in range(n - 1):
                    wire(value_store[input_idx].WE, is_first_element & enabled)
                    # location in current element is where to read and write.
                    # will write on first iteration through element, read on later iterations
                    wire(time_per_element_counter.O,
                         value_store[input_idx].WADDR)
                    wire(time_per_element_counter.O,
                         value_store[input_idx].RADDR)

                if has_reset:
                    wire(time_per_element_counter.RESET, cls.RESET)

            else:
                value_store = [
                    DefineRegisterAnyType(T.magma_repr(), has_ce=True)()
                    for _ in range(n - 1)
                ]
                value_store_input = [reg.I for reg in value_store]
                value_store_output = [reg.O for reg in value_store]

                wire(element_idx_counter.CE, enabled)
                for input_idx in range(n - 1):
                    wire(value_store[input_idx].CE, is_first_element & enabled)

            for i in range(n - 1):
                wire(cls.I[i + 1], value_store_input[i])

            # to serialize, go through all different rams/registers in value store
            # and select the output from the ith one, where i is current output element
            value_store_output_selector = DefineMuxAnyType(T.magma_repr(), n)()
            for i in range(n - 1):
                wire(value_store_output[i],
                     value_store_output_selector.data[i + 1])
            # just wiring this up to avoid any issues
            wire(value_store_output[0], value_store_output_selector.data[0])
            wire(element_idx_out, value_store_output_selector.sel)

            # on first element, send the input directly out. otherwise, use the register
            first_element_output_selector = DefineMuxAnyType(
                T.magma_repr(), 2)()
            wire(is_first_element, first_element_output_selector.sel[0])
            wire(value_store_output_selector.out,
                 first_element_output_selector.data[0])
            wire(cls.I[0], first_element_output_selector.data[1])
            out_reg = DefineRegisterAnyType(cls.st_out_t.magma_repr())()
            wire(first_element_output_selector.out, out_reg.I)
            wire(out_reg.O, cls.O)