def InputImageRAM(cirb, circuit, nextNodeInput, imgSrc, pxPerClock, parallelism=None):
    imgData = loadImage(imgSrc, pxPerClock)
    imgRAM = CoreirMem(cirb, imgData.numRows, imgData.bitsPerRow)
    # this counter ensures writing always goes to the correct address
    writeCounter = SizedCounterModM(imgData.numRows, has_ce=True)
    # these counters ensure that pxPerClock pixels are emitted for the number
    # of cycles that the parallelism requires
    if parallelism is not None:
        readEnableCounter = SizedCounterModM(int(pxPerClock / parallelism),
                                             has_ce=True, cout=True)
        term = Term(cirb, len(readEnableCounter.O))
    readCounter = SizedCounterModM(imgData.numRows, has_ce=True)
    wire(writeCounter.O, imgRAM.waddr)
    wire(circuit.input_wdata, imgRAM.wdata)
    wire(readCounter.O, imgRAM.raddr)
    connectArraysAndArraysofArrays(imgRAM.rdata, nextNodeInput, pxPerClock,
                                   imgData.bitsPerPixel)
    if parallelism is not None:
        wire(circuit.input_ren, readEnableCounter.CE)
        wire(readEnableCounter.COUT, readCounter.CE)
        wire(readEnableCounter.O, term.I)
    else:
        wire(circuit.input_ren, readCounter.CE)
    wire(circuit.input_wen, imgRAM.wen)
    wire(circuit.input_wen, writeCounter.CE)
    return imgRAM
def definition(fifo):
    pieces_of_elements = n * time_per_element
    if expand_max_by_one_clk:
        pieces_of_elements += 1
    read_counter = SizedCounterModM(pieces_of_elements, has_ce=True,
                                    has_reset=has_reset)
    write_counter = SizedCounterModM(pieces_of_elements, has_ce=True,
                                     has_reset=has_reset)
    # add 1 since a count of 0 means the FIFO is empty, so the counter must
    # also be able to reach pieces_of_elements when everything has been written
    num_stored_counter = CeilFloorUpDownCounter(pieces_of_elements + 1,
                                                has_ce=has_ce,
                                                has_reset=has_reset)
    # ready means can accept input.
    # Do this when num_stored_counter is not at its max value
    ready = ~Decode(pieces_of_elements, num_stored_counter.O.N)(
        num_stored_counter.O)
    # valid means can emit downstream.
    # Do this when num_stored_counter shows the FIFO is not empty
    valid = ~Decode(0, num_stored_counter.O.N)(num_stored_counter.O)
    # only assert these signals when CE is high or there is no CE
    if has_ce:
        ready = ready & bit(fifo.CE)
        valid = valid & bit(fifo.CE)
        wire(num_stored_counter.CE, fifo.CE)
    read_this_clk = valid & fifo.ready_down
    write_this_clk = ready & fifo.valid_up
    wire(read_counter.CE, read_this_clk)
    wire(write_counter.CE, write_this_clk)
    wire(num_stored_counter.U, write_this_clk)
    wire(num_stored_counter.D, read_this_clk)
    if has_reset:
        wire(fifo.RESET, read_counter.RESET)
        wire(fifo.RESET, write_counter.RESET)
        wire(fifo.RESET, num_stored_counter.RESET)
    value_store = DefineRAMAnyType(T, pieces_of_elements)()
    wire(value_store.WADDR, write_counter.O)
    wire(value_store.RADDR, read_counter.O)
    wire(value_store.WDATA, fifo.I)
    wire(value_store.RDATA, fifo.O)
    wire(value_store.WE, write_this_clk)
    wire(valid, fifo.valid_down)
    wire(ready, fifo.ready_up)
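# A minimal cycle-level software sketch of the handshake logic in the FIFO
# definition above (illustrative only; fifo_handshake_model and its argument
# names are not part of the hardware code). The length of storage plays the
# role of num_stored_counter: ready is "not full", valid is "not empty", and
# a read and a write can both fire on the same clock.
def fifo_handshake_model(capacity, inputs, downstream_readys):
    """inputs: iterable of (valid_up, data); yields (ready_up, valid_down, data_out)."""
    storage = []
    for (valid_up, data), ready_down in zip(inputs, downstream_readys):
        ready = len(storage) < capacity   # num_stored_counter not at max
        valid = len(storage) > 0          # num_stored_counter not at 0
        if ready and valid_up:            # write_this_clk
            storage.append(data)
        # pop(0) takes the oldest element, which existed before this clock's write
        data_out = storage.pop(0) if valid and ready_down else None
        yield ready, valid, data_out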
def definition(downsampleSequential):
    element_idx_counter = SizedCounterModM(n, has_ce=True, has_reset=has_reset)
    emit_cur_element = Decode(idx, element_idx_counter.O.N)(
        element_idx_counter.O)
    # enabled means run the circuit.
    # do this when upstream is valid, so there is something to downsample,
    # and when either the current element must be emitted and downstream is
    # ready, or the current element doesn't need to be emitted
    enabled = downsampleSequential.valid_up & \
        ((emit_cur_element & downsampleSequential.ready_down) |
         (~emit_cur_element))
    # ready means can accept input when getting valid from upstream.
    # ready when emitting the current element and downstream is ready,
    # or when the current element doesn't need to be emitted
    ready = (emit_cur_element & downsampleSequential.ready_down) | \
        (~emit_cur_element)
    # valid means can emit downstream.
    # valid when emitting the current element and upstream is providing
    # valid input for it
    valid = emit_cur_element & downsampleSequential.valid_up
    if has_ce:
        enabled = enabled & bit(downsampleSequential.CE)
        ready = ready & bit(downsampleSequential.CE)
        valid = valid & bit(downsampleSequential.CE)
    if time_per_element > 1:
        time_per_element_counter = SizedCounterModM(
            time_per_element, has_ce=True, has_reset=has_reset)
        go_to_next_element = Decode(time_per_element - 1,
                                    time_per_element_counter.O.N)(
            time_per_element_counter.O)
        wire(time_per_element_counter.CE, enabled)
        wire(element_idx_counter.CE, enabled & go_to_next_element)
        if has_reset:
            wire(time_per_element_counter.RESET, downsampleSequential.RESET)
            wire(element_idx_counter.RESET, downsampleSequential.RESET)
    else:
        wire(element_idx_counter.CE, enabled)
        if has_reset:
            wire(element_idx_counter.RESET, downsampleSequential.RESET)
    wire(downsampleSequential.I, downsampleSequential.O)
    wire(valid, downsampleSequential.valid_down)
    wire(ready, downsampleSequential.ready_up)
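# A plain-Python sketch of what the downsample handshake above computes when
# both sides are always ready/valid (illustrative; downsample_model is not
# part of the hardware code): pass through element `idx` of every length-n
# input sequence.
def downsample_model(n, idx, stream):
    for t, x in enumerate(stream):
        if t % n == idx:
            yield x

# e.g. list(downsample_model(3, 1, range(9))) == [1, 4, 7]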
def OutputImageRAM(cirb, circuit, prevNodeOutput, writeValidSignal, imgSrc, pxPerClock):
    imgData = loadImage(imgSrc, pxPerClock)
    imgRAM = CoreirMem(cirb, imgData.numRows, imgData.bitsPerRow)
    # this counter ensures writing to the correct address
    writeCounter = SizedCounterModM(imgData.numRows, has_ce=True)
    # this counter ensures reading from the right address
    readCounter = SizedCounterModM(imgData.numRows, has_ce=True)
    wire(writeCounter.O, imgRAM.waddr)
    connectArraysAndArraysofArrays(imgRAM.wdata, prevNodeOutput, pxPerClock,
                                   imgData.bitsPerPixel)
    wire(readCounter.O, imgRAM.raddr)
    wire(imgRAM.rdata, circuit.output_rdata)
    wire(circuit.output_ren, readCounter.CE)
    wire(writeValidSignal, imgRAM.wen)
    wire(writeValidSignal, writeCounter.CE)
    return imgRAM
def definition(cls):
    valid_counter = SizedCounterModM(num_clocks_delay + 1, has_ce=True,
                                     has_reset=has_reset)
    delay_const = DefineCoreirConst(len(valid_counter.O), num_clocks_delay)()
    if has_ce:
        wire(enable(bit(cls.CE) & (valid_counter.O < delay_const.O)),
             valid_counter.CE)
    else:
        wire(valid_counter.O < delay_const.O, valid_counter.CE)
    wire(valid_counter.O == delay_const.O, cls.valid)
    if has_reset:
        wire(cls.RESET, valid_counter.RESET)
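# Illustrative software model of the delayed-valid logic above (not part of
# the hardware code): the counter saturates at num_clocks_delay because its CE
# is gated on count < delay, and valid is the combinational equality check, so
# valid goes high after num_clocks_delay enabled clocks and stays high.
def delayed_valid_model(num_clocks_delay, clock_enables):
    count = 0
    for ce in clock_enables:
        yield count == num_clocks_delay  # valid_counter.O == delay_const.O
        if ce and count < num_clocks_delay:
            count += 1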
def test_sized_counter_modm():
    args = ['O', Out(Array[2, Bit])] + ClockInterface(False, False)
    testcircuit = DefineCircuit('sized_counter_modm_test', *args)
    counter = SizedCounterModM(3, has_ce=True)
    wire(1, counter.CE)
    wire(testcircuit.O, counter.O)
    EndCircuit()
    #save_CoreIR_json(cirb, testcircuit, "sized_counter_modm.json")
    sim = CoreIRSimulator(testcircuit, testcircuit.CLK,
                          namespaces=["aetherlinglib", "commonlib", "mantle",
                                      "coreir", "global"])
def definition(partition):
    dehydrate = MapParallel(arrayType.N, Dehydrate(partition.elementType))
    # each mux emits 1 element of the subset that is emitted every clock.
    # each mux needs to handle arrayType.N / subsetSize inputs, so it can
    # output one element every clock
    muxes = MapParallel(subsetSize,
                        CommonlibMuxN(int(arrayType.N / subsetSize),
                                      len(dehydrate.out[0])))
    hydrate = MapParallel(subsetSize, Hydrate(partition.elementType))
    counter = SizedCounterModM(int(arrayType.N / subsetSize), has_ce=has_ce)
    wire(partition.I, dehydrate.I)
    for i in range(subsetSize):
        # to the first mux wire 0, subsetSize, 2*subsetSize, ...
        # so that each clock it emits the first element of the next subset.
        # repeat for each mux so the ith mux outputs the ith element of the
        # subset each clock
        wire(dehydrate.out[i::subsetSize], muxes.I[i].data)
        wire(counter.O, muxes.I[i].sel)
    wire(muxes.out, hydrate.I)
    wire(hydrate.out, partition.O)
    if has_ce:
        wire(partition.CE, counter.CE)
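# Illustrative example of the strided slice wiring above (not part of the
# hardware code): with arrayType.N == 6 and subsetSize == 2, mux 0 sees
# elements [0, 2, 4] and mux 1 sees [1, 3, 5], so as the counter steps
# through 0, 1, 2 the partition emits the subsets (0, 1), (2, 3), (4, 5).
def partition_slicing_example():
    elements, subset_size = list(range(6)), 2
    # returns [[0, 2, 4], [1, 3, 5]]
    return [elements[i::subset_size] for i in range(subset_size)]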
def test_multiple_sipo_and_counter():
    args = ['I', In(Bit), 'O_sipo', Out(Array[4, Bit])] + \
           ['O_counter', Out(Array[2, Bit])] + ClockInterface(False, False)
    testcircuit = DefineCircuit('multiple_sipo_and_counter_test', *args)
    map_sipo = MapParallel(1, SIPO(4, 0, has_ce=True))
    wire(1, map_sipo.CE[0])
    wire(testcircuit.I, map_sipo.I[0])
    wire(testcircuit.O_sipo, map_sipo.O[0])
    counter = SizedCounterModM(3, has_ce=True)
    wire(1, counter.CE)
    wire(testcircuit.O_counter, counter.O)
    EndCircuit()
    #save_CoreIR_json(cirb, testcircuit, "multiple_sipo_and_counter.json")
    sim = CoreIRSimulator(testcircuit, testcircuit.CLK,
                          namespaces=["aetherlinglib", "commonlib", "mantle",
                                      "coreir", "global"])
def definition(cls):
    lb = AnyDimensionalLineBuffer(
        pixel_type,
        [rows_of_pixels_per_clock, pixels_per_row_per_clock],
        [window_rows, window_cols],
        [image_rows, image_cols],
        [stride_rows, stride_cols],
        [origin_rows, origin_cols])
    wire(cls.I, lb.I)
    if stride_rows <= rows_of_pixels_per_clock:
        for row_of_windows in range(cls.rows_of_windows_per_clock):
            for window_per_row in range(cls.windows_per_row_per_clock):
                wire(cls.O[row_of_windows * cls.windows_per_row_per_clock +
                           window_per_row],
                     lb.O[row_of_windows][window_per_row])
        wire(cls.valid, lb.valid)
    else:
        if add_debug_interface:
            for row_of_windows in range(cls.rows_of_windows_per_clock):
                for window_per_row in range(cls.windows_per_row_per_clock):
                    wire(cls.undelayedO[row_of_windows *
                                        cls.windows_per_row_per_clock +
                                        window_per_row],
                         lb.O[row_of_windows][window_per_row])
        db = DelayedBuffer(Array[window_rows, Array[window_cols, pixel_type]],
                           image_cols // stride_cols,
                           max(pixels_per_row_per_clock // stride_cols, 1),
                           cls.time_per_buffered_cycle,
                           add_debug_interface=add_debug_interface)
        for row_of_windows in range(cls.rows_of_windows_per_clock):
            for window_per_row in range(cls.windows_per_row_per_clock):
                wire(db.I[row_of_windows * cls.windows_per_row_per_clock +
                          window_per_row],
                     lb.O[row_of_windows][window_per_row])
        wire(lb.valid, db.WE)
        wire(db.valid, cls.valid)
        wire(db.O, cls.O)
        # the first time lb is valid, the delayed buffer becomes
        # valid permanently
        first_valid_counter = SizedCounterModM(2, has_ce=True)
        zero_const = DefineCoreirConst(1, 0)()
        wire(lb.valid & (zero_const.O == first_valid_counter.O),
             first_valid_counter.CE)
        # delay the CE of the delayed buffer as the LB output will hit the
        # DB one clock later, so give the DB that delayed CE.
        # this ensures that when using CE for a ready-valid chain, don't have to wait until
        delayed_ce_for_db_valid = DefineRegisterAnyType(Bit)()
        wire(bit(cls.CE), delayed_ce_for_db_valid.I)
        #ce_or_last_valid = bit(cls.CE) | (lb.valid & ~last_clock_lb_valid.O)
        # need lb.valid or the counter as lb.valid will be 1 on the first
        # clock where valid while the counter will still be 0
        wire((lb.valid | first_valid_counter.O[0]) &
             delayed_ce_for_db_valid.O, db.CE)
        if add_debug_interface:
            wire((lb.valid | first_valid_counter.O[0]) &
                 delayed_ce_for_db_valid.O, cls.dbCE)
            wire(lb.valid, cls.dbWE)
            wire(db.WDATA, cls.WDATA)
            wire(db.RDATA, cls.RDATA)
            wire(db.WADDR, cls.WADDR)
            wire(db.RADDR, cls.RADDR)
            wire(db.RAMWE, cls.RAMWE)
    wire(cls.CE, lb.CE)
    wire(cls.ready, 1)
def definition(cls):
    rams = DefineNativeMapParallel(k, DefineRAMAnyType(t, n // k))()
    # each clock WE is set, write to the RAMs and increment the address
    writing_location_per_bank = SizedCounterModM(n // k, has_ce=True)
    wire(cls.I, rams.WDATA)
    ramEnableWire = cls.WE & bit(cls.CE)
    if add_debug_interface:
        wire(cls.I, cls.WDATA)
        wire(ramEnableWire, cls.RAMWE)
    for i in range(k):
        wire(writing_location_per_bank.O, rams.WADDR[i])
        if add_debug_interface:
            wire(writing_location_per_bank.O, cls.WADDR[i])
        wire(ramEnableWire, rams.WE[i])
    wire(cls.WE & bit(cls.CE), writing_location_per_bank.CE)
    if initial_emitting_delay > 0:
        initial_delay_counter = InitialDelayCounter(initial_emitting_delay)
        ce_with_delay = bit(cls.CE) & initial_delay_counter.valid
    else:
        ce_with_delay = bit(cls.CE)
    # the bank ram counter tracks which group of entries in all the banked
    # rams RADDR should be set to
    ticks_per_element = total_emitting_period // (n // cls.out_per_clock)
    ticks_per_row_of_elements = ticks_per_element * (k // cls.out_per_clock)
    # this completes a cycle every time current_element_per_banked_ram_counter
    # increments by 1
    if n // k == 1:
        current_element_per_banked_ram_counter = DefineCoreirConst(1, 0)()
    elif ticks_per_row_of_elements == 1:
        current_element_per_banked_ram_counter = SizedCounterModM(
            n // k, has_ce=True)
        wire(ce_with_delay, current_element_per_banked_ram_counter.CE)
    else:
        bank_ram_tick_counter = SizedCounterModM(
            ticks_per_row_of_elements, has_ce=True)
        ticks_per_row_of_elements_const = DefineCoreirConst(
            len(bank_ram_tick_counter.O), ticks_per_row_of_elements - 1)()
        current_element_per_banked_ram_counter = SizedCounterModM(
            n // k, has_ce=True)
        wire(ce_with_delay, bank_ram_tick_counter.CE)
        wire(ce_with_delay &
             (bank_ram_tick_counter.O == ticks_per_row_of_elements_const.O),
             current_element_per_banked_ram_counter.CE)
    for i in range(k):
        wire(current_element_per_banked_ram_counter.O, rams.RADDR[i])
        if add_debug_interface:
            wire(current_element_per_banked_ram_counter.O, cls.RADDR[i])
    # the mux bank selector counter tracks which of the banks to read from
    # right now. divide the number of ticks per row by the number of mux
    # outputs per row (k // cls.out_per_clock) to get ticks per mux output
    outputs_per_row = k // cls.out_per_clock
    ticks_per_mux_output = ticks_per_row_of_elements // outputs_per_row
    if ticks_per_mux_output == 1:
        ticks_per_mux_counter = DefineCoreirConst(1, 0)()
    else:
        ticks_per_mux_counter = SizedCounterModM(ticks_per_mux_output,
                                                 has_ce=True)
        wire(ce_with_delay, ticks_per_mux_counter.CE)
    # this counter completes a cycle once for every mux output
    if outputs_per_row == 1:
        mux_bank_selector_counter = DefineCoreirConst(1, 0)()
    elif ticks_per_mux_output == 1:
        mux_bank_selector_counter = SizedCounterModM(outputs_per_row,
                                                     has_ce=True)
        wire(ce_with_delay, mux_bank_selector_counter.CE)
    else:
        ticks_per_mux_output_const = DefineCoreirConst(
            len(ticks_per_mux_counter.O), ticks_per_mux_output - 1)()
        mux_bank_selector_counter = SizedCounterModM(outputs_per_row,
                                                     has_ce=True)
        wire(ce_with_delay &
             (ticks_per_mux_counter.O == ticks_per_mux_output_const.O),
             mux_bank_selector_counter.CE)
    ram_bank_selector = MuxAnyType(Array[cls.out_per_clock, t],
                                   k // cls.out_per_clock)
    for i in range(k):
        wire(rams.RDATA[i],
             ram_bank_selector.data[i // cls.out_per_clock][i % cls.out_per_clock])
        if add_debug_interface:
            wire(rams.RDATA[i], cls.RDATA[i])
    wire(mux_bank_selector_counter.O, ram_bank_selector.sel)
    # if not delaying, remove the latency of the RAMs by emitting the first
    # input on the first clock immediately
    if initial_emitting_delay == 0:
        first_input_or_rams = MuxAnyType(Array[cls.out_per_clock, t], 2)
        wire(cls.I[0:cls.out_per_clock], first_input_or_rams.data[1])
        wire(ram_bank_selector.out, first_input_or_rams.data[0])
        # emit input directly only on the first clock.
        # a counter that tracks the current clock in the total emitting period
        point_in_emitting_period = SizedCounterModM(total_emitting_period,
                                                    has_ce=True)
        zero_const = DefineCoreirConst(len(point_in_emitting_period.O), 0)()
        # don't delay this if delaying output, as then don't want the input,
        # want whatever the ram says
        wire(cls.CE, point_in_emitting_period.CE)
        wire(point_in_emitting_period.O == zero_const.O,
             first_input_or_rams.sel[0])
        wire(first_input_or_rams.out, cls.O)
    else:
        wire(ram_bank_selector.out, cls.O)
    # valid on the first enabled clock where on a new output of the mux
    zero_const = DefineCoreirConst(len(ticks_per_mux_counter.O), 0)()
    wire(bit(ce_with_delay) & (ticks_per_mux_counter.O == zero_const.O),
         cls.valid)
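# Worked example of the scheduling arithmetic above with illustrative numbers
# (these parameter values are assumptions, not taken from the original code):
# n = 8 elements banked across k = 4 RAMs, out_per_clock = 2, and a
# total_emitting_period of 8 clocks.
def delayed_buffer_schedule_example():
    n, k, out_per_clock, total_emitting_period = 8, 4, 2, 8
    ticks_per_element = total_emitting_period // (n // out_per_clock)      # 2
    ticks_per_row_of_elements = ticks_per_element * (k // out_per_clock)   # 4
    outputs_per_row = k // out_per_clock                                   # 2
    ticks_per_mux_output = ticks_per_row_of_elements // outputs_per_row    # 2
    # so each RADDR value is held for 4 ticks while the mux switches banks every 2
    return ticks_per_element, ticks_per_row_of_elements, ticks_per_mux_output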
def definition(TSBankGenerator):
    flat_idx_width = getRAMAddrWidth(no * ni)
    # next element every time_per_element clocks
    if time_per_element > 1:
        index_in_cur_element = SizedCounterModM(time_per_element,
                                                has_ce=has_ce,
                                                has_reset=has_reset)
        next_element = Decode(time_per_element - 1,
                              index_in_cur_element.O.N)(
            index_in_cur_element.O)
    else:
        next_element = DefineCoreirConst(1, 1)()
    # each element of the SSeq is a separate vector lane
    first_lane_flat_idx = SizedCounterModM((no + io) * ni, incr=ni,
                                           has_ce=True, has_reset=has_reset)
    time_counter = SizedCounterModM(no + io, has_ce=True, has_reset=has_reset)
    wire(next_element.O, first_lane_flat_idx.CE)
    wire(next_element.O, time_counter.CE)
    if has_ce:
        wire(TSBankGenerator.CE, index_in_cur_element.CE)
    if has_reset:
        wire(TSBankGenerator.RESET, index_in_cur_element.RESET)
        wire(TSBankGenerator.RESET, first_lane_flat_idx.RESET)
        wire(TSBankGenerator.RESET, time_counter.RESET)
    lane_flat_idxs = [first_lane_flat_idx.O]
    # compute the current flat_idx for each lane
    for i in range(1, ni):
        cur_lane_flat_idx_adder = DefineAdd(flat_idx_width)()
        wire(cur_lane_flat_idx_adder.I0, first_lane_flat_idx.O)
        wire(cur_lane_flat_idx_adder.I1,
             DefineCoreirConst(flat_idx_width, i * no)().O)
        lane_flat_idxs += [cur_lane_flat_idx_adder.O]
    lane_flat_div_lcms = []
    # compute flat_idx / lcm_dim for each lane
    for i in range(ni):
        cur_lane_lcm_div = DefineUDiv(flat_idx_width)()
        wire(cur_lane_lcm_div.I0, lane_flat_idxs[i])
        wire(cur_lane_lcm_div.I1,
             DefineCoreirConst(flat_idx_width, lcm(no, ni))().O)
        lane_flat_div_lcms += [cur_lane_lcm_div.O]
    # compute ((flat_idx % sseq_dim) + (flat_idx / lcm_dim)) % sseq_dim for
    # each lane. note that s_ts == flat_idx % sseq_dim.
    # only need to mod by sseq_dim at the end as that is the same as also
    # modding flat_idx before the addition
    for i in range(ni):
        pre_mod_add = DefineAdd(flat_idx_width)()
        wire(pre_mod_add.I0, lane_flat_idxs[i])
        wire(pre_mod_add.I1, lane_flat_div_lcms[i])
        bank_mod = DefineUMod(flat_idx_width)()
        wire(bank_mod.I0, pre_mod_add.O)
        wire(bank_mod.I1, DefineCoreirConst(flat_idx_width, ni)().O)
        wire(TSBankGenerator.bank[i],
             bank_mod.O[0:TSBankGenerator.bank_width])
    # compute t for each lane's addr
    for i in range(0, ni):
        wire(TSBankGenerator.addr[i],
             time_counter.O[0:TSBankGenerator.addr_width])
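# Illustrative software model of the bank equation above (not part of the
# hardware code), assuming sseq_dim = ni and lcm_dim = lcm(no, ni) as the
# wiring suggests: bank = ((flat_idx % ni) + (flat_idx // lcm(no, ni))) % ni.
from math import gcd

def ts_bank_model(flat_idx, no, ni):
    lcm_dim = no * ni // gcd(no, ni)
    return ((flat_idx % ni) + (flat_idx // lcm_dim)) % ni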
def definition(upsampleSequential):
    # the counter of the current element of the output sequence; when it hits
    # 0, load the next input to upsample
    element_idx_counter = SizedCounterModM(n, has_ce=True, has_reset=has_reset)
    is_first_element = Decode(0, element_idx_counter.O.N)(
        element_idx_counter.O)
    if has_reset:
        wire(upsampleSequential.RESET, element_idx_counter.RESET)
    # enabled means run the circuit.
    # do this when downstream is ready, so there is somewhere to send output,
    # and when either in the first element with upstream valid, or there is
    # already data to repeat
    enabled = upsampleSequential.ready_down & \
        ((is_first_element & upsampleSequential.valid_up) |
         (~is_first_element))
    # ready means can accept input when getting valid from upstream.
    # do this when in the first element and downstream is ready to accept
    ready = is_first_element & upsampleSequential.ready_down
    # valid means can emit downstream.
    # valid when in the first element with upstream valid, or when repeating
    # old data
    valid = (is_first_element & upsampleSequential.valid_up) | \
        (~is_first_element)
    # only assert these signals when CE is high or there is no CE
    if has_ce:
        enabled = enabled & bit(upsampleSequential.CE)
        ready = ready & bit(upsampleSequential.CE)
        valid = valid & bit(upsampleSequential.CE)
    if time_per_element > 1:
        value_store = DefineRAMAnyType(T, time_per_element)()
        value_store_input = value_store.WDATA
        value_store_output = value_store.RDATA
        time_per_element_counter = SizedCounterModM(
            time_per_element, has_ce=True, has_reset=has_reset)
        go_to_next_element = Decode(time_per_element - 1,
                                    time_per_element_counter.O.N)(
            time_per_element_counter.O)
        wire(time_per_element_counter.CE, enabled)
        wire(element_idx_counter.CE, enabled & go_to_next_element)
        wire(value_store.WE, is_first_element & enabled)
        # the location in the current element is where to read and write.
        # will write on the first iteration through the element, read on
        # later iterations
        wire(time_per_element_counter.O, value_store.WADDR)
        wire(time_per_element_counter.O, value_store.RADDR)
        if has_reset:
            wire(time_per_element_counter.RESET, upsampleSequential.RESET)
    else:
        value_store = DefineRegisterAnyType(T, has_ce=True)()
        value_store_input = value_store.I
        value_store_output = value_store.O
        wire(element_idx_counter.CE, enabled)
        wire(value_store.CE, is_first_element & enabled)
    output_selector = DefineMuxAnyType(T, 2)()
    wire(upsampleSequential.I, value_store_input)
    # on the first element, send the input directly out. otherwise, use the
    # stored value
    wire(is_first_element, output_selector.sel[0])
    wire(value_store_output, output_selector.data[0])
    wire(upsampleSequential.I, output_selector.data[1])
    wire(output_selector.out, upsampleSequential.O)
    wire(valid, upsampleSequential.valid_down)
    wire(ready, upsampleSequential.ready_up)
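# Plain-Python sketch of the upsample behavior above (illustrative only; not
# part of the hardware code): each input element is emitted n times, with the
# value store supplying the repeats after the first emission.
def upsample_model(n, stream):
    for x in stream:
        for _ in range(n):
            yield x

# e.g. list(upsample_model(3, "ab")) == ['a', 'a', 'a', 'b', 'b', 'b']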
def definition(serializer):
    # the counter of the current element of the output sequence; when it hits
    # 0, load the next input to serialize
    element_idx_counter = SizedCounterModM(n, has_ce=True, has_reset=has_reset)
    is_first_element = Decode(0, element_idx_counter.O.N)(
        element_idx_counter.O)
    # enabled means run the circuit.
    # do this when downstream is ready, so there is somewhere to send output,
    # and when either in the first element with upstream valid, or there is
    # already serialized data to emit
    enabled = serializer.ready_down & \
        ((is_first_element & serializer.valid_up) | (~is_first_element))
    # ready means can accept input when getting valid from upstream.
    # do this when in the first element and downstream is ready to accept
    ready = is_first_element & serializer.ready_down
    # valid means can emit downstream.
    # valid when in the first element with upstream valid, or when there is
    # serialized data left to emit
    valid = (is_first_element & serializer.valid_up) | (~is_first_element)
    if has_ce:
        enabled = enabled & bit(serializer.CE)
        ready = ready & bit(serializer.CE)
        valid = valid & bit(serializer.CE)
    if has_reset:
        wire(serializer.RESET, element_idx_counter.RESET)
    # if each element takes multiple clocks, need a ram so can write all of
    # them and read them back over multiple clocks
    if time_per_element > 1:
        value_store = DefineNativeMapParallel(
            n, DefineRAMAnyType(T, time_per_element))()
        value_store_input = value_store.WDATA
        value_store_output = value_store.RDATA
        time_per_element_counter = SizedCounterModM(
            time_per_element, has_ce=True, has_reset=has_reset)
        go_to_next_element = Decode(time_per_element - 1,
                                    time_per_element_counter.O.N)(
            time_per_element_counter.O)
        wire(time_per_element_counter.CE, enabled)
        wire(element_idx_counter.CE, enabled & go_to_next_element)
        for input_idx in range(n):
            wire(value_store.WE[input_idx], is_first_element & enabled)
            # the location in the current element is where to read and write.
            # will write on the first iteration through the element, read on
            # later iterations
            wire(time_per_element_counter.O, value_store.WADDR[input_idx])
            wire(time_per_element_counter.O, value_store.RADDR[input_idx])
        if has_reset:
            wire(time_per_element_counter.RESET, serializer.RESET)
    else:
        value_store = DefineNativeMapParallel(
            n, DefineRegisterAnyType(T, has_ce=True))()
        value_store_input = value_store.I
        value_store_output = value_store.O
        wire(element_idx_counter.CE, enabled)
        for input_idx in range(n):
            wire(value_store.CE[input_idx], is_first_element & enabled)
    wire(serializer.I, value_store_input)
    # to serialize, go through all the different rams/registers in the value
    # store and select the output from the ith one, where i is the current
    # output element
    value_store_output_selector = DefineMuxAnyType(T, n)()
    wire(value_store_output, value_store_output_selector.data)
    wire(element_idx_counter.O, value_store_output_selector.sel)
    # on the first element, send the input directly out. otherwise, use the
    # stored value
    first_element_output_selector = DefineMuxAnyType(T, 2)()
    wire(is_first_element, first_element_output_selector.sel[0])
    wire(value_store_output_selector.out,
         first_element_output_selector.data[0])
    wire(serializer.I[0], first_element_output_selector.data[1])
    wire(first_element_output_selector.out, serializer.O)
    wire(valid, serializer.valid_down)
    wire(ready, serializer.ready_up)
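# Plain-Python sketch of serialization (illustrative only; handshake stalls
# are omitted): a length-n array arrives each input period and its elements
# leave one at a time, the first directly and the rest from the value store.
def serialize_model(arrays):
    for arr in arrays:
        for x in arr:
            yield x

# e.g. list(serialize_model([[1, 2, 3]])) == [1, 2, 3]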
def definition(deserializer):
    # the counter of the current element of the input sequence; when it hits
    # n-1, output the whole deserialized array
    element_idx_counter = SizedCounterModM(n, has_ce=True, has_reset=has_reset)
    is_last_element = Decode(n - 1, element_idx_counter.O.N)(
        element_idx_counter.O)
    # enabled means run the circuit.
    # do this when upstream is valid, so there is something to deserialize,
    # and when either the deserialized array must be emitted and downstream
    # is ready, or the array doesn't need to be emitted yet
    enabled = deserializer.valid_up & \
        ((is_last_element & deserializer.ready_down) | (~is_last_element))
    # ready means can accept input when getting valid from upstream.
    # ready when emitting the deserialized array and downstream is ready,
    # or when the array doesn't need to be emitted yet
    ready = (is_last_element & deserializer.ready_down) | (~is_last_element)
    # valid means can emit downstream.
    # valid when emitting the deserialized array and upstream is providing
    # valid input for the last element
    valid = is_last_element & deserializer.valid_up
    if has_ce:
        enabled = enabled & bit(deserializer.CE)
        ready = ready & bit(deserializer.CE)
        valid = valid & bit(deserializer.CE)
    if has_reset:
        wire(deserializer.RESET, element_idx_counter.RESET)
    # if each element takes multiple clocks, need a ram so can write all of
    # them and read them back over multiple clocks
    if time_per_element > 1:
        # only use n-1 value stores; just wire the nth input directly to the
        # output since the whole deserialized sequence is output during the
        # period the nth input is received
        value_store = DefineNativeMapParallel(
            n - 1, DefineRAMAnyType(T, time_per_element))()
        value_store_input = value_store.WDATA
        value_store_output = value_store.RDATA
        value_store_enables = value_store.WE
        time_per_element_counter = SizedCounterModM(
            time_per_element, has_ce=True, has_reset=has_reset)
        go_to_next_element = Decode(time_per_element - 1,
                                    time_per_element_counter.O.N)(
            time_per_element_counter.O)
        wire(time_per_element_counter.CE, enabled)
        wire(element_idx_counter.CE, enabled & go_to_next_element)
        for input_idx in range(n - 1):
            # the location in the current element is where to read and write.
            # will write on the first iteration through each element, read on
            # the last iteration from all elements
            wire(time_per_element_counter.O, value_store.WADDR[input_idx])
            wire(time_per_element_counter.O, value_store.RADDR[input_idx])
        if has_reset:
            wire(time_per_element_counter.RESET, deserializer.RESET)
    else:
        value_store = DefineNativeMapParallel(
            n - 1, DefineRegisterAnyType(T, has_ce=True))()
        value_store_input = value_store.I
        value_store_output = value_store.O
        value_store_enables = value_store.CE
        wire(element_idx_counter.CE, enabled)
    for element_idx in range(n - 1):
        # send input to all value stores; the enables ensure only the right
        # store each period reads in the value
        wire(deserializer.I, value_store_input[element_idx])
        # to deserialize, enable the ith ram/register in the value store for
        # the ith element of the input
        idx_match_cur_element = Decode(element_idx,
                                       element_idx_counter.O.N)(
            element_idx_counter.O)
        wire(enabled & idx_match_cur_element,
             value_store_enables[element_idx])
        wire(value_store_output[element_idx], deserializer.O[element_idx])
    # send the last input directly out
    wire(deserializer.I, deserializer.O[n - 1])
    wire(valid, deserializer.valid_down)
    wire(ready, deserializer.ready_up)
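# Plain-Python sketch of deserialization (illustrative only; handshake stalls
# are omitted): collect n sequential inputs and emit them together, the first
# n-1 from the value store and the last straight from the input.
def deserialize_model(n, stream):
    buf = []
    for x in stream:
        buf.append(x)
        if len(buf) == n:
            yield list(buf)
            buf.clear()

# e.g. list(deserialize_model(3, range(6))) == [[0, 1, 2], [3, 4, 5]]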
def definition(cls):
    shift_register = MapParallel(
        pixel_per_clock,
        SIPOAnyType(image_size // pixel_per_clock, pixel_type, 0, has_ce=True))
    # reverse the pixels per clock. Since a greater index_in_shift_register
    # means earlier inputted pixels, also want a greater current_shift_register
    # to mean earlier inputted pixels. This is accomplished by making pixels
    # earlier in each clock go to a higher numbered shift register
    if first_row:
        wire(cls.I[::-1], shift_register.I)
    else:
        # don't need to reverse if not the first row as prior rows have
        # already done the reversing
        wire(cls.I, shift_register.I)
    for i in range(pixel_per_clock):
        wire(cls.CE, shift_register.CE[i])
    # these two variables provide a 2D coordinate system for the SIPOs.
    # the inner dimension is current_shift_register,
    # the outer dimension is index_in_shift_register.
    # greater values in current_shift_register are inputs from older clocks.
    # greater values in index_in_shift_register are inputs from lower index
    # values in the inputs of a single clock (due to the cls.I reversing above).
    # the index_in_shift_register is reversed so that a bigger number always
    # means a lower indexed value in the input image. For example, if asking
    # for location 0 with 2 px per clock and a 3-wide window, then the
    # 2D location is index_in_shift_register = 1, current_shift_register = 1,
    # and walking back in 2D space is increasing the 1D location.
    current_shift_register = 0
    index_in_shift_register = 0
    # since current_shift_register and index_in_shift_register form a 2D shape
    # where current_shift_register is the inner dimension and
    # index_in_shift_register is the outer,
    # get_shift_register_location_in_1D_coordinates and
    # set_shift_register_location_using_1D_coordinates convert between 2D
    # coordinates in the SIPOs and 1D coordinates in the 1D image.
    # To do the reversing of 1D coordinates, need to find the oldest pixel that
    # should be output, ignoring origin as origin doesn't impact this
    # computation. This is done by finding the number of relevant pixels for
    # outputting and adjusting it so that it aligns with the number of pixels
    # per clock cycle. That coordinate's position is treated as a 0 in the
    # reversed coordinates, and requested coordinates (going in the opposite
    # direction) are reversed and adjusted to fit the new coordinate system by
    # subtracting their values from the 0's value in the original, forward
    # coordinate system.
    # need to be able to handle situations with swizzling. Swizzling is where
    # a pixel inputted this clock is not used until the next clock. This is
    # handled by wiring up in reverse order. If a pixel is inputted in a clock
    # but not used, it will have a high 1D location as it will be one of the
    # first registers in the first index_in_shift_register. The swizzled
    # pixel's large 1D location ensures it isn't wired directly to an output.
    # get needed pixels (ignoring origin as that can be garbage) to determine
    # the number of clock cycles needed to satisfy the input
    if cls.windows_per_active_clock == 1:
        needed_pixels = window_width
    else:
        needed_pixels = window_width + stride * \
            (cls.windows_per_active_clock - 1)
    # get the maximum 1D coordinate when aligning needed pixels to the number
    # of pixels per clock
    if needed_pixels % pixel_per_clock == 0:
        oldest_needed_pixel_forward_1D_coordinates = needed_pixels
    else:
        oldest_needed_pixel_forward_1D_coordinates = \
            ceil(needed_pixels / pixel_per_clock) * pixel_per_clock
    # adjust by 1 for 0 indexing
    oldest_needed_pixel_forward_1D_coordinates -= 1

    def get_shift_register_location_in_1D_coordinates() -> int:
        return oldest_needed_pixel_forward_1D_coordinates - \
               (index_in_shift_register * pixel_per_clock +
                current_shift_register)

    def set_shift_register_location_using_1D_coordinates(location: int) -> None:
        nonlocal current_shift_register, index_in_shift_register
        location_reversed_indexing = \
            oldest_needed_pixel_forward_1D_coordinates - location
        index_in_shift_register = \
            location_reversed_indexing // pixel_per_clock
        current_shift_register = location_reversed_indexing % pixel_per_clock

    used_coordinates = set()
    for current_window_index in range(cls.windows_per_active_clock):
        # stride is handled by wiring if there are multiple windows emitted
        # per clock, aka if stride is less than the number of pixels per
        # clock. In this case, multiple windows are emitted but they must be
        # overlapped less than normal
        strideMultiplier = stride if stride < pixel_per_clock else 1
        set_shift_register_location_using_1D_coordinates(
            strideMultiplier * current_window_index +
            # handle origin across multiple clocks by changing valid, but
            # within a single clock need to adjust where the windows start.
            # need the negation twice due to issues taking the mod of a
            # negative number
            ((origin * -1) % pixel_per_clock * -1))
        for index_in_window in range(window_width):
            wire(shift_register.O[current_shift_register]
                 [index_in_shift_register],
                 cls.O[current_window_index][index_in_window])
            used_coordinates.add(
                (index_in_shift_register, current_shift_register))
            set_shift_register_location_using_1D_coordinates(
                get_shift_register_location_in_1D_coordinates() + 1)
    # if not the last row, have output ports for the ends of all
    # shift_registers so the next 1D row can accept them
    if not last_row:
        index_in_shift_register = image_size // pixel_per_clock - 1
        for current_shift_register in range(pixel_per_clock):
            wire(shift_register.O[current_shift_register]
                 [index_in_shift_register],
                 cls.next_row[current_shift_register])
            used_coordinates.add(
                (index_in_shift_register, current_shift_register))
    # wire up all non-used coordinates to terms
    for sr in range(pixel_per_clock):
        for sr_index in range(image_size // pixel_per_clock):
            if (sr_index, sr) in used_coordinates:
                continue
            term = TermAnyType(pixel_type)
            wire(shift_register.O[sr][sr_index], term.I)
    # valid when the maximum coordinate used (minus origin, as origin can be
    # in invalid space when emitting) gets data.
    # add 1 here as coordinates are 0 indexed, and the denominator of this
    # fraction is the last register accessed.
    # would add 1 outside the fraction as it takes 1 clock for data to get
    # through the registers, but won't as 0 indexed
    valid_counter_max_value = ceil(
        (oldest_needed_pixel_forward_1D_coordinates + 1 + origin) /
        pixel_per_clock)
    # add 1 as SizedCounterModM counts to 1 less than the provided max
    valid_counter = SizedCounterModM(valid_counter_max_value + 1, has_ce=True)
    valid_counter_max_instance = DefineCoreirConst(
        len(valid_counter.O), valid_counter_max_value)()
    wire(enable(bit(cls.CE) &
                (valid_counter.O < valid_counter_max_instance.O)),
         valid_counter.CE)
    # if stride is greater than pixels_per_clock, then need a stride counter
    # as not active every clock. Invalid clocks create the striding in this
    # case
    if stride > pixel_per_clock:
        stride_counter = SizedCounterModM(stride // pixel_per_clock,
                                          has_ce=True)
        stride_counter_0 = DefineCoreirConst(len(stride_counter.O), 0)()
        wire(enable((stride_counter.O == stride_counter_0.O) &
                    (valid_counter.O == valid_counter_max_instance.O)),
             cls.valid)
        # only increment the stride counter if trying to emit data this clock
        # cycle
        wire(valid_counter.O == valid_counter_max_instance.O,
             stride_counter.CE)
    else:
        wire((valid_counter.O == valid_counter_max_instance.O), cls.valid)
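# Worked example of the 1D/2D coordinate conversion above (illustrative only,
# using the same numbers as the comment in the code): pixel_per_clock = 2,
# window_width = 3, one window per active clock.
def coordinate_conversion_example():
    from math import ceil
    pixel_per_clock, window_width = 2, 3
    needed_pixels = window_width  # one window per active clock
    # 3 is not divisible by 2, so align up to 4, then subtract 1 for 0 indexing
    oldest = ceil(needed_pixels / pixel_per_clock) * pixel_per_clock - 1  # 3
    location_reversed = oldest - 0  # ask for 1D location 0
    index_in_shift_register = location_reversed // pixel_per_clock  # 1
    current_shift_register = location_reversed % pixel_per_clock    # 1
    # matching the comment in the code: 1D location 0 maps to (1, 1)
    return index_in_shift_register, current_shift_register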
def definition(STBankGenerator):
    flat_idx_width = getRAMAddrWidth(no * ni)
    # next element every time_per_element clocks
    if time_per_element > 1:
        index_in_cur_element = SizedCounterModM(time_per_element,
                                                has_ce=has_ce,
                                                has_reset=has_reset)
        next_element = Decode(time_per_element - 1,
                              index_in_cur_element.O.N)(
            index_in_cur_element.O)
    else:
        next_element = DefineCoreirConst(1, 1)()
    # each element of the SSeq is a separate vector lane
    first_lane_flat_idx = DefineCounterModM(ni + ii, flat_idx_width,
                                            cout=False, has_ce=True,
                                            has_reset=has_reset)()
    wire(next_element.O[0], first_lane_flat_idx.CE)
    if has_ce:
        wire(STBankGenerator.CE, index_in_cur_element.CE)
    if has_reset:
        wire(STBankGenerator.RESET, index_in_cur_element.RESET)
        wire(STBankGenerator.RESET, first_lane_flat_idx.RESET)
    lane_flat_idxs = [first_lane_flat_idx.O]
    # compute the current flat_idx for each lane
    for i in range(1, no):
        cur_lane_flat_idx_adder = DefineAdd(flat_idx_width)()
        wire(cur_lane_flat_idx_adder.I0, first_lane_flat_idx.O)
        wire(cur_lane_flat_idx_adder.I1,
             DefineCoreirConst(flat_idx_width, i * ni)().O)
        lane_flat_idxs += [cur_lane_flat_idx_adder.O]
    lane_flat_div_lcms = []
    lcm_dim = DefineCoreirConst(flat_idx_width, lcm(no, ni))()
    # compute flat_idx / lcm_dim for each lane
    for i in range(no):
        cur_lane_lcm_div = DefineUDiv(flat_idx_width)()
        wire(cur_lane_lcm_div.I0, lane_flat_idxs[i])
        wire(cur_lane_lcm_div.I1, lcm_dim.O)
        lane_flat_div_lcms += [cur_lane_lcm_div.O]
    # compute ((flat_idx % sseq_dim) + (flat_idx / lcm_dim)) % sseq_dim for
    # each lane. only need to mod by sseq_dim at the end as that is the same
    # as also modding flat_idx before the addition
    for i in range(no):
        pre_mod_add = DefineAdd(flat_idx_width)()
        wire(pre_mod_add.I0, lane_flat_idxs[i])
        wire(pre_mod_add.I1, lane_flat_div_lcms[i])
        bank_mod = DefineUMod(flat_idx_width)()
        wire(bank_mod.I0, pre_mod_add.O)
        wire(bank_mod.I1, DefineCoreirConst(flat_idx_width, no)().O)
        wire(STBankGenerator.bank[i],
             bank_mod.O[0:STBankGenerator.bank_width])
        if len(bank_mod.O) > STBankGenerator.bank_width:
            bits_to_term = len(bank_mod.O) - STBankGenerator.bank_width
            term = TermAnyType(Array[bits_to_term, Bit])
            wire(bank_mod.O[STBankGenerator.bank_width:], term.I)
    # compute flat_idx / sseq_dim for each lane's addr
    for i in range(no):
        flat_idx_sseq_dim_div = DefineUDiv(flat_idx_width)()
        wire(flat_idx_sseq_dim_div.I0, lane_flat_idxs[i])
        wire(flat_idx_sseq_dim_div.I1,
             DefineCoreirConst(flat_idx_width, no)().O)
        wire(STBankGenerator.addr[i],
             flat_idx_sseq_dim_div.O[0:STBankGenerator.addr_width])
        if len(flat_idx_sseq_dim_div.O) > STBankGenerator.addr_width:
            bits_to_term = len(flat_idx_sseq_dim_div.O) - \
                STBankGenerator.addr_width
            term = TermAnyType(Array[bits_to_term, Bit])
            wire(flat_idx_sseq_dim_div.O[STBankGenerator.addr_width:], term.I)
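# Illustrative software model of the bank and addr equations above (not part
# of the hardware code), assuming sseq_dim = no and lcm_dim = lcm(no, ni) as
# the wiring suggests.
from math import gcd

def st_bank_addr_model(flat_idx, no, ni):
    lcm_dim = no * ni // gcd(no, ni)
    bank = ((flat_idx % no) + (flat_idx // lcm_dim)) % no
    addr = flat_idx // no
    return bank, addr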
def definition(cls):
    # the counter of the current element of the output sequence; when it hits
    # 0, load the next input to serialize
    element_idx_counter = SizedCounterModM(n + i_, has_ce=True,
                                           has_reset=has_reset)
    if element_idx_counter.O.N == math.ceil(math.log(n, 2)):
        element_idx_out = element_idx_counter.O
    else:
        used_bits_length = math.ceil(math.log(n, 2))
        unused_bits_length = element_idx_counter.O.N - used_bits_length
        element_idx_out = element_idx_counter.O[:used_bits_length]
        term = DefineTermAnyType(Array[unused_bits_length, Bit])()
        wire(element_idx_counter.O[used_bits_length:], term.I)
    is_first_element = Decode(0, element_idx_out.N)(element_idx_out)
    enabled = cls.valid_up
    valid_reg = DefineRegister(1)()
    wire(cls.valid_up, valid_reg.I[0])
    wire(valid_reg.O[0], cls.valid_down)
    # if each element takes multiple clocks, need a ram so can write all of
    # them and read them back over multiple clocks
    if is_nested(T) and T.time() > 1:
        value_store = [DefineRAMAnyType(T.magma_repr(), T.time())()
                       for _ in range(n - 1)]
        value_store_input = [ram.WDATA for ram in value_store]
        value_store_output = [ram.RDATA for ram in value_store]
        time_per_element_counter = SizedCounterModM(T.time(), has_ce=True,
                                                    has_reset=has_reset)
        go_to_next_element = Decode(T.time() - 1,
                                    time_per_element_counter.O.N)(
            time_per_element_counter.O)
        wire(time_per_element_counter.CE, enabled)
        wire(element_idx_counter.CE, enabled & go_to_next_element)
        for input_idx in range(n - 1):
            wire(value_store[input_idx].WE, is_first_element & enabled)
            # the location in the current element is where to read and write.
            # will write on the first iteration through the element, read on
            # later iterations
            wire(time_per_element_counter.O, value_store[input_idx].WADDR)
            wire(time_per_element_counter.O, value_store[input_idx].RADDR)
        if has_reset:
            wire(time_per_element_counter.RESET, cls.RESET)
    else:
        value_store = [DefineRegisterAnyType(T.magma_repr(), has_ce=True)()
                       for _ in range(n - 1)]
        value_store_input = [reg.I for reg in value_store]
        value_store_output = [reg.O for reg in value_store]
        wire(element_idx_counter.CE, enabled)
        for input_idx in range(n - 1):
            wire(value_store[input_idx].CE, is_first_element & enabled)
    for i in range(n - 1):
        wire(cls.I[i + 1], value_store_input[i])
    # to serialize, go through all the different rams/registers in the value
    # store and select the output from the ith one, where i is the current
    # output element
    value_store_output_selector = DefineMuxAnyType(T.magma_repr(), n)()
    for i in range(n - 1):
        wire(value_store_output[i], value_store_output_selector.data[i + 1])
    # just wiring this up to avoid any issues
    wire(value_store_output[0], value_store_output_selector.data[0])
    wire(element_idx_out, value_store_output_selector.sel)
    # on the first element, send the input directly out. otherwise, use the
    # stored value
    first_element_output_selector = DefineMuxAnyType(T.magma_repr(), 2)()
    wire(is_first_element, first_element_output_selector.sel[0])
    wire(value_store_output_selector.out,
         first_element_output_selector.data[0])
    wire(cls.I[0], first_element_output_selector.data[1])
    out_reg = DefineRegisterAnyType(cls.st_out_t.magma_repr())()
    wire(first_element_output_selector.out, out_reg.I)
    wire(out_reg.O, cls.O)
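# Illustrative arithmetic for the bit-slicing above (assuming SizedCounterModM
# sizes its output as ceil(log2(modulus)) bits; the example values n = 4 and
# i_ = 3 are assumptions, not from the original code).
import math

def bit_slicing_example():
    n, i_ = 4, 3
    counter_bits = math.ceil(math.log2(n + i_))  # 3: counter counts 0..6
    used_bits = math.ceil(math.log2(n))          # 2: select among n elements
    unused_bits = counter_bits - used_bits       # 1 bit wired to a term
    return counter_bits, used_bits, unused_bits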