def mkLed(numthreads=8):
    """Build the 'blinkled' module: pooled workers polling a mutex.

    A main thread ('th_blink') runs every worker of the 'th_myfunc' pool
    and then joins them. Each worker repeatedly calls
    ``mymutex.try_lock()`` until it returns a truthy value, counting the
    failed attempts, then prints, idles in an empty loop and unlocks.

    Args:
        numthreads: Size of the thread pool and number of workers that
            ``blink`` runs and joins (default 8).

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    mymutex = vthread.Mutex(m, 'mymutex', clk, rst)

    # Worker body. `tid` is the second argument given to pool.run().
    def myfunc(tid):
        print("-- Thread %d TryLock" % tid)
        lock = mymutex.try_lock()
        waitcount = 0
        # Keep retrying until try_lock() succeeds; waitcount records how
        # many retries were needed.
        while not lock:
            print("-- Thread %d TryLock" % tid)
            waitcount += 1
            lock = mymutex.try_lock()

        print("Thread %d Lock: waitcount=%d" % (tid, waitcount))

        # Empty loop used purely as a delay while the mutex is held.
        for i in range(20):
            pass  # sleep

        print("Thread %d Hello" % tid)
        mymutex.unlock()
        print("Thread %d Unlock" % tid)

    # Main body: launch all workers first, then wait for each one.
    def blink():
        for tid in range(numthreads):
            pool.run(tid, tid)

        for tid in range(numthreads):
            pool.join(tid)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # NOTE(review): blink() refers to `pool`, which is bound only here,
    # after the Thread object exists -- presumably blink is not evaluated
    # before th.start(); confirm.
    pool = vthread.ThreadPool(m, 'th_myfunc', clk, rst, myfunc, numthreads)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module exercising bit indexing and slicing.

    The thread body copies the 8-bit register ``count`` into the 8-bit
    output register ``LED`` in three ways: bit by bit through an index,
    through the slice ``count[0:2]`` and through the stepped slice
    ``count[0:8:2]``. ``led`` and ``count`` are reset to 0 before each
    phase.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('LED', 8, initval=0)
    count = m.Reg('count', 8, initval=0)

    # `times` is the iteration count of each phase; presumably supplied
    # by th.start(10) below.
    def blink(times):
        led.value = 0
        count.value = 0
        for i in range(times):
            for x in range(8):  # pointer
                # Per-bit copy: bit x of count into bit x of led.
                led.value[x] = count[x]
            print("led = ", led)
            count.value += 1

        led.value = 0
        count.value = 0
        for i in range(times):  # slice
            led.value = count[0:2]
            print("led = ", led)
            count.value += 1

        led.value = 0
        count.value = 0
        for i in range(times):  # slice with step
            led.value = count[0:8:2]
            print("led = ", led)
            count.value += 1

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module whose thread reads a module-level name.

    ``count`` is advanced through the ``Seq`` object, independently of
    the thread. The thread body assigns ``count + global_value`` to the
    ``LED`` output register on every iteration; ``global_value`` is not
    defined in this function and must come from the enclosing module.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('LED', 8, initval=0)
    count = m.Reg('count', 8, initval=0)

    # Register count.inc() with the sequential block (presumably one
    # increment per clock -- confirm against Seq semantics).
    seq = Seq(m, 'seq', clk, rst)
    seq(
        count.inc()
    )

    def blink(times):
        led.value = 0
        for i in range(times):
            # global_value is resolved outside this function.
            led.value = count + global_value
            print("led = ", led)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module accumulating a fixed-point constant.

    The thread body starts from ``FixedConst(0, 8)``, adds
    ``FixedConst(0.5, 8)`` on every iteration and writes the
    ``int_part`` of the running value to the ``LED`` register. A
    fixed-point register ``count`` is also created and advanced through
    ``Seq``, but the thread body does not read it.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.Reg('LED', 8, initval=0)
    count = fx.FixedReg(m, 'count', 8, point=3, initval=0)

    seq = Seq(m, 'seq', clk, rst)
    seq(count.inc())

    def blink(times):
        led.value = 0
        # Second FixedConst argument is presumably the fractional point
        # position -- TODO confirm.
        next_val = vthread.fixed.FixedConst(0, 8)
        for i in range(times):
            next_val = next_val + vthread.fixed.FixedConst(0.5, 8)
            # Only the integer part reaches the LED register.
            led.value = next_val.int_part
            print("led = ", led)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module reading and writing a ``FixedRAM``.

    The RAM is created with ``point=8`` and initial values
    ``i * 0.5 + 10`` for the first ``2 ** addrwidth - 100`` addresses.
    The thread body prints the first ``times`` entries, adds 0.25 to
    each of them in place, then re-reads them and prints their sum.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numports = 1
    # Initial contents cover all but the last 100 addresses.
    initvals = [i * 0.5 + 10 for i in range(2 ** addrwidth - 100)]
    myram = vthread.FixedRAM(m, 'myram', clk, rst, datawidth, addrwidth,
                             point=8, numports=numports, initvals=initvals)

    def blink(times):
        # Phase 1: dump the initial contents.
        for i in range(times):
            rdata = myram.read(i)
            print('rdata = %f' % rdata)

        # Phase 2: read-modify-write, adding 0.25 to each entry.
        for i in range(times):
            rdata = myram.read(i)
            b = vthread.fixed.FixedConst(0.25, 8)
            wdata = rdata + b
            myram.write(i, wdata)
            print('wdata = %f' % wdata)

        # Phase 3: read back and accumulate.
        sum = vthread.fixed.FixedConst(0, 8)
        for i in range(times):
            rdata = myram.read(i)
            print('rdata = %f' % rdata)
            sum += rdata

        print('sum = %f' % sum)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module accessing a ``MultibankRAM`` per bank.

    The thread body writes an incrementing value to address ``i`` of
    every bank via ``write_bank``, then reads everything back with
    ``read_bank`` and prints the accumulated sum.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myram = vthread.MultibankRAM(m, 'myram', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)

    def blink(times):
        # Write phase: one distinct value per (address, bank) pair.
        wdata = 0
        for i in range(times):
            for b in range(numbanks):
                myram.write_bank(b, i, wdata)
                print('bank:%d wdata = %d' % (b, wdata))
                wdata += 1

        # Read phase: visit the same (address, bank) pairs and sum them.
        sum = 0
        for i in range(times):
            for b in range(numbanks):
                rdata = myram.read_bank(b, i)
                sum += rdata
                print('bank:%d rdata = %d' % (b, rdata))

        print('sum = %d' % sum)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed(numthreads=8):
    """Build the 'blinkled' module: pooled workers sharing one counter.

    ``count`` is a 32-bit register wrapped in ``vthread.Shared`` and is
    also assigned to the 8-bit ``LED`` output. Each worker locks the
    shared object, idles, writes ``count.value + 1`` and unlocks. The
    main thread zeroes the counter, runs and joins all workers and
    prints the final value.

    Args:
        numthreads: Size of the thread pool and number of workers that
            ``blink`` runs and joins (default 8).

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.Output('LED', 8)

    count = vthread.Shared(m.Reg('count', 32, initval=0))
    led.assign(count.value)

    # Worker body. `tid` is the second argument given to pool.run().
    def myfunc(tid):
        count.lock()
        print("Thread %d Lock" % tid)

        # Empty loop used purely as a delay while the lock is held.
        for i in range(20):
            pass  # sleep

        # The increment happens between lock() and unlock().
        count.write(count.value + 1)
        print("Thread %d count = %d" % (tid, count.value))

        count.unlock()
        print("Thread %d Unlock" % tid)

    # Main body: reset the counter, launch all workers, wait, report.
    def blink():
        count.write(0)

        for tid in range(numthreads):
            pool.run(tid, tid)

        for tid in range(numthreads):
            pool.join(tid)

        print("result count = %d" % count.value)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # NOTE(review): blink() refers to `pool`, which is bound only here,
    # after the Thread object exists -- presumably blink is not evaluated
    # before th.start(); confirm.
    pool = vthread.ThreadPool(m, 'th_myfunc', clk, rst, myfunc, numthreads)
    fsm = th.start()

    return m
def mkLed(baudrate=19200, clockfreq=100 * 1000 * 1000):
    """Build the 'blinkled' module: a UART loop that adds ``sw`` to input.

    The thread body loops forever: it receives a value from ``uart_rx``,
    adds the 16-bit ``sw`` input, stores the result in the ``led``
    output register and sends the same result through ``uart_tx``.

    Args:
        baudrate: Forwarded to both ``UartTx`` and ``UartRx``.
        clockfreq: Forwarded to both ``UartTx`` and ``UartRx``.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    sw = m.Input('sw', 16)
    led = m.OutputReg('led', 16, initval=0)
    tx = m.Output('utx')
    rx = m.Input('urx')

    uart_tx = UartTx(m, 'inst_tx', 'tx_', clk, rst, tx,
                     baudrate=baudrate, clockfreq=clockfreq)
    uart_rx = UartRx(m, 'inst_rx', 'rx_', clk, rst, rx,
                     baudrate=baudrate, clockfreq=clockfreq)

    def blink():
        # Never terminates: receive, offset by sw, show, send back.
        while True:
            c = uart_rx.recv()
            data = c + sw
            led.value = data
            uart_tx.send(data)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start()

    return m
def mkLed(numthreads=8):
    """Build the 'blinkled' module: pooled workers taking a mutex.

    Each worker calls ``mymutex.lock()``, prints, idles in an empty loop
    and unlocks. The main thread runs and joins every worker. The pool
    is created with ``fsm_as_module=True``.

    Args:
        numthreads: Size of the thread pool and number of workers that
            ``blink`` runs and joins (default 8).

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    mymutex = vthread.Mutex(m, 'mymutex', clk, rst)

    # Worker body. `tid` is the second argument given to pool.run().
    def myfunc(tid):
        mymutex.lock()
        print("Thread %d Lock" % tid)

        # Empty loop used purely as a delay while the mutex is held.
        for i in range(20):
            pass  # sleep

        print("Thread %d Hello" % tid)
        mymutex.unlock()
        print("Thread %d Unlock" % tid)

    # Main body: launch all workers first, then wait for each one.
    def blink():
        for tid in range(numthreads):
            pool.run(tid, tid)

        for tid in range(numthreads):
            pool.join(tid)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # NOTE(review): blink() refers to `pool`, which is bound only here,
    # after the Thread object exists -- presumably blink is not evaluated
    # before th.start(); confirm.
    pool = vthread.ThreadPool(m, 'th_myfunc', clk, rst, myfunc, numthreads,
                              fsm_as_module=True)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module using user-defined thread intrinsics.

    ``send`` and ``wait`` take an ``fsm`` object as first parameter and
    drive it directly; they are registered with ``th.add_intrinsics`` and
    then called from the thread body without the ``fsm`` argument.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    data = m.Reg('data', 8, initval=0)
    enable = m.Reg('enable', initval=0)
    ready = m.Wire('ready')
    ready.assign(1)

    # Intrinsic: in one state load `data`, raise `enable` and display the
    # value; in the next state lower `enable`.
    def send(fsm, value):
        fsm(
            data(value),
            enable(1),
            Display("data = %d", value)
        )
        fsm.goto_next()
        fsm(
            enable(0)
        )
        fsm.goto_next()
        return 0

    # Intrinsic: advance to the next state only when `ready` holds.
    def wait(fsm):
        fsm.If(ready).goto_next()
        return 0

    def blink(times):
        for i in range(times):
            # data = i + 100
            # NOTE: this local `data` is distinct from the `data`
            # register defined in the enclosing function.
            data = vthread.verilog.Plus(i, 100)
            # Called without `fsm`; presumably supplied by the thread
            # framework for registered intrinsics -- confirm.
            send(data)
            wait()

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)

    # add intrinsics
    th.add_intrinsics(send, wait)

    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module comparing streams with sequential code.

    Three streams are defined: 'mul_stream' (``x * y``), 'mac_stream'
    (reduce-add of ``(a + 1) * (b + 1)`` over ``size`` items, using
    'mul_stream' as a substream) and 'act_stream' (reduce-add of
    ``(a + 2) * (b + 2)`` followed by a ``Mux`` that replaces
    non-positive sums with 0, again using 'mul_stream' as a substream).
    The thread body runs each stream and a sequential counterpart on the
    same RAM data and compares the results; mac and act are run twice.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    # mul_stream: z = x * y. Reused below as a substream.
    mulstrm = vthread.Stream(m, 'mul_stream', clk, rst)
    mulx = mulstrm.source('x')
    muly = mulstrm.source('y')
    mulz = mulx * muly
    mulstrm.sink(mulz, 'z')

    # mac_stream: sum of (a + 1) * (b + 1), emitted when sum_valid.
    macstrm = vthread.Stream(m, 'mac_stream', clk, rst)
    a = macstrm.source('a')
    b = macstrm.source('b')
    a = a + 1
    b = b + 1
    sub = macstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = macstrm.constant('size')
    sum, sum_valid = macstrm.ReduceAddValid(c, size)
    macstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # act_stream: like mac_stream but with +2 on each input (two +1
    # steps) and a Mux that replaces non-positive sums with 0.
    actstrm = vthread.Stream(m, 'act_stream', clk, rst)
    a = actstrm.source('a')
    b = actstrm.source('b')
    a = a + 1
    b = b + 1
    a = a + 1
    b = b + 1
    sub = actstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = actstrm.constant('size')
    sum, sum_valid = actstrm.ReduceAddValid(c, size)
    sum = actstrm.Mux(sum > 0, sum, 0)
    actstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Run mul_stream over `size` items starting at `offset` in the RAMs.
    def comp_stream_mul(size, offset):
        mulstrm.set_source('x', ram_a, offset, size)
        mulstrm.set_source('y', ram_b, offset, size)
        mulstrm.set_sink('z', ram_c, offset, size)
        mulstrm.run()
        mulstrm.join()

    # Run mac_stream; the sink is configured for a single output word.
    def comp_stream_mac(size, offset):
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('size', size)
        macstrm.set_sink('sum', ram_c, offset, 1)
        macstrm.run()
        macstrm.join()

    # Run act_stream; the sink is configured for a single output word.
    def comp_stream_act(size, offset):
        actstrm.set_source('a', ram_a, offset, size)
        actstrm.set_source('b', ram_b, offset, size)
        actstrm.set_constant('size', size)
        actstrm.set_sink('sum', ram_c, offset, 1)
        actstrm.run()
        actstrm.join()

    # Sequential counterpart of mul_stream: element-wise product.
    def comp_sequential_mul(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a * b
            ram_c.write(i + offset, sum)

    # Sequential counterpart of mac_stream: one accumulated result.
    def comp_sequential_mac(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 1
            b = ram_b.read(i + offset) + 1
            sum += a * b
        ram_c.write(offset, sum)

    # Sequential counterpart of act_stream: accumulate, then replace a
    # non-positive result with 0.
    def comp_sequential_act(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 2
            b = ram_b.read(i + offset) + 2
            sum += a * b
        if sum <= 0:
            sum = 0
        ram_c.write(offset, sum)

    # Compare `size` words of ram_c at the two offsets; print mismatches
    # and a final 'OK' / 'NG'.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Thread body. Each section loads ram_a / ram_b by DMA (global
    # addresses 0 and 512), computes into ram_c at local offset 0
    # (stream) or `size` (sequential), writes the result back by DMA
    # (global addresses 1024 and 1024 * 2) and compares both results.
    def comp(size):
        # mul
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# MUL')
        check(size, 0, offset)

        # mac
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# MAC')
        check(1, 0, offset)

        # act
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

        # mac 2
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# MAC')
        check(1, 0, offset)

        # act 2
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module testing a stream ``CounterValid``.

    The stream computes ``a + b + cntval`` where ``cntval`` is 1000 when
    the counter's ``valid`` output holds and the counter value
    otherwise. The thread body runs the stream with constant
    ``size // 2``, runs a sequential model, compares both results in
    ``ram_c`` and calls ``vthread.finish()``.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    size = strm.constant('size')
    cnt, valid = strm.CounterValid(size)
    a = strm.source('a')
    b = strm.source('b')
    # Substitute 1000 for the counter value whenever `valid` holds.
    cntval = strm.Mux(valid, 1000, cnt)
    c = a + b + cntval
    strm.sink(c, 'c')

    # Run the stream over `size` items; the counter limit is size // 2.
    def comp_stream(size, offset):
        strm.set_constant('size', size // 2)
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Sequential model of the stream. `cnt` is forced to 1000 when it
    # reaches size // 2 - 1 and wraps to 0 after reaching 1001.
    def comp_sequential(size, offset):
        sum = 0
        cnt = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b + cnt
            cnt += 1
            if cnt == 1001:
                cnt = 0
            if cnt == size // 2 - 1:
                cnt = 1000
            ram_c.write(i + offset, sum)

    # Compare `size` words of ram_c at the two offsets.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module testing a conditionally written sink.

    The stream computes ``c = a + b`` and the condition
    ``140 < c < 150``. ``c`` is sunk only when the condition holds, and
    a reduce-add of the condition is sunk as 'cnt'. The thread body runs
    the stream and a sequential model, writes both results out by DMA,
    reads them back and compares ``cnt`` words.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    # v: c lies strictly between 140 and 150.
    v = strm.Ands(c > 140, c < 150)
    # cnt accumulates v.
    cnt = strm.ReduceAdd(v)
    strm.sink(c, 'c', when=v, when_name='v')
    strm.sink(cnt, 'cnt')

    # Run the stream and return the value read from sink 'cnt'.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, 0)  # max_size
        strm.set_sink_immediate('cnt', 0)  # max_size
        strm.run()
        strm.join()
        cnt = strm.read_sink('cnt')
        print('# num of counted: %d' % cnt)
        return cnt

    # Sequential model: store matching sums contiguously from `offset`
    # and return how many were stored.
    def comp_sequential(size, offset):
        sum = 0
        addr = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            c = a + b
            if c > 140 and c < 150:
                ram_c.write(addr + offset, c)
                addr += 1
        print('# num of counted: %d' % addr)
        return addr

    # Compare `size` words of ram_c at the two offsets.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        cnt = comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, cnt)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        cnt = comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, cnt)

        # verification
        # Reload both result regions before comparing; `cnt` here is the
        # count returned by the sequential run.
        myaxi.dma_read(ram_c, 0, 1024, cnt)
        myaxi.dma_read(ram_c, offset, 1024 * 2, cnt)
        check(cnt, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def run(a_shape=(15, 15), b_shape=(15, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and self-check a 'matrix_add_relu' design.

    Builds ``relu(add(a, b))`` from two placeholders, converts it with
    ``ng.to_veriloggen``, prepares a memory image holding the inputs and
    the expected output from ``ng.eval``, wraps the design in a 'test'
    module with an AXI memory model and a control thread, and runs the
    simulator.

    Args:
        a_shape: Shape of placeholder 'a'.
        b_shape: Shape of placeholder 'b'.
        a_dtype: Data type of placeholder 'a'.
        b_dtype: Data type of placeholder 'b'.
        c_dtype: Data type passed to both ``ng.add`` and ``ng.relu``.
        par: ``par`` value passed to ``ng.add`` and ``ng.relu``; also
            used as a lower bound for the ``axi.set_memory`` packing
            argument.
        axi_datawidth: Used as the 'maxi_datawidth' config entry and as
            the memory model's ``datawidth``.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: When not None, the test module is written there with
            ``m.to_verilog``.
        simtype: Simulator name given to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this source
            file's base name with a '.out' suffix.

    Returns:
        The text returned by ``Simulator.run``. When ``simtype`` is
        'verilator' and the last line starts with '-', that line is
        dropped.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    t = ng.add(a, b, dtype=c_dtype, par=par)
    c = ng.relu(t, dtype=c_dtype, par=par)

    targ = ng.to_veriloggen([c], 'matrix_add_relu', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5] - [10]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6] - [10]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # Largest tensor size rounded up to a multiple of 4096; the expected
    # output is placed one such region past the highest tensor address.
    size_max = int(math.ceil(max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    # Fill with a non-zero background value.
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Product of all output dimensions except the last one.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # Control thread body: start the design, wait for it, then compare
    # the produced output with the expected copy word by word.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Hard stop so the simulation ends even if the thread never finishes.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator produced no output; check it
    # first so that lines[-1] cannot raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkLed():
    """Build the 'blinkled' module testing DMA around ``boundary_size``.

    The thread body writes two test vectors by DMA, one starting at
    global address ``myaxi.boundary_size - 4`` and one a further
    ``boundary_size`` above it, reads both back and checks every word.
    ``all_ok`` is cleared on any mismatch.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Result flag; set by blink() and cleared by body() on a mismatch.
    all_ok = m.TmpReg(initval=0)

    def blink(size):
        all_ok.value = True

        # Test for 4KB boundary check
        offset = myaxi.boundary_size - 4
        body(size, offset)

        if all_ok:
            print('ALL OK')

    def body(size, offset):
        # write
        # First vector: values i + 100, written at `offset`.
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write
        # Second vector: values i + 1000, one boundary_size higher.
        for i in range(size):
            wdata = i + 1000
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset + myaxi.boundary_size
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        # Read back and check the first vector.
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 100):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

        # read
        # Read back and check the second vector.
        laddr = 0
        gaddr = offset + myaxi.boundary_size
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 1000):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(256 + 256 + 64)

    return m
def mkLed(word_datawidth=128):
    """Build the 'blinkled' module looping data through AXI-stream FIFOs.

    For four iterations the thread body writes a test vector by DMA,
    streams it in through ``axi_in`` into ``fifo_in``, moves every word
    to ``fifo_out``, streams it out through ``axi_out`` to a second
    global address, then reads both regions back and compares them.

    Args:
        word_datawidth: Data width of the RAM and of both FIFOs
            (default 128); the AXI interfaces use ``datawidth`` = 32.

    Returns:
        The ``Module`` object created here.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, word_datawidth, addrwidth, numports=2)

    axi_in = vthread.AXIStreamInFifo(m, 'axi_in', clk, rst, datawidth,
                                     with_last=True, noio=True)
    axi_out = vthread.AXIStreamOutFifo(m, 'axi_out', clk, rst, datawidth,
                                       with_last=True, noio=True)

    maxi_in = vthread.AXIM_for_AXIStreamIn(axi_in, 'maxi_in')
    maxi_out = vthread.AXIM_for_AXIStreamOut(axi_out, 'maxi_out')

    fifo_addrwidth = 8
    fifo_in = vthread.FIFO(m, 'fifo_in', clk, rst, word_datawidth, fifo_addrwidth)
    fifo_out = vthread.FIFO(m, 'fifo_out', clk, rst, word_datawidth, fifo_addrwidth)

    # Result flag; set by blink() and cleared by body() on a mismatch.
    all_ok = m.TmpReg(initval=0)

    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check
            # Start one RAM word below a boundary_size multiple, shifted
            # by 16 KiB per iteration.
            offset = i * 1024 * 16 + (myaxi.boundary_size - (word_datawidth // 8))
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    def body(size, offset):
        # write a test vector
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size, port=1)

        # AXI-stream read -> FIFO -> FIFO -> AXI-stream write
        # Transfer lengths are scaled by word_datawidth // datawidth.
        maxi_in.dma_read_async(gaddr, size * (word_datawidth // datawidth))
        axi_in.write_fifo(fifo_in, size)

        for i in range(size):
            va = fifo_in.deq()
            fifo_out.enq(va)

        # Destination lies 2 * size RAM words (in bytes) above `offset`.
        out_gaddr = (size + size) * (word_datawidth // 8) + offset
        maxi_out.dma_write_async(out_gaddr, size * (word_datawidth // datawidth))
        axi_out.read_fifo(fifo_out, size)

        # check
        # Source region to RAM [0, size), copied region to
        # [size, 2 * size), then compare pairwise.
        myaxi.dma_read(myram, 0, gaddr, size, port=1)
        myaxi.dma_read(myram, size, out_gaddr, size, port=1)

        for i in range(size):
            v0 = myram.read(i)
            v1 = myram.read(i + size)
            if vthread.verilog.NotEql(v0, v1):
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(17)

    return m
def run(a_shape=(7, 15), b_shape=(7, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Export ``MatrixMul`` to ONNX, build it with NNgen and simulate it.

    The PyTorch model is exported to 'onnx_matrix_mul.onnx' in the
    current directory, imported with ``ng.from_onnx``, quantized,
    evaluated in software with ``ng.eval``, converted with
    ``ng.to_veriloggen`` and finally simulated inside a 'test' module
    that compares the output against the software result.

    Args:
        a_shape: Shape of the dummy tensor used for input 'a'.
        b_shape: Shape of the dummy tensor used for input 'b'.
        a_dtype: Entry for 'a' in ``value_dtypes``.
        b_dtype: Entry for 'b' in ``value_dtypes``.
        c_dtype: Entry for 'c' in ``value_dtypes``.
        par: ``par`` attribute set on every ``ng.scaled_multiply``
            operator; also used as a lower bound for the
            ``axi.set_memory`` packing argument.
        axi_datawidth: Used as the 'maxi_datawidth' config entry and as
            the memory model's ``datawidth``.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: When not None, the test module is written there with
            ``m.to_verilog``.
        simtype: Simulator name given to ``simulation.Simulator``. When
            None, ``sys.exit()`` is called after hardware generation.
        outputfile: Simulator output file; defaults to this source
            file's base name with a '.out' suffix.

    Returns:
        The text returned by ``Simulator.run``. When ``simtype`` is
        'verilator' and the last line starts with '-', that line is
        dropped.

    Raises:
        SystemExit: When ``simtype`` is None.
    """

    # pytorch model
    model = MatrixMul()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_mul.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model, dummy_inputs, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype,
                    'b': b_dtype,
                    'c': c_dtype}

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=ng.int32,
                                          default_variable_dtype=ng.int32,
                                          default_constant_dtype=ng.int32,
                                          default_operator_dtype=ng.int32,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'a': 10.0, 'b': 15.0}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.scaled_multiply):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    # verification data
    input_a = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [17]
    input_b = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [13]

    # execution on pytorch
    model_a = input_a.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)

    model_b = input_b.astype(np.float32)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a), torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # software-based verification
    # Scale the inputs, clip them to the symmetric range allowed by the
    # dtype width, and round to integers.
    va = input_a * input_scale_factors['a']
    va = np.clip(va,
                 -1.0 * (2**(a.dtype.width - 1) - 1),
                 1.0 * (2**(a.dtype.width - 1) - 1))
    va = np.round(va).astype(np.int64)

    vb = input_b * input_scale_factors['b']
    vb = np.clip(vb,
                 -1.0 * (2**(b.dtype.width - 1) - 1),
                 1.0 * (2**(b.dtype.width - 1) - 1))
    vb = np.round(vb).astype(np.int64)

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # Not used below; kept so the values can be inspected at the
    # (commented-out) breakpoint.
    mean_square_error = np.sum((vc - scaled_model_c)**2) / vc.size
    corrcoef = np.corrcoef(model_c.reshape([-1]), vc.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([c], 'onnx_matrix_mul', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # Region start addresses, each rounded up to a multiple of 4096:
    # parameters after the placeholders, then the expected output, then
    # the temporary area.
    variable_addr = int(math.ceil(max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    # Fill with a non-zero background value.
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Product of all output dimensions except the last one.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # Control thread body: start the design, wait for it, then compare
    # the produced output with the expected copy word by word.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Hard stop so the simulation ends even if the thread never finishes.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator produced no output; check it
    # first so that lines[-1] cannot raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 7, 7, 15), weight_shape=(7, 3, 3, 15),
        bias_shape=None, scale_shape=None,
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        conv2d_stride=(1, 1, 1, 1),
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        input_ram_size=None, filter_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        ksize=(1, 2, 2, 1), pool_stride=(1, 2, 2, 1),
        par=1,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and self-check a conv2d + avg_pool design.

    Builds ``avg_pool(conv2d(act, weight, ...))`` with optional bias and
    scale variables, converts it with ``ng.to_veriloggen`` under the
    name 'matrix_conv2d_avg_pool', prepares a memory image holding the
    inputs and the expected output from ``ng.eval``, and runs the
    simulator on a 'test' module that compares every output element.

    Args:
        act_shape: Shape of placeholder 'act'.
        weight_shape: Shape of variable 'weight'.
        bias_shape: Shape of variable 'bias'; no bias when None.
        scale_shape: Shape of variable 'scale'; no scale when None.
        act_dtype, weight_dtype, bias_dtype, scale_dtype: Data types of
            the corresponding tensors.
        out_dtype: Data type given to ``ng.conv2d`` and ``ng.avg_pool``.
        conv2d_stride, rshift_mul, rshift_sum, rshift_out, act_func,
        par_ich, par_och, par_col, par_row, concur_och, stationary,
        input_ram_size, filter_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Forwarded positionally to ``ng.conv2d``.
            ``par_ich`` / ``par_och`` are also used as lower bounds for
            the ``axi.set_memory`` packing argument.
        ksize: ``ksize`` of ``ng.avg_pool``.
        pool_stride: ``strides`` of ``ng.avg_pool``.
        par: ``par`` of ``ng.avg_pool``; also used when storing the
            expected output.
        axi_datawidth: Used as the 'maxi_datawidth' config entry and as
            the memory model's ``datawidth``.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: When not None, the test module is written there with
            ``m.to_verilog``.
        simtype: Simulator name given to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this source
            file's base name with a '.out' suffix.

    Returns:
        The text returned by ``Simulator.run``. When ``simtype`` is
        'verilator' and the last line starts with '-', that line is
        dropped.
    """

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    weight = ng.variable(weight_dtype, shape=weight_shape, name='weight')

    if bias_shape is not None:
        bias = ng.variable(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.variable(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    tmp = ng.conv2d(act, weight, conv2d_stride,
                    bias, scale,
                    rshift_mul, rshift_sum, rshift_out,
                    act_func, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d',
                    par_ich, par_och, par_col, par_row,
                    concur_och, stationary,
                    input_ram_size, filter_ram_size,
                    bias_ram_size, scale_ram_size,
                    None, None, None,
                    out_ram_size)

    out = ng.avg_pool(tmp, ksize=ksize,
                      strides=pool_stride,
                      sum_dtype=ng.int32, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]
    vweight = np.arange(weight.length,
                        dtype=np.int64).reshape(weight.shape) % [32] - [16]

    if bias is not None:
        vbias = np.arange(bias.length,
                          dtype=np.int64).reshape(bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length,
                           dtype=np.int64).reshape(scale.shape) % [6]
    else:
        vscale = None

    eval_outs = ng.eval([out], act=vact, weight=vweight, bias=vbias, scale=vscale)
    vout = eval_outs[0]

    # to memory image
    # Largest tensor size rounded up to a multiple of 4096; the expected
    # output is placed one such region past the highest tensor address.
    size_max = int(math.ceil(max(act.memory_size, weight.memory_size,
                                 bias.memory_size if bias is not None else 0,
                                 scale.memory_size if scale is not None else 0,
                                 out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, weight.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    # Fill with a non-zero background value.
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    axi.set_memory(mem, vweight, memimg_datawidth,
                   weight_dtype.width, weight.addr,
                   max(int(math.ceil(axi_datawidth / weight_dtype.width)), par_ich))

    if bias is not None:
        axi.set_memory(mem, vbias, memimg_datawidth,
                       bias_dtype.width, bias.addr,
                       max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_och))

    if scale is not None:
        axi.set_memory(mem, vscale, memimg_datawidth,
                       scale_dtype.width, scale.addr,
                       max(int(math.ceil(axi_datawidth / scale_dtype.width)), par_och))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Control thread body: start the design, wait for it, then compare
    # every (bat, y, x, ch) output element with the expected copy. Word
    # indices are computed from out.aligned_shape.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                                                y * out.aligned_shape[2] * out.aligned_shape[3] +
                                                x * out.aligned_shape[3] + ch,
                                                out.addr, out_dtype.width)
                        check = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                                                 y * out.aligned_shape[2] * out.aligned_shape[3] +
                                                 x * out.aligned_shape[3] + ch,
                                                 check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Hard stop so the simulation ends even if the thread never finishes.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator produced no output; check it
    # first so that lines[-1] cannot raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkLed():
    """Build the 'blinkled' module exercising stream counters.

    A stream named 'mystream' sums six ``strm.Counter`` instances (the two
    sources 'a' and 'b' are added and subtracted again, so they cancel) and
    sinks the result as 'c'.  A thread runs the stream, recomputes the same
    values with plain sequential code, and compares both result regions of
    ``ram_c``.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    # Stream definition: six counters with different initval / step /
    # interval / size settings are summed together.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    cnt1 = strm.Counter()
    cnt2 = strm.Counter(initval=1)
    cnt3 = strm.Counter(initval=2, size=5)
    cnt4 = strm.Counter(initval=3, interval=3)
    cnt5 = strm.Counter(initval=4, interval=3, size=7)
    cnt6 = strm.Counter(initval=4, step=2, interval=2)
    a = strm.source('a')
    b = strm.source('b')
    # a and b cancel out; only the counter values remain in c.
    c = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
    strm.sink(c, 'c')

    # Bind the stream ports to the RAMs and run the stream to completion.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Sequential reference: one expression per stream counter, derived from
    # the iteration count `cnt`.
    # NOTE(review): these formulas are presumably the software model of the
    # corresponding strm.Counter settings above -- confirm against the
    # Counter semantics.
    def comp_sequential(size, offset):
        cnt = 0
        for i in range(size):
            cnt1 = cnt
            cnt2 = 1 + cnt
            cnt3 = (cnt + 2) % 5
            cnt4 = (cnt // 3) + 3
            cnt5 = ((cnt // 3) + 4) % 7
            cnt6 = (cnt // 2) * 2 + 4
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
            ram_c.write(i + offset, sum)
            cnt += 1

    # Compare `size` words of ram_c at the two offsets and print the verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread entry point: stream pass at offset 0, sequential pass at
    # offset `size`, then verification.
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # Start the thread with size=32.
    fsm = th.start(32)

    return m
def run(act_shape=(1, 4, 4, 3),
        weight0_shape=(9, 3, 3, 3), weight1_shape=(9, 36),
        act_dtype=ng.int32, weight_dtype=ng.int32,
        stride0=1, padding0=0,
        with_batchnorm0=False, with_batchnorm1=False,
        act_func0='ReLU', act_func1='relu',
        disable_fusion=False,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Build a Conv2d -> Transpose -> Flatten -> Linear model and test it.

    The PyTorch model is exported to ONNX, imported into NNgen, quantized,
    evaluated in software, converted to a Veriloggen object, and (unless
    ``simtype`` is None) simulated against the software result.

    Args:
        act_shape: Shape of the input activation passed to NNgen.
        weight0_shape: Conv2d weight shape; index 3 is used as the input
            channel count, index 0 as the output channel count and index 1
            as the kernel size.
        weight1_shape: Linear weight shape as (out_features, in_features).
        act_dtype: NNgen dtype for activations and operator outputs.
        weight_dtype: NNgen dtype for variables and constants.
        stride0, padding0: Conv2d stride and padding.
        with_batchnorm0: Append batch normalization after the Conv2d.
        with_batchnorm1: Append batch normalization after the Linear.
        act_func0, act_func1: Names of ``torch.nn`` activation layer
            classes, or None for no activation.  Names are matched
            case-insensitively when no exact attribute exists.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        par_ich, par_och, par_col, par_row, concur_och: Hardware attributes
            applied to every ``ng.conv2d`` operator.
        stationary: Accepted for interface compatibility; not used here.
        chunk_size: Alignment unit for the memory image layout.
        axi_datawidth: Data width of the AXI master interface.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test bench Verilog is written there.
        simtype: Simulator name; None terminates the process via
            ``sys.exit()`` after hardware generation.
        outputfile: Simulator output file name; derived from this file's
            name when None.

    Returns:
        The simulator output text (with the last line dropped for
        verilator when that line starts with '-').
    """

    def make_activation(name):
        # torch.nn exposes activation layers as CamelCase classes (nn.ReLU);
        # a lower-case name such as the default 'relu' has no exact
        # attribute, so fall back to a case-insensitive match.
        layer_cls = getattr(nn, name, None)
        if layer_cls is None:
            lowered = name.lower()
            candidates = [attr for attr in dir(nn) if attr.lower() == lowered]
            if not candidates:
                raise AttributeError(
                    "module 'torch.nn' has no activation named '%s'" % name)
            layer_cls = getattr(nn, candidates[0])
        return layer_cls()

    # pytorch model
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3], weight0_shape[0], weight0_shape[1],
                  stride=stride0, padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 is not None:
        layers.append(make_activation(act_func0))

    class Transpose(nn.Module):
        def __init__(self, perm):
            super(Transpose, self).__init__()
            self.perm = perm

        def forward(self, input):
            return input.permute(*self.perm)

    layers.append(Transpose([0, 1, 3, 2]))

    class Flatten(nn.Module):
        def forward(self, input):
            # return input.view(input.size(0), -1)
            return torch.reshape(input, (input.size(0), -1))

    layers.append(Flatten())

    # Remember where the Linear layer sits: nn.Sequential names its
    # parameters '<index>.weight', and the index depends on which optional
    # layers were inserted above.
    linear_index = len(layers)
    layers.append(nn.Linear(weight1_shape[1], weight1_shape[0]))

    if with_batchnorm1:
        # The Linear output is 2-D (batch, features), which BatchNorm2d
        # rejects; BatchNorm1d is the matching layer.
        layers.append(nn.BatchNorm1d(weight1_shape[0]))

    if act_func1 is not None:
        layers.append(make_activation(act_func1))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_transpose_linear.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '%d.weight' % linear_index: weight_dtype,
        'out': act_dtype
    }

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # random data
    std = 0.2
    mean = 0.5
    img = np.random.normal(size=act.length).astype(np.float32).reshape(
        act.shape)
    img = img * std + mean

    # execution on pytorch
    model_input = img

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    vact = img * act_scale_factor
    vact = np.clip(vact,
                   -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # Kept for interactive inspection (see the breakpoint marker below).
    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out], 'onnx_matrix_conv2d_transpose_linear',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    # Layout: input activation, then parameters, then expected output, then
    # temporary space, each region aligned to chunk_size.
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Fill the image with a non-zero background value.
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Test-bench thread: start the accelerator, wait for it, then compare
    # the produced output words with the expected ones in the memory model.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(out.shape[0]):
            for j in range(out.shape[1]):
                orig = memory.read_word(i * out.aligned_shape[1] + j,
                                        out.addr, act_dtype.width)
                check = memory.read_word(i * out.aligned_shape[1] + j,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', i, j, ') orig: ', orig, 'check: ', check)
                    ok = False
                # else:
                #     print('OK (', i, j, ') orig: ', orig, 'check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Watchdog: end the simulation even if the controller never finishes.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before inspecting the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module testing DMA with a multibank RAM.

    Four single RAMs are grouped into a ``vthread.MultibankRAM``; a thread
    writes two data sets to external memory through ``myaxi`` and reads them
    back, checking every bank.

    Args:
        memory_datawidth: Data width passed to ``vthread.AXIM``.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    myrams = [vthread.RAM(m, 'myram_%d' % i, clk, rst, datawidth, addrwidth)
              for i in range(numbanks)]
    myram = vthread.MultibankRAM(rams=myrams, name='myram')

    # Hardware flag cleared by body() on any read-back mismatch.
    all_ok = m.TmpReg(initval=0)

    array_len = 16
    # Address distance between the first and the second data set.
    array_size = (array_len + array_len) * 4 * numbanks

    # Thread entry point: run body() four times at different offsets.
    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check
            offset = i * 1024 * 16 + (myaxi.boundary_size - 4)
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('ALL OK')

    # One write/write/read/read round trip starting at global address
    # `offset`.
    def body(size, offset):
        # write
        for bank in range(numbanks):
            for i in range(size):
                wdata = i + 100 + bank
                myram.write_bank(bank, i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write
        for bank in range(numbanks):
            for i in range(size):
                wdata = i + 1000 + bank
                myram.write_bank(bank, i, wdata)

        laddr = 0
        gaddr = array_size + offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # Expect the first data set (i + 100 + bank).
        for bank in range(numbanks):
            for i in range(size):
                rdata = myram.read_bank(bank, i)
                if vthread.verilog.NotEql(rdata, i + 100 + bank):
                    print('rdata[%d] = %d' % (i, rdata))
                    all_ok.value = False

        # read
        laddr = 0
        gaddr = array_size + offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # Expect the second data set (i + 1000 + bank).
        for bank in range(numbanks):
            for i in range(size):
                rdata = myram.read_bank(bank, i)
                if vthread.verilog.NotEql(rdata, i + 1000 + bank):
                    print('rdata[%d] = %d' % (i, rdata))
                    all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # Start the thread with size=array_len.
    fsm = th.start(array_len)

    return m
def mkTest(baudrate=19200, clockfreq=19200 * 10): m = Module('test') # target instance led = mkLed(baudrate, clockfreq) uut = Submodule(m, led, name='uut', prefix='', as_wire=('utx', 'urx')) clk = uut['CLK'] rst = uut['RST'] tx = uut['utx'] rx = uut['urx'] sw = uut['sw'] uart_tx = UartTx(m, 'inst_tx', 'tx_', clk, rst, as_wire='txd', baudrate=baudrate, clockfreq=clockfreq) uart_rx = UartRx(m, 'inst_rx', 'rx_', clk, rst, as_wire='rxd', baudrate=baudrate, clockfreq=clockfreq) txd = uart_tx['txd'] rxd = uart_rx['rxd'] rx.assign(txd) rxd.assign(tx) #simulation.setup_waveform(m, uut, uart_tx, uart_rx) simulation.setup_clock(m, clk, hperiod=5) init = simulation.setup_reset(m, rst, m.make_reset(), period=100) init.add(sw(10), Delay(1000000), Systask('finish')) all_ok = m.TmpReg(initval=0) def test(): all_ok = True for i in range(10): s = 100 + i uart_tx.send(s) r = uart_rx.recv() if vthread.verilog.Eql(r, s + sw): print('OK: %d + %d === %d' % (s, sw, r)) else: print('NG: %d + %d !== %d' % (s, sw, r)) all_ok = False if all_ok: print('# verify: PASSED') else: print('# verify: FAILED') vthread.finish() th = vthread.Thread(m, 'test', clk, rst, test) th.start() return m
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module testing DMA across an address boundary.

    A thread writes two data sets from ``myram`` to external memory through
    ``myaxi``, reads them back, and reports PASSED/FAILED.

    Args:
        memory_datawidth: Data width passed to ``vthread.AXIM``; also used
            (in bytes) to place the start offset just below
            ``myaxi.boundary_size``.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Hardware flag cleared by body() on any read-back mismatch.
    all_ok = m.TmpReg(initval=0)

    # Thread entry point: run body() four times at different offsets.
    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check
            offset = i * 1024 * 16 + (myaxi.boundary_size - memory_datawidth // 8)
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    # One write/write/read/read round trip starting at global address
    # `offset`.
    def body(size, offset):
        # write
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write
        for i in range(size):
            wdata = i + 1000
            myram.write(i, wdata)

        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # Expect the first data set (i + 100).
        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 100):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

        # read
        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # Expect the second data set (i + 1000).
        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 1000):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # Start the thread with size=17.
    fsm = th.start(17)

    return m
def mkLed():
    """Build the 'blinkled' module sharing a multibank RAM with an RTL FSM.

    A two-port ``vthread.MultibankRAM`` is written by a thread (``blink``)
    and accessed through port 1 by a hand-written FSM that waits for
    ``write_done``, writes, reads back while accumulating into ``sum``, and
    displays the sum.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    # NOTE(review): this subtracts log2 of `addrwidth`, not of `numbanks`
    # -- confirm which one is intended for the per-bank address width.
    ram_addrwidth = addrwidth - int(math.log(addrwidth, 2))
    myram = vthread.MultibankRAM(m, 'myram', clk, rst, datawidth, ram_addrwidth,
                                 numbanks=numbanks, numports=2)

    read_size = 10
    write_size = read_size

    # Signals driven by the RTL FSM and connected to RAM port 1 below.
    write_done = m.Reg('write_done', initval=0)
    addr = m.Reg('addr', addrwidth, initval=0)
    wdata = m.Reg('wdata', datawidth, initval=0)
    wenable = m.Reg('wenable', initval=0)
    rdata = m.Wire('rdata', datawidth)
    sum = m.Reg('sum', datawidth, initval=0)

    fsm = FSM(m, 'fsm', clk, rst)

    # Stay in the first state until the thread sets write_done.
    fsm.If(write_done).goto_next()

    # write
    fsm(
        addr(-1),
        wdata(-1),
        wenable(0)
    )
    fsm.goto_next()

    fsm(
        addr.inc(),
        wdata.inc(),
        wenable(1)
    )
    fsm.Delay(1)(
        Display('wdata = %d', wdata),
        wenable(0)
    )
    fsm.If(addr == write_size - 2).goto_next()

    # read
    fsm(
        addr(-1),
        wenable(0)
    )
    fsm.goto_next()

    fsm(
        addr.inc()
    )
    # The accumulation is scheduled two cycles after the address update.
    fsm.Delay(2)(
        sum.add(rdata),
        Display('rdata = %d', rdata)
    )
    fsm.If(addr == read_size - 2).goto_next()

    fsm.goto_next()
    fsm.goto_next()

    # sum
    fsm(
        Display('sum = %d', sum)
    )
    fsm.goto_next()

    # connect ports to RAM
    myram.connect_rtl(1, addr, wdata, wenable, rdata)

    # Thread body: write `times` words, then signal the FSM.
    def blink(times):
        write_done.value = 0
        for i in range(times):
            wdata = i + 100
            myram.write(i, wdata)
            print('wdata = %d' % wdata)
        write_done.value = 1

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # Rebinds `fsm` to the value returned by th.start().
    fsm = th.start(read_size)

    return m
def mkLed():
    """Build the 'blinkled' module testing stream access patterns.

    A stream adds sources 'a' and 'b' into sink 'c'; all three ports use the
    same (size, stride) pattern derived from ``shape`` and ``order``.  A
    thread runs the stream, a sequential reference, and a comparison.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    shape = [16, 4, 8]
    # Total number of elements (product of all dimensions).
    size = functools.reduce(lambda x, y: x * y, shape, 1)
    order = [1, 2, 0]

    def to_pattern(shape, order):
        """Return a list of (size, stride) pairs, one per axis in `order`.

        The stride of axis p is the product of the dimensions after p.
        """
        pattern = []
        for p in order:
            size = shape[p]
            stride = functools.reduce(lambda x, y: x * y,
                                      shape[p + 1:], 1)
            pattern.append((size, stride))
        return pattern

    pattern_a = to_pattern(shape, order)
    pattern_b = to_pattern(shape, order)
    pattern_c = to_pattern(shape, order)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    # Bind the stream ports to the RAMs with their patterns and run it.
    def comp_stream(offset):
        strm.set_source_pattern('a', ram_a, offset, pattern_a)
        strm.set_source_pattern('b', ram_b, offset, pattern_b)
        strm.set_sink_pattern('c', ram_c, offset, pattern_c)
        strm.run()
        strm.join()

    # Sequential reference: element-wise a + b in address order.
    def comp_sequential(offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b
            ram_c.write(i + offset, sum)

    # NOTE(review): only the single word at each offset is compared, not
    # the whole region -- confirm this is intended.
    def check(offset_stream, offset_seq):
        all_ok = True
        st = ram_c.read(offset_stream)
        sq = ram_c.read(offset_seq)
        if vthread.verilog.NotEql(st, sq):
            all_ok = False
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Thread entry point: stream pass at offset 0, sequential pass at
    # offset `size`, then verification.
    def comp():
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        # verification
        check(0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module testing a stream used as a substream.

    'macstream' multiplies its two sources and reduces the products with
    ``ReduceAddValid``.  'mystream' embeds it through ``strm.substream`` and
    writes ``z + x`` to its sink only when the valid signal ``v`` is set.
    A thread runs each stream and its sequential reference and compares the
    results.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    reduce_size = 4

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    ram_d = vthread.RAM(m, 'ram_d', clk, rst, datawidth, addrwidth)

    # Inner stream: multiply, then reduce-add with a runtime-set size.
    macstrm = vthread.Stream(m, 'macstream', clk, rst)
    macstrm_a = macstrm.source('a')
    macstrm_b = macstrm.source('b')
    macstrm_const = macstrm.constant('const')
    macstrm_mul = macstrm_a * macstrm_b
    macstrm_c, macstrm_v = macstrm.ReduceAddValid(macstrm_mul, macstrm_const)
    macstrm_v += 0
    macstrm.sink(macstrm_c, 'c')
    macstrm.sink(macstrm_v, 'v')

    # Outer stream: feeds x, y and const into the inner stream.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    x = strm.source('x')
    y = strm.source('y')
    const = strm.constant('const')
    sub = strm.substream(macstrm)
    sub.to_source('a', x)
    sub.to_source('b', y)
    sub.to_constant('const', const)
    z = sub.from_sink('c')
    v = sub.from_sink('v')
    z = z + x
    # The sink is gated by v.
    strm.sink(z, 'z', when=v, when_name='v')

    # Run the inner stream on its own.
    def comp_stream_macstrm(size, offset):
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('const', reduce_size)
        macstrm.set_sink('c', ram_c, offset, size)
        macstrm.set_sink('v', ram_d, offset, size)
        macstrm.run()
        macstrm.join()

    # Run the outer stream; it produces size // reduce_size output words.
    def comp_stream_mystrm(size, offset):
        strm.set_source('x', ram_a, offset, size)
        strm.set_source('y', ram_b, offset, size)
        strm.set_constant('const', reduce_size)
        strm.set_sink('z', ram_c, offset, size // reduce_size)
        strm.run()
        strm.join()

    # Sequential reference for the inner stream: running sum of products,
    # reset after every reduce_size elements.
    def comp_sequential_macstrm(size, offset):
        sum = 0
        count = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum += a * b
            count += 1
            ram_c.write(i + offset, sum)
            ram_d.write(i + offset, count == (reduce_size - 1))
            if count == reduce_size:
                sum = 0
                count = 0

    # Sequential reference for the outer stream: one word (sum + x) is
    # written after every reduce_size elements.
    def comp_sequential_mystrm(size, offset):
        sum = 0
        count = 0
        write_offset = offset
        for i in range(size):
            x = ram_a.read(i + offset)
            y = ram_b.read(i + offset)
            sum += x * y
            val = sum + x
            count += 1
            if count == reduce_size:
                ram_c.write(write_offset, val)
                write_offset += 1
                sum = 0
                count = 0

    # Compare `size` words of ram_c at the two offsets; mismatches are
    # printed as (index, stream value, sequential value).
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Thread entry point: test 'macstream' first, then 'mystream'.
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# macstream')
        check(size, 0, offset)

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size // reduce_size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size // reduce_size)

        # verification
        print('# mystream')
        check(size // reduce_size, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # Start the thread with size=16.
    fsm = th.start(16)

    return m
def run(act_dtype=ng.int16, weight_dtype=ng.int8,
        bias_dtype=ng.int32, scale_dtype=ng.int8,
        with_batchnorm=True, disable_fusion=False,
        conv2d_par_ich=1, conv2d_par_och=1, conv2d_par_col=1, conv2d_par_row=1,
        conv2d_concur_och=None, conv2d_stationary='filter',
        pool_par=1, elem_par=1,
        chunk_size=64, axi_datawidth=32, silent=False,
        filename=None,
        # simtype='iverilog',
        # simtype='verilator',
        simtype=None,  # no RTL simulation
        outputfile=None):
    """Compile torchvision's ResNet-18 with NNgen and verify it.

    The pretrained model is exported to ONNX, imported into NNgen,
    quantized, compared layer by layer against PyTorch in software,
    converted to IP-XACT, and (when ``simtype`` is not None) simulated.

    Reads 'car.png' and 'imagenet_class_index.json' from the current
    directory and writes 'resnet18_imagenet.onnx'.

    Args:
        act_dtype: NNgen dtype for activations and operator outputs.
        weight_dtype: NNgen dtype for variables and constants.
        bias_dtype: NNgen dtype for biases.
        scale_dtype: NNgen dtype for scales.
        with_batchnorm: Must be True; any other value raises ValueError.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: Hardware attributes applied
            to every ``ng.conv2d`` operator.
        pool_par: ``par`` attribute for pooling operators.
        elem_par: ``par`` attribute for element-wise operators.
        chunk_size: Alignment unit for the memory image layout.
        axi_datawidth: Data width of the AXI master interface.
        silent: Forwarded to ``ng.to_ipxact``.
        filename: If not None, the test bench Verilog is written there.
        simtype: Simulator name; None (the default) terminates the process
            via ``sys.exit()`` after hardware generation.
        outputfile: Simulator output file name; derived from this file's
            name when None.

    Returns:
        The simulator output text (with the last line dropped for
        verilator when that line starts with '-').

    Raises:
        ValueError: If ``with_batchnorm`` is false.
    """

    # input mean and standard deviation
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    act_shape = (1, 224, 224, 3)

    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18_imagenet.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': imagenet_mean * act_scale_factor}
    input_stds = {'act': imagenet_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # Close the image file as soon as the pixels have been copied out.
    with PIL.Image.open('car.png') as image_file:
        img = np.array(image_file.convert('RGB')).astype(np.float32)
    img = img.reshape([1] + list(img.shape))

    img = img / 255
    img = (img - imagenet_mean) / imagenet_std

    # execution on pytorch
    model_input = np.broadcast_to(img, act_shape)

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    vact = img * act_scale_factor
    vact = np.clip(vact,
                   -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)
    vact = np.broadcast_to(vact, act_shape)

    # compare outputs of hidden layers
    # Each selection takes the first operator of the given type, relying on
    # the iteration order of `operators`.
    relu_op = [
        v for k, v in operators.items()
        if isinstance(v, ng.conv2d) and not isinstance(v, ng.matmul)
    ][0]
    maxpool_op = [
        v for k, v in operators.items()
        if isinstance(v, (ng.max_pool, ng.max_pool_serial))
    ][0]
    relu_ops = [v for k, v in operators.items() if isinstance(v, ng.relu)]
    layer1_0_op = relu_ops[0]
    layer1_op = relu_ops[1]
    layer2_0_op = relu_ops[2]
    layer2_op = relu_ops[3]
    layer3_0_op = relu_ops[4]
    layer3_op = relu_ops[5]
    layer4_0_op = relu_ops[6]
    layer4_op = relu_ops[7]
    avgpool_op = [
        v for k, v in operators.items()
        if isinstance(v, (ng.avg_pool, ng.avg_pool_serial))
    ][0]
    fc_op = [v for k, v in operators.items() if isinstance(v, ng.matmul)][0]
    sub_ops = [
        relu_op, maxpool_op, layer1_0_op, layer1_op, layer2_0_op, layer2_op,
        layer3_0_op, layer3_op, layer4_0_op, layer4_op, avgpool_op, fc_op
    ]
    sub_outs = ng.eval(sub_ops, act=vact)
    # Reorder every output except the last (the fc result) from
    # axes (0, 1, 2, 3) to (0, 3, 1, 2) to line up with the PyTorch outputs.
    sub_outs = [sub_out.transpose([0, 3, 1, 2])
                for sub_out in sub_outs[:-1]] + sub_outs[-1:]
    sub_scale_factors = [sub_op.scale_factor for sub_op in sub_ops]

    model.eval()
    model_relu_out = nn.Sequential(model.conv1, model.bn1, model.relu)(
        torch.from_numpy(model_input)).detach().numpy()
    model_maxpool_out = nn.Sequential(
        model.conv1, model.bn1, model.relu,
        model.maxpool)(torch.from_numpy(model_input)).detach().numpy()

    # class model_layer1_0(nn.Module):
    #     def __init__(self):
    #         super(model_layer1_0, self).__init__()
    #         self.conv1 = model.conv1
    #         self.bn1 = model.bn1
    #         self.relu = model.relu
    #         self.maxpool = model.maxpool
    #         self.layer1_0 = model.layer1[0]
    #
    #     def forward(self, x):
    #         x = self.relu(self.bn1(self.conv1(x)))
    #         x = self.maxpool(x)
    #         x = self.layer1_0(x)
    #         return x
    #
    # model_layer1_0_out = model_layer1_0()(torch.from_numpy(model_input)).detach().numpy()

    model_layer1_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer1_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1)(torch.from_numpy(model_input)).detach().numpy()
    model_layer2_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer2_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2)(torch.from_numpy(model_input)).detach().numpy()
    model_layer3_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2,
        model.layer3[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer3_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2,
        model.layer3)(torch.from_numpy(model_input)).detach().numpy()
    model_layer4_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3,
        model.layer4[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer4_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3,
        model.layer4)(torch.from_numpy(model_input)).detach().numpy()
    model_avgpool_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3, model.layer4,
        model.avgpool)(torch.from_numpy(model_input)).detach().numpy()

    class Flatten(nn.Module):
        def forward(self, input):
            return input.view(input.size(0), -1)

    model_fc_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3, model.layer4, model.avgpool, Flatten(),
        model.fc)(torch.from_numpy(model_input)).detach().numpy()

    model_outs = [
        model_relu_out, model_maxpool_out, model_layer1_0_out,
        model_layer1_out, model_layer2_0_out, model_layer2_out,
        model_layer3_0_out, model_layer3_out, model_layer4_0_out,
        model_layer4_out, model_avgpool_out, model_fc_out
    ]
    scaled_outs = [
        model_out * scale_factor
        for model_out, scale_factor in zip(model_outs, sub_scale_factors)
    ]

    # The metrics below are not used further in this function; they are
    # kept for interactive inspection (see the breakpoint marker below).
    max_diffs = [
        model_out.max() / sub_out.max()
        for model_out, sub_out in zip(scaled_outs, sub_outs)
    ]
    overflows = [
        np.sum(np.abs(sub_out) >= abs(2**(sub_op.dtype.width - 1) - 1))
        for sub_op, sub_out in zip(sub_ops, sub_outs)
    ]
    mean_square_errors = [
        np.sum((sub_out - model_out)**2) / sub_out.size
        for model_out, sub_out in zip(scaled_outs, sub_outs)
    ]
    corrcoefs = [
        np.corrcoef(model_out.reshape([-1]), sub_out.reshape([-1]))
        for model_out, sub_out in zip(model_outs, sub_outs)
    ]

    # compare prediction results
    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # Use a context manager so the label file is closed deterministically.
    with open('imagenet_class_index.json', 'r') as index_file:
        class_index = json.load(index_file)
    labels = {int(key): value for (key, value) in class_index.items()}

    mout = scaled_model_out
    for bat in range(mout.shape[0]):
        m_top10 = sorted(enumerate(mout[bat]),
                         key=lambda x: x[1], reverse=True)[:10]
        m_top10_indexes = [index for index, value in m_top10]
        v_top10 = sorted(enumerate(vout[bat]),
                         key=lambda x: x[1], reverse=True)[:10]
        v_top10_indexes = [index for index, value in v_top10]
        num_hit = 0
        score = 0
        for index, value in m_top10:
            print("# mout: %s (%d) = %f" % (str(labels[index]), index, value))
        for index, value in v_top10:
            print("# vout: %s (%d) = %d" % (str(labels[index]), index, value))
            if index in m_top10_indexes:
                num_hit += 1
                # A hit scores 10 minus the rank distance between the two
                # top-10 lists.
                score += 10 - abs(
                    m_top10_indexes.index(index) - v_top10_indexes.index(index))
        print("# top-10 hit: %d" % num_hit)
        print("# top-10 score: %d" % score)

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([out], 'resnet18', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([out], 'resnet18', silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([out], 'resnet18', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    # Layout: input activation, then parameters, then expected output, then
    # temporary space, each region aligned to chunk_size.
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = np.zeros([1024 * 1024 * 1024 // (memimg_datawidth // 8)],
                   dtype=np.int16)
    # Fill the image with a non-zero background value.
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Test-bench thread: start the accelerator, wait for it, then compare
    # the produced output words with the expected ones in the memory model.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for x in range(out.shape[1]):
                orig = memory.read_word(bat * out.aligned_shape[1] + x,
                                        out.addr, act_dtype.width)
                check = memory.read_word(bat * out.aligned_shape[1] + x,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x,
                          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Watchdog: end the simulation even if the controller never finishes.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before inspecting the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkLed(memory_datawidth: int = 128):
    """Build the 'blinkled' test module: stream adder vs. sequential adder.

    The module owns an AXI master ('myaxi'), three 4-bank RAMs (ram_a,
    ram_b, ram_c) and a stream ('mystream') whose sink 'c' is the sum of
    its sources 'a' and 'b'.  The thread 'th_comp' computes a + b twice,
    once through the stream and once with a plain loop, into two regions
    of ram_c, compares the two regions and prints '# verify: PASSED' or
    '# verify: FAILED'.

    Args:
        memory_datawidth: Data width passed to vthread.AXIM for 'myaxi'.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # RAM geometry shared by ram_a, ram_b and ram_c.
    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    ram_a = vthread.MultibankRAM(m, 'ram_a', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)
    ram_b = vthread.MultibankRAM(m, 'ram_b', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)
    ram_c = vthread.MultibankRAM(m, 'ram_c', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)

    # Stream definition: sink 'c' receives source 'a' + source 'b'.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    # Stream pass: bind ram_a/ram_b as sources and ram_c as the sink, all
    # with the same offset and size, then run the stream and wait for it.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Reference pass: the same element-wise addition done with explicit
    # RAM reads and writes at addresses offset .. offset + size - 1.
    def comp_sequential(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b
            ram_c.write(i + offset, sum)

    # Compare `size` entries of ram_c starting at offset_stream with the
    # entries starting at offset_seq.  Each mismatch is printed as
    # (index, stream value, sequential value) before the final verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread body: load inputs, run the stream pass, reload inputs, run
    # the sequential pass, then compare the two result regions.
    # NOTE(review): the dma_read/dma_write arguments look like
    # (ram, RAM address, memory address, length) -- confirm against
    # vthread.AXIM.
    def comp(size):
        dma_size = size
        # NOTE(review): RAM-side compute offsets are scaled by numbanks
        # while the DMA offsets are not; presumably one DMA word spans all
        # banks -- confirm.
        comp_size = size * numbanks

        # Pass 1 (stream): both offsets are 0, result written with third
        # argument 1024.
        dma_offset = 0
        comp_offset = 0
        myaxi.dma_read(ram_a, dma_offset, 0, dma_size)
        myaxi.dma_read(ram_b, dma_offset, 0, dma_size)
        comp_stream(size, comp_offset)
        myaxi.dma_write(ram_c, dma_offset, 1024, dma_size)

        # Pass 2 (sequential): same third argument 0 for the reads, but
        # placed at dma_offset = size / comp_offset = comp_size.
        dma_offset = size
        comp_offset = comp_size
        myaxi.dma_read(ram_a, dma_offset, 0, dma_size)
        myaxi.dma_read(ram_b, dma_offset, 0, dma_size)
        comp_sequential(size, comp_offset)
        myaxi.dma_write(ram_c, dma_offset, 1024 * 2, dma_size)

        # NOTE(review): check() covers comp_size entries although both
        # passes above were called with `size` -- confirm this is intended.
        check(comp_size, 0, comp_offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is presumably forwarded to comp() as `size` -- confirm against
    # vthread.Thread.start.
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' test module: pattern-driven stream vs. loop.

    The module owns an AXI master ('myaxi'), three RAMs (ram_a, ram_b,
    ram_c) and a stream ('mystream') whose sink 'sum' is the sum of its
    sources 'a' and 'b'.  The sources are driven through
    set_source_pattern with the pattern [(size, 0)].  The thread 'th_comp'
    runs the stream and an equivalent sequential loop into two regions of
    ram_c, compares them and prints '# verify: PASSED' or
    '# verify: FAILED'.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    size = 16
    # NOTE(review): presumably a list of (count, stride) pairs; with a
    # second element of 0 the same address would be read `size` times,
    # which matches the fixed address used in comp_sequential -- confirm
    # against vthread.Stream.set_source_pattern.
    pattern = [(size, 0)]

    # Stream definition: sink 'sum' receives source 'a' + source 'b'.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    sum = a + b
    strm.sink(sum, 'sum')

    # Stream pass: both sources start at offset + 10 and follow `pattern`;
    # the sink writes `size` results to ram_c starting at `offset`.
    def comp_stream(offset):
        strm.set_source_pattern('a', ram_a, offset + 10, pattern)
        strm.set_source_pattern('b', ram_b, offset + 10, pattern)
        strm.set_sink('sum', ram_c, offset, size)
        strm.run()
        strm.join()

    # Reference pass: every iteration reads the same address offset + 10
    # from ram_a and ram_b and writes the sum to ram_c at i + offset.
    def comp_sequential(offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(offset + 10)
            b = ram_b.read(offset + 10)
            sum = a + b
            ram_c.write(i + offset, sum)

    # Compare `size` entries of ram_c starting at offset_stream with the
    # entries starting at offset_seq and print the verdict.  The `size`
    # parameter shadows the enclosing `size` variable.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread body: load inputs, run the stream pass at offset 0, reload
    # inputs at offset `size`, run the sequential pass, then compare.
    # NOTE(review): the dma_read/dma_write arguments look like
    # (ram, RAM address, memory address, length) -- confirm against
    # vthread.AXIM.
    def comp():
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        # The last argument is 1 here, not `size`.
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        # The last argument is 1 here as well.
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        # Stream results start at 0, sequential results at `offset`.
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed(matrix_size: int = 16):
    """Build the 'blinkled' test module: matrix multiply via a stream.

    The module owns a 32-bit 'timer' register incremented through the
    'seq' sequencer, three RAMs (ram_a, ram_b, ram_c), an AXI master
    ('myaxi'), a stream built from strm_madd and the thread 'th_matmul'.
    The thread runs comp(), prints the timer difference measured around
    it, and then runs check(), which prints "OK" or "NG".

    Args:
        matrix_size: Passed to the thread as its matrix_size argument;
            the thread is started with offsets 0, 1024 and 2048.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # Cycle counter used by matmul() to time the computation.
    seq = Seq(m, 'seq', clk, rst)
    timer = m.Reg('timer', 32, initval=0)
    seq(timer.inc())

    datawidth = 32
    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    # Thread entry point: time comp() with the timer register, print the
    # elapsed count, then verify the result with check().
    def matmul(matrix_size, a_offset, b_offset, c_offset):
        # presumably samples the timer value at this point -- confirm
        start_time = timer
        comp(matrix_size, a_offset, b_offset, c_offset)
        end_time = timer
        time = end_time - start_time
        print("Time (cycles): %d" % time)
        check(matrix_size, a_offset, b_offset, c_offset)

    # Stream body: read `size` values from ram_a and ram_b starting at 0,
    # feed their product to RegionAdd over `size`, and write the result to
    # ram_c at waddr (length 1) when `valid` is set.
    # NOTE(review): argument meanings of strm.read/strm.write/RegionAdd
    # are inferred from the call sites -- confirm against vthread.Stream.
    def strm_madd(strm, size, waddr):
        a = strm.read(ram_a, 0, size)
        b = strm.read(ram_b, 0, size)
        sum, valid = strm.RegionAdd(a * b, size)
        strm.write(ram_c, waddr, 1, sum, when=valid)

    # For each i: DMA one block of A into ram_a; for each j: DMA one block
    # of B into ram_b and run the stream with (matrix_size, j); after the
    # inner loop DMA ram_c out to c_addr.
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            myaxi.dma_read(ram_a, 0, a_addr, matrix_size)

            # Restart B from its base for every i.
            b_addr = b_offset
            for j in range(matrix_size):
                myaxi.dma_read(ram_b, 0, b_addr, matrix_size)

                # j is passed as strm_madd's waddr.
                stream.run(matrix_size, j)
                stream.join()

                # Step by matrix_size words of datawidth // 8 bytes each.
                b_addr += matrix_size * (datawidth // 8)

            myaxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    # Read the result back block by block and compare it with the
    # expected values: (i + 1) * 2 where i == j and 0 elsewhere.  Only
    # c_offset is used; a_offset and b_offset are ignored here.
    # NOTE(review): the expected values depend on memory contents set up
    # outside this function -- confirm against the test bench.
    def check(matrix_size, a_offset, b_offset, c_offset):
        all_ok = True
        c_addr = c_offset
        for i in range(matrix_size):
            myaxi.dma_read(ram_c, 0, c_addr, matrix_size)
            for j in range(matrix_size):
                v = ram_c.read(j)
                if i == j and vthread.verilog.NotEql(v, (i + 1) * 2):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
                if i != j and vthread.verilog.NotEql(v, 0):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
            c_addr += matrix_size * (datawidth // 8)

        if all_ok:
            print("OK")
        else:
            print("NG")

    # `stream` is referenced by name inside comp(), so it is created
    # before the thread is started.
    stream = vthread.Stream(m, 'strm_madd', clk, rst, strm_madd)
    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    # Arguments: matrix_size, a_offset=0, b_offset=1024, c_offset=2048.
    fsm = th.start(matrix_size, 0, 1024,
                   2048)

    return m