def run(a_shape=(15, 15), b_shape=(15, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and self-check a ``relu(a + b)`` design.

    The function builds the NNgen graph, converts it with
    ``ng.to_veriloggen``, computes the expected result with ``ng.eval``,
    places inputs and expected outputs in a memory image, wraps the design
    in a testbench module named 'test' and runs the simulator.

    Args:
        a_shape: Shape of placeholder 'a'.
        b_shape: Shape of placeholder 'b'.
        a_dtype: Data type of 'a'.
        b_dtype: Data type of 'b'.
        c_dtype: Data type passed to ``ng.add`` and ``ng.relu``.
        par: Forwarded as ``par`` to ``ng.add`` and ``ng.relu``; also a lower
            bound for the last argument of ``axi.set_memory``.
        axi_datawidth: Used as the 'maxi_datawidth' config entry and as the
            ``datawidth`` of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: When not None, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Passed to ``sim.run``. When None it defaults to this
            file's base name plus '.out'. The memory image name is derived
            from it.

    Returns:
        The text returned by ``sim.run``. For ``simtype == 'verilator'`` a
        final line starting with '-' is removed.
    """
    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    t = ng.add(a, b, dtype=c_dtype, par=par)
    c = ng.relu(t, dtype=c_dtype, par=par)

    targ = ng.to_veriloggen([c], 'matrix_add_relu', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data (deterministic ramps containing negative values)
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5] - [10]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape)
          + [100]) % [6] - [10]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # Round the largest buffer up to a multiple of 4096 and put the expected
    # values right behind the highest buffer address.
    size_max = int(math.ceil(
        max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Pre-fill every word with 100 so untouched memory is not zero.
    mem = mem + [100]

    # NOTE(review): the last argument is presumably the word alignment used
    # by axi.set_memory -- confirm against its signature.
    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Number of rows once all leading dimensions of c are flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # NOTE(review): ctrl is handed to vthread.Thread below, which presumably
    # synthesises it from its source -- keep its statements and local names
    # unchanged unless a different generated testbench is intended.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # Enable the next line to dump waveforms of the design under test.
    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Safety stop in case the controller thread never reaches finish().
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator printed nothing; guard the index so
    # that case returns the empty output instead of raising IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 7, 7, 7), weight_shape=(3, 3, 3, 7),
        bias_shape=None, scale_shape=None,
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        stride=(1, 1, 1, 1),
        rshift_mul=None, rshift_sum=None, rshift_out=0,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        input_ram_size=None, filter_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and self-check a ``relu(conv2d(act, weight))`` design.

    The function builds the NNgen graph, converts it with
    ``ng.to_veriloggen``, computes the expected result with ``ng.eval``,
    places inputs and expected outputs in a memory image, wraps the design
    in a testbench module named 'test' and runs the simulator.

    Args:
        act_shape: Shape of placeholder 'act'.
        weight_shape: Shape of variable 'weight'.
        bias_shape: Shape of variable 'bias'; no bias is used when None.
        scale_shape: Shape of variable 'scale'; no scale is used when None.
        act_dtype, weight_dtype, bias_dtype, scale_dtype: Data types of the
            corresponding operands.
        out_dtype: Data type forwarded to ``ng.conv2d`` and used when
            reading the result back.
        stride, rshift_mul, rshift_sum, rshift_out, par_ich, par_och,
            par_col, par_row, concur_och, stationary, input_ram_size,
            filter_ram_size, bias_ram_size, scale_ram_size, out_ram_size:
            Forwarded positionally to ``ng.conv2d``. ``par_ich`` and
            ``par_och`` are also lower bounds for the last argument of
            ``axi.set_memory``.
        axi_datawidth: Used as the 'maxi_datawidth' config entry and as the
            ``datawidth`` of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: When not None, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Passed to ``sim.run``. When None it defaults to this
            file's base name plus '.out'. The memory image name is derived
            from it.

    Returns:
        The text returned by ``sim.run``. For ``simtype == 'verilator'`` a
        final line starting with '-' is removed.
    """
    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    weight = ng.variable(weight_dtype, shape=weight_shape, name='weight')

    if bias_shape is not None:
        bias = ng.variable(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.variable(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    # Arguments are positional; keep this order in sync with ng.conv2d.
    out = ng.conv2d(act, weight, stride,
                    bias, scale,
                    rshift_mul, rshift_sum, rshift_out,
                    None, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d',
                    par_ich, par_och, par_col, par_row,
                    concur_och, stationary,
                    input_ram_size, filter_ram_size,
                    bias_ram_size, scale_ram_size,
                    None, None, None,
                    out_ram_size)
    out = ng.relu(out)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_relu', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    # Activations use a smaller value range when the type is 4 bits or less.
    act_mod = 11 if act_dtype.width > 4 else 5
    vact = np.arange(act.length,
                     dtype=np.int64).reshape(act.shape) % [act_mod]

    vweight = np.arange(weight.length,
                        dtype=np.int64).reshape(weight.shape) % [7] - [3]

    if bias is not None:
        vbias = np.arange(bias.length,
                          dtype=np.int64).reshape(bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length,
                           dtype=np.int64).reshape(scale.shape) % [6]
    else:
        vscale = None

    eval_outs = ng.eval([out], act=vact, weight=vweight,
                        bias=vbias, scale=vscale)
    vout = eval_outs[0]

    # to memory image
    # Round the largest buffer up to a multiple of 4096 and put the expected
    # values right behind the highest buffer address.
    size_max = int(math.ceil(
        max(act.memory_size, weight.memory_size,
            bias.memory_size if bias is not None else 0,
            scale.memory_size if scale is not None else 0,
            out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, weight.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Pre-fill every word with 100 so untouched memory is not zero.
    mem = mem + [100]

    # NOTE(review): the last argument is presumably the word alignment used
    # by axi.set_memory -- confirm against its signature.
    axi.set_memory(
        mem, vact, memimg_datawidth,
        act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    axi.set_memory(
        mem, vweight, memimg_datawidth,
        weight_dtype.width, weight.addr,
        max(int(math.ceil(axi_datawidth / weight_dtype.width)), par_ich))

    if bias is not None:
        axi.set_memory(
            mem, vbias, memimg_datawidth,
            bias_dtype.width, bias.addr,
            max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_och))

    if scale is not None:
        axi.set_memory(
            mem, vscale, memimg_datawidth,
            scale_dtype.width, scale.addr,
            max(int(math.ceil(axi_datawidth / scale_dtype.width)), par_och))

    axi.set_memory(
        mem, vout, memimg_datawidth,
        out_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # NOTE(review): ctrl is handed to vthread.Thread below, which presumably
    # synthesises it from its source -- keep its statements and local names
    # unchanged unless a different generated testbench is intended.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # Enable the next line to dump waveforms of the design under test.
    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Safety stop in case the controller thread never reaches finish().
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator printed nothing; guard the index so
    # that case returns the empty output instead of raising IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(ich=3, och=10, ch=64, ksize=3, stride=1, col=28, row=28):
    """Assemble the simulation testbench for a four-layer CNN design.

    Layers, in order: conv2d -> max_pool_serial -> relu; conv2d -> relu ->
    reshape to 1-D; matmul -> relu; matmul (named 'output_layer').  The graph
    is converted with ``ng.to_veriloggen`` under the name 'cnn' and wrapped
    in a module 'test' with an AXI memory model, an AXI-Lite master, a cycle
    counter and a controller thread that starts the design and prints the
    elapsed cycle count.

    Args:
        ich: Channel count of the input placeholder and of kernel 'w0'.
        och: Row count of the last weight matrix 'w3'.
        ch: Leading dimension of both convolution kernels.
        ksize: Spatial size of both convolution kernels.
        stride: Stride used for the two middle entries of the conv strides.
        col: Third entry of the input shape.
        row: Second entry of the input shape.

    Returns:
        The testbench ``Module``.
    """
    conv_strides = (1, stride, stride, 1)
    pool_window = (1, 2, 2, 1)

    # --- target hardware -------------------------------------------------
    # layer 0: conv2d, max_pool_serial, relu
    net_in = ng.placeholder(ng.int32, shape=(1, row, col, ich),
                            name='input_layer')
    kernel0 = ng.variable(ng.int32, shape=(ch, ksize, ksize, ich), name='w0')
    conv0 = ng.conv2d(net_in, kernel0, strides=conv_strides)
    pool0 = ng.max_pool_serial(conv0, ksize=pool_window, strides=pool_window)
    feat0 = ng.relu(pool0)

    # layer 1: conv2d, relu, reshape
    kernel1 = ng.variable(ng.int32,
                          shape=(ch, ksize, ksize, feat0.shape[-1]),
                          name='w1')
    conv1 = ng.conv2d(feat0, kernel1, strides=conv_strides)
    flat1 = ng.reshape(ng.relu(conv1), [-1])

    # layer 2: full-connection
    fc2_weight = ng.variable(ng.int32, shape=(16, flat1.shape[-1]), name='w2')
    fc2 = ng.relu(ng.matmul(flat1, fc2_weight, transposed_b=True))

    # layer 3: full-connection
    fc3_weight = ng.variable(ng.int32, shape=(och, fc2.shape[-1]), name='w3')
    net_out = ng.matmul(fc2, fc3_weight, transposed_b=True,
                        name='output_layer')

    targ = ng.to_veriloggen([net_out], 'cnn')
    # Alternative kept from the original: ng.to_ipxact([net_out], 'cnn')

    # --- testbench -------------------------------------------------------
    tb = Module('test')
    tb.copy_params(targ)
    sim_ports = tb.copy_sim_ports(targ)
    clock, reset_n = sim_ports['CLK'], sim_ports['RESETN']
    reset = tb.Wire('RST')
    reset.assign(Not(reset_n))

    # AXI memory model
    axi_mem = axi.AxiMemoryModel(tb, 'memory', clock, reset,
                                 mem_addrwidth=23)
    axi_mem.connect(sim_ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(tb, '_saxi', clock, reset, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # 32-bit counter incremented by the 'seq' sequencer
    time_counter = tb.Reg('time_counter', 32, initval=0)
    Seq(tb, 'seq', clock, reset)(time_counter.inc())

    # NOTE(review): ctrl is handed to vthread.Thread, which presumably
    # synthesises it from its source; its body and the names it reads are
    # therefore left exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    vthread.Thread(tb, 'th_ctrl', clock, reset, ctrl).start()

    tb.Instance(targ, 'uut',
                params=tb.connect_params(targ),
                ports=tb.connect_ports(targ))

    simulation.setup_clock(tb, clock, hperiod=5)
    reset_seq = simulation.setup_reset(tb, reset_n, tb.make_reset(),
                                       period=100, polarity='low')
    # Stop the simulation after the delay even if ctrl never finishes.
    reset_seq.add(Delay(10000000), Systask('finish'))

    return tb
def run(a_shape=(15, 15), b_shape=(15, 15),
        bias_shape=None, scale_shape=None,
        a_dtype=ng.int32, b_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        c_dtype=ng.int32,
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_left_col=1, par_left_row=1, par_out_col=1,
        concur_out_col=None, stationary='right',
        left_ram_size=None, right_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and self-check a ``relu(matmul(a, b^T))`` design.

    The function builds the NNgen graph (``transposed_b`` is True), converts
    it with ``ng.to_veriloggen``, computes the expected result with
    ``ng.verify.matmul`` / ``ng.verify.relu``, places inputs and expected
    outputs in a memory image, wraps the design in a testbench module named
    'test' and runs the simulator.

    Args:
        a_shape: Shape of placeholder 'a'.
        b_shape: Shape of placeholder 'b'.
        bias_shape: Shape of placeholder 'bias'; no bias is used when None.
        scale_shape: Shape of placeholder 'scale'; no scale is used when
            None.
        a_dtype, b_dtype, bias_dtype, scale_dtype: Data types of the
            corresponding operands.
        c_dtype: Data type forwarded to ``ng.matmul`` and used when reading
            the result back.
        rshift_mul, rshift_sum, rshift_out, act_func, par_left_col,
            par_left_row, par_out_col, concur_out_col, stationary,
            left_ram_size, right_ram_size, bias_ram_size, scale_ram_size,
            out_ram_size: Forwarded positionally to ``ng.matmul`` and
            ``ng.verify.matmul``. ``par_left_col`` and ``par_out_col`` are
            also lower bounds for the last argument of ``axi.set_memory``.
        axi_datawidth: Used as the 'maxi_datawidth' config entry and as the
            ``datawidth`` of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: When not None, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Passed to ``sim.run``. When None it defaults to this
            file's base name plus '.out'. The memory image name is derived
            from it.

    Returns:
        The text returned by ``sim.run``. For ``simtype == 'verilator'`` a
        final line starting with '-' is removed.
    """
    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    if bias_shape is not None:
        bias = ng.placeholder(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.placeholder(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    transposed_a = False
    transposed_b = True

    # Arguments are positional; keep this order in sync with ng.matmul.
    c = ng.matmul(a, b,
                  bias, scale,
                  transposed_a, transposed_b,
                  rshift_mul, rshift_sum, rshift_out,
                  act_func,
                  c_dtype, ng.int32, ng.int32,
                  'matmul',
                  par_left_col, par_left_row, par_out_col,
                  concur_out_col, stationary,
                  left_ram_size, right_ram_size,
                  bias_ram_size, scale_ram_size,
                  None, None, None,
                  out_ram_size)
    c = ng.relu(c)

    targ = ng.to_veriloggen([c], 'matrix_matmul_relu', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [5] - [3]

    if bias is not None:
        vbias = np.arange(bias.length,
                          dtype=np.int64).reshape(bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length,
                           dtype=np.int64).reshape(scale.shape) % [6]
    else:
        vscale = None

    # The reference model must see the numpy stimulus (vbias/vscale) that is
    # written to memory below, not the graph placeholders bias/scale.
    vc = ng.verify.matmul(va, vb,
                          vbias, vscale,
                          transposed_a, transposed_b,
                          rshift_mul, rshift_sum, rshift_out,
                          act_func,
                          c_dtype, ng.int32, ng.int32,
                          'matmul',
                          par_left_col, par_left_row, par_out_col,
                          concur_out_col, stationary,
                          left_ram_size, right_ram_size,
                          bias_ram_size, scale_ram_size,
                          None, None, None,
                          out_ram_size,
                          False,
                          a_dtype, b_dtype,
                          bias_dtype, scale_dtype)
    vc = ng.verify.relu(vc)

    # to memory image
    # Round the largest buffer up to a multiple of 4096 and put the expected
    # values right behind the highest buffer address.
    size_max = int(math.ceil(
        max(a.memory_size, b.memory_size,
            bias.memory_size if bias is not None else 0,
            scale.memory_size if scale is not None else 0,
            c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    # NOTE(review): this divides by the word width in bits, giving 262144
    # words; other drivers of this kind divide by (memimg_datawidth // 8).
    # Left unchanged -- confirm which image size is intended.
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # Pre-fill every word with 100 so untouched memory is not zero.
    mem = mem + [100]

    # NOTE(review): the last argument is presumably the word alignment used
    # by axi.set_memory -- confirm against its signature.
    axi.set_memory(
        mem, va, memimg_datawidth,
        a_dtype.width, a.addr,
        max(int(math.ceil(axi_datawidth / a_dtype.width)), par_left_col))

    axi.set_memory(
        mem, vb, memimg_datawidth,
        b_dtype.width, b.addr,
        max(int(math.ceil(axi_datawidth / b_dtype.width)), par_left_col))

    if bias is not None:
        axi.set_memory(
            mem, vbias, memimg_datawidth,
            bias_dtype.width, bias.addr,
            max(int(math.ceil(axi_datawidth / bias_dtype.width)),
                par_out_col))

    if scale is not None:
        axi.set_memory(
            mem, vscale, memimg_datawidth,
            scale_dtype.width, scale.addr,
            max(int(math.ceil(axi_datawidth / scale_dtype.width)),
                par_out_col))

    axi.set_memory(
        mem, vc, memimg_datawidth,
        c_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / c_dtype.width)), par_out_col))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE(review): ctrl is handed to vthread.Thread below, which presumably
    # synthesises it from its source -- keep its statements and local names
    # unchanged unless a different generated testbench is intended.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(c.shape[0]):
            for j in range(c.shape[1]):
                orig = memory.read_word(i * c.aligned_shape[1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print(i, j, orig, check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # Enable the next line to dump waveforms of the design under test.
    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Safety stop in case the controller thread never reaches finish().
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator printed nothing; guard the index so
    # that case returns the empty output instead of raising IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(n_input=784, n_classes=10):
    """Assemble the simulation testbench for a three-layer MLP design.

    Layers, in order: matmul -> relu, matmul -> relu, matmul, each with
    ``transposed_b=True``.  The graph is converted with ``ng.to_veriloggen``
    under the name 'mlp' and wrapped in a module 'test' with an AXI memory
    model, an AXI-Lite master, a cycle counter and a controller thread that
    starts the design and prints the elapsed cycle count.

    Args:
        n_input: Length of the input vector and both dimensions of the
            weight matrices 'h1' and 'h2'.
        n_classes: Row count of the last weight matrix 'out'.

    Returns:
        The testbench ``Module``.
    """
    # --- target hardware -------------------------------------------------
    features = ng.placeholder(ng.int32, shape=[n_input])

    hidden_shape = (n_input, n_input)
    weight1 = ng.variable(ng.int32, shape=hidden_shape, name='h1')
    weight2 = ng.variable(ng.int32, shape=hidden_shape, name='h2')
    weight3 = ng.variable(ng.int32, shape=(n_classes, n_input), name='out')

    hidden1 = ng.relu(ng.matmul(features, weight1, transposed_b=True))
    hidden2 = ng.relu(ng.matmul(hidden1, weight2, transposed_b=True))
    logits = ng.matmul(hidden2, weight3, transposed_b=True)

    targ = ng.to_veriloggen([logits], 'mlp')
    # Alternative kept from the original: ng.to_ipxact([logits], 'mlp')

    # --- testbench -------------------------------------------------------
    tb = Module('test')
    tb.copy_params(targ)
    sim_ports = tb.copy_sim_ports(targ)
    clock, reset_n = sim_ports['CLK'], sim_ports['RESETN']
    reset = tb.Wire('RST')
    reset.assign(Not(reset_n))

    # AXI memory model
    axi_mem = axi.AxiMemoryModel(tb, 'memory', clock, reset)
    axi_mem.connect(sim_ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(tb, '_saxi', clock, reset, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # 32-bit counter incremented by the 'seq' sequencer
    time_counter = tb.Reg('time_counter', 32, initval=0)
    Seq(tb, 'seq', clock, reset)(time_counter.inc())

    # NOTE(review): ctrl is handed to vthread.Thread, which presumably
    # synthesises it from its source; its body and the names it reads are
    # therefore left exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    vthread.Thread(tb, 'th_ctrl', clock, reset, ctrl).start()

    tb.Instance(targ, 'uut',
                params=tb.connect_params(targ),
                ports=tb.connect_ports(targ))

    simulation.setup_clock(tb, clock, hperiod=5)
    reset_seq = simulation.setup_reset(tb, reset_n, tb.make_reset(),
                                       period=100, polarity='low')
    # Stop the simulation after the delay even if ctrl never finishes.
    reset_seq.add(Delay(1000000), Systask('finish'))

    return tb