Пример #1
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    x = ng.add(a,
               ng.full_imm((1, ), 10, dtype=c_dtype, par=par),
               dtype=c_dtype,
               par=par)
    y = ng.add(a,
               ng.full_imm((1, ), 20, dtype=c_dtype, par=par),
               dtype=c_dtype,
               par=par)
    z0 = ng.add(x, y, dtype=c_dtype, par=par)
    z0.output_chainable = False
    x = ng.add(b,
               ng.full_imm((1, ), 100, dtype=c_dtype, par=par),
               dtype=c_dtype,
               par=par)
    y = ng.add(b,
               ng.full_imm((1, ), 200, dtype=c_dtype, par=par),
               dtype=c_dtype,
               par=par)
    z1 = ng.add(x, y, dtype=c_dtype, par=par)
    z1.output_chainable = False
    c = ng.add(z0, z1, dtype=c_dtype, par=par)

    targ = ng.to_veriloggen([c],
                            'matrix_full',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Пример #2
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        interrupt_name='irq',
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    d = ng.add(a, b, dtype=c_dtype, par=par)
    e = ng.add(b, a, dtype=c_dtype, par=par)

    # SW returns ng.add(x, y)
    f = ng.extern([d, e], shape=a_shape, opcode=0x1, func=lambda x, y: x + y)
    g = ng.sub(f, a)

    # SW returns d as-is
    h = ng.extern([g], shape=a_shape, opcode=0x2, func=lambda x: x)
    c = ng.sub(h, b)

    targ = ng.to_veriloggen([c],
                            'matrix_extern',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'interrupt_name': interrupt_name
                            })

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [16]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [32] + [16]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    irq = ports[interrupt_name]
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg_datawidth=memimg_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def irq_join(saxi, irq_bit):
        while irq == 0:
            pass
        araddr = ng.control_reg_interrupt_isr * 4
        irq_stat = saxi.read(araddr)

        if irq_stat != irq_bit:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()

        print('# irq stat = %d' % irq_stat)
        awaddr = ng.control_reg_interrupt_iar * 4
        saxi.write(awaddr, irq_bit)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        araddr_ext_snd = ng.control_reg_extern_send * 4
        awaddr_ext_rcv = ng.control_reg_extern_recv * 4
        awaddr_irq_ier = ng.control_reg_interrupt_ier * 4
        araddr_irq_isr = ng.control_reg_interrupt_isr * 4
        awaddr_irq_iar = ng.control_reg_interrupt_iar * 4
        _saxi.write(awaddr_irq_ier, 3)  # irq enable

        ng.sim.sw_rst(_saxi)

        print('# 0st software reset (during idle)')

        for i in range(100):
            pass

        irq_stat = _saxi.read(araddr_irq_isr)
        if irq_stat != 0:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()
        print('# irq stat = %d' %
              irq_stat)  # no irq busy by software reset when idle

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 1st test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software reset
        ng.sim.sw_rst(_saxi)

        print('# 1st software reset (before resume)')

        # from extern-send
        irq_join(_saxi, 1)

        # restart
        ng.sim.start(_saxi)

        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send

        irq_join(_saxi, 1)
        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok_1st = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_1st = False
                # else:
                #    print('OK', i, j, orig, check)

        # 2nd test

        # start
        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 2nd test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        while (memory.waddr.awvalid) == 0:
            pass

        ng.sim.sw_rst(_saxi)

        print('# 2nd software reset (during Master AXI transaction)')

        irq_join(_saxi, 1)  # irq busy by software reset

        # restart
        ng.sim.start(_saxi)

        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        araddr = ng.control_reg_extern_send * 4
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # termination
        irq_join(_saxi, 1)
        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok_2nd = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_2nd = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok_1st and ok_2nd:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Пример #3
0
def run(act_shape=(1, 7, 7, 15),
        act_dtype=ng.int32,
        out_dtype=ng.int32,
        factors=(1, 2, 2, 1),
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    tmp = ng.add(act, act, dtype=act_dtype, par=par)
    out = ng.upsampling2d(tmp, factors=factors, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out],
                            'matrix_add_upsampling2d',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape)

    vtmp = ng.verify.add(vact, vact, par=par, dtype=act_dtype)
    vout = ng.verify.upsampling2d(vtmp, factors=factors, dtype=out_dtype)

    # to memory image
    size_max = int(math.ceil(
        max(act.memory_size, out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth, act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))
    axi.set_memory(mem, vout, memimg_datawidth, out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, out.addr,
                            out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, check_addr,
                            out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch, ') orig: ', orig,
                                  ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Пример #4
0
def run(a_shape=(15, 15), b_shape=(15, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    d = ng.add(a, b, dtype=c_dtype, par=par)
    e = ng.add(b, a, dtype=c_dtype, par=par)

    # SW returns ng.add(x, y)
    f = ng.extern([d, e], shape=a_shape, opcode=0x1)
    g = ng.sub(f, a)

    # SW returns d as-is
    h = ng.extern([g], shape=a_shape, opcode=0x2)
    c = ng.sub(h, b)

    targ = ng.to_veriloggen([c], 'matrix_extern', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [16]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [32] + [16]

    vd = ng.verify.add(va, vb, dtype=c_dtype, par=par,
                       x_dtype=a_dtype, y_dtype=b_dtype)
    ve = ng.verify.add(vb, va, dtype=c_dtype, par=par,
                       x_dtype=b_dtype, y_dtype=a_dtype)

    vf = ng.verify.extern([vd, ve], shape=a_shape, opcode=0x1,
                          func=lambda x, y: x + y)
    vg = ng.verify.sub(vf, va,
                       x_dtype=c_dtype, y_dtype=c_dtype)

    vh = ng.verify.extern([vg], shape=a_shape, opcode=0x2,
                          func=lambda x: x)
    vc = ng.verify.sub(vh, vb,
                       x_dtype=c_dtype, y_dtype=c_dtype)

    # to memory image
    size_max = int(math.ceil(max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg_datawidth=memimg_datawidth,
                                memimg=mem, memimg_name=memimg_name)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        # from extern-send
        araddr = ng.control_reg_extern_send * 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        awaddr = ng.control_reg_extern_recv * 4
        _saxi.write(awaddr, 1)

        # from extern-send
        araddr = ng.control_reg_extern_send * 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        awaddr = ng.control_reg_extern_recv * 4
        _saxi.write(awaddr, 1)

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt