Пример #1
0
               scale=s1,
               act_func=ng.relu,
               dtype=act_dtype,
               sum_dtype=ng.int32)

a1r = ng.reshape(a1, [batchsize, -1])

# layer 2: full-connection, relu
w2 = ng.variable(weight_dtype, shape=(256, a1r.shape[-1]), name='w2')
b2 = ng.variable(bias_dtype, shape=(w2.shape[0], ), name='b2')
s2 = ng.variable(scale_dtype, shape=(w2.shape[0], ), name='s2')

a2 = ng.matmul(a1r,
               w2,
               bias=b2,
               scale=s2,
               transposed_b=True,
               act_func=ng.relu,
               dtype=act_dtype,
               sum_dtype=ng.int32)

# layer 3: full-connection, relu
w3 = ng.variable(weight_dtype, shape=(10, a2.shape[-1]), name='w3')
b3 = ng.variable(bias_dtype, shape=(w3.shape[0], ), name='b3')
s3 = ng.variable(scale_dtype, shape=(w3.shape[0], ), name='s3')

# output
output_layer = ng.matmul(a2,
                         w3,
                         bias=b3,
                         scale=s3,
                         transposed_b=True,
Пример #2
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        bias_shape=None,
        scale_shape=None,
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        bias_dtype=ng.int32,
        scale_dtype=ng.int32,
        c_dtype=ng.int32,
        rshift_mul=None,
        rshift_sum=None,
        rshift_out=None,
        act_func=None,
        par_left_col=1,
        par_left_row=1,
        par_out_col=1,
        concur_out_col=None,
        stationary='right',
        left_ram_size=None,
        right_ram_size=None,
        bias_ram_size=None,
        scale_ram_size=None,
        out_ram_size=None,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Generate, simulate and self-check a single NNgen ``matmul`` operator.

    The function builds ``c = matmul(a, b)`` (with ``b`` transposed and an
    optional bias / scale), converts it to a Veriloggen module, computes the
    expected result in software with ``ng.verify.matmul``, places inputs and
    the expected output into a memory image, and runs an RTL simulation whose
    control thread compares the hardware result against the expected one.

    Args:
        a_shape, b_shape: Shapes of the two operand placeholders.
        bias_shape, scale_shape: Shapes of the optional bias / scale
            placeholders; ``None`` disables the respective operand.
        a_dtype, b_dtype, bias_dtype, scale_dtype, c_dtype: NNgen dtypes of
            the operands and of the result.
        rshift_mul, rshift_sum, rshift_out, act_func: Forwarded unchanged to
            both ``ng.matmul`` and ``ng.verify.matmul``.
        par_left_col, par_left_row, par_out_col, concur_out_col, stationary,
        left_ram_size, right_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Hardware attributes forwarded unchanged to both
            ``ng.matmul`` and ``ng.verify.matmul``.
        axi_datawidth: Passed as ``maxi_datawidth`` in the generator config
            and as the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the testbench is written there as Verilog.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to ``<this file>.out``.

    Returns:
        The simulator's text output. For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is removed.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    if bias_shape is not None:
        bias = ng.placeholder(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.placeholder(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    transposed_a = False
    transposed_b = True

    c = ng.matmul(a, b, bias, scale, transposed_a, transposed_b, rshift_mul,
                  rshift_sum, rshift_out, act_func, c_dtype, ng.int32,
                  ng.int32, 'matmul', par_left_col, par_left_row, par_out_col,
                  concur_out_col, stationary, left_ram_size, right_ram_size,
                  bias_ram_size, scale_ram_size, None, None, None,
                  out_ram_size)

    targ = ng.to_veriloggen([c],
                            'matrix_matmul',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data (small deterministic patterns)
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % 5
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % 5 - 3

    if bias is not None:
        vbias = np.arange(bias.length, dtype=np.int64).reshape(
            bias.shape) % 4
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length, dtype=np.int64).reshape(
            scale.shape) % 6
    else:
        vscale = None

    # The software reference must be fed the numeric bias / scale arrays
    # (the same ones written to the memory image below), not the NNgen
    # placeholder objects.
    # NOTE(review): the positional ``False`` after out_ram_size is kept as in
    # the original call; its meaning is not visible from this file.
    vc = ng.verify.matmul(va, vb, vbias, vscale, transposed_a, transposed_b,
                          rshift_mul, rshift_sum, rshift_out, act_func,
                          c_dtype, ng.int32, ng.int32, 'matmul', par_left_col,
                          par_left_row, par_out_col, concur_out_col,
                          stationary, left_ram_size, right_ram_size,
                          bias_ram_size, scale_ram_size, None, None, None,
                          out_ram_size, False, a_dtype, b_dtype, bias_dtype,
                          scale_dtype)

    # to memory image
    bias_size = bias.memory_size if bias is not None else 0
    scale_size = scale.memory_size if scale is not None else 0
    # largest tensor footprint, rounded up to a multiple of 4096
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, bias_size, scale_size,
                c.memory_size) / 4096)) * 4096

    bias_addr = bias.addr if bias is not None else -1
    scale_addr = scale.addr if scale is not None else -1
    # expected output goes one size_max past the highest tensor address
    check_addr = max(a.addr, b.addr, bias_addr, scale_addr,
                     c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    # every word starts as 100 so untouched memory is distinguishable from 0
    mem = np.full([1024 * 1024 * 8 // memimg_datawidth], 100, dtype=np.int64)

    axi.set_memory(
        mem, va, memimg_datawidth, a_dtype.width, a.addr,
        max(int(math.ceil(axi_datawidth / a_dtype.width)), par_left_col))

    axi.set_memory(
        mem, vb, memimg_datawidth, b_dtype.width, b.addr,
        max(int(math.ceil(axi_datawidth / b_dtype.width)), par_left_col))

    if bias is not None:
        axi.set_memory(
            mem, vbias, memimg_datawidth, bias_dtype.width, bias.addr,
            max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_out_col))

    if scale is not None:
        axi.set_memory(
            mem, vscale, memimg_datawidth, scale_dtype.width, scale.addr,
            max(int(math.ceil(axi_datawidth / scale_dtype.width)),
                par_out_col))

    axi.set_memory(
        mem, vc, memimg_datawidth, c_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / c_dtype.width)), par_out_col))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control thread handed to vthread.Thread below; its statements are left
    # exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(c.shape[0]):
            for j in range(c.shape[1]):
                orig = memory.read_word(i * c.aligned_shape[1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print(i, j, orig, check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # hard stop in case the control thread never reaches vthread.finish()
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop verilator's trailing '-' line; tolerate empty simulator output
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Пример #3
0
def mkTest(ich=3, och=10, ch=64, ksize=3, stride=1, col=28, row=28):
    """Assemble a simulation testbench module around a small int32 CNN.

    The dataflow is conv2d -> serial max-pool -> relu -> conv2d -> relu ->
    flatten -> matmul -> relu -> matmul. It is converted with
    ``ng.to_veriloggen`` and instantiated as ``uut`` inside a ``test`` module
    together with an AXI memory model, an AXI-Lite controller, a cycle
    counter and a control thread that starts the design, waits for it and
    prints the elapsed cycle count.

    Args:
        ich: Channel count of the input placeholder.
        och: Row count of the final matmul weight.
        ch: Output channel count of both convolutions.
        ksize: Kernel height and width of both convolutions.
        stride: Spatial stride of both convolutions.
        col, row: Width and height of the input placeholder.

    Returns:
        The testbench ``Module``.
    """
    conv_strides = (1, stride, stride, 1)
    pool_window = (1, 2, 2, 1)

    # layer 0: conv2d, max_pool_serial, relu
    image = ng.placeholder(ng.int32,
                           shape=(1, row, col, ich),
                           name='input_layer')
    kernel0 = ng.variable(ng.int32, shape=(ch, ksize, ksize, ich), name='w0')
    feat0 = ng.conv2d(image, kernel0, strides=conv_strides)
    feat0 = ng.max_pool_serial(feat0, ksize=pool_window, strides=pool_window)
    feat0 = ng.relu(feat0)

    # layer 1: conv2d, relu, flatten
    kernel1 = ng.variable(ng.int32,
                          shape=(ch, ksize, ksize, feat0.shape[-1]),
                          name='w1')
    feat1 = ng.conv2d(feat0, kernel1, strides=conv_strides)
    feat1 = ng.relu(feat1)
    flat = ng.reshape(feat1, [-1])

    # layer 2: full-connection followed by relu
    fc2_weight = ng.variable(ng.int32, shape=(16, flat.shape[-1]), name='w2')
    hidden = ng.relu(ng.matmul(flat, fc2_weight, transposed_b=True))

    # layer 3: full-connection
    fc3_weight = ng.variable(ng.int32,
                             shape=(och, hidden.shape[-1]),
                             name='w3')
    result = ng.matmul(hidden,
                       fc3_weight,
                       transposed_b=True,
                       name='output_layer')

    # hardware under test (ng.to_ipxact([...], 'cnn') is the IP-XACT variant)
    dut = ng.to_veriloggen([result], 'cnn')

    # testbench shell: mirror the DUT's parameters and simulation ports
    tb = Module('test')
    tb.copy_params(dut)
    sim_ports = tb.copy_sim_ports(dut)
    clock = sim_ports['CLK']
    reset_n = sim_ports['RESETN']
    reset = tb.Wire('RST')
    reset.assign(Not(reset_n))

    # AXI memory model on the master interface
    axi_mem = axi.AxiMemoryModel(tb, 'memory', clock, reset, mem_addrwidth=23)
    axi_mem.connect(sim_ports, 'maxi')

    # AXI-Lite controller on the slave interface
    _saxi = vthread.AXIMLite(tb, '_saxi', clock, reset, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # free-running cycle counter
    time_counter = tb.Reg('time_counter', 32, initval=0)
    counter_seq = Seq(tb, 'seq', clock, reset)
    counter_seq(time_counter.inc())

    # Control thread handed to vthread.Thread; statements kept as-is.
    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    vthread.Thread(tb, 'th_ctrl', clock, reset, ctrl).start()

    tb.Instance(dut,
                'uut',
                params=tb.connect_params(dut),
                ports=tb.connect_ports(dut))

    # clock / reset generation, plus a hard stop after a fixed delay
    simulation.setup_clock(tb, clock, hperiod=5)
    reset_init = simulation.setup_reset(tb,
                                        reset_n,
                                        tb.make_reset(),
                                        period=100,
                                        polarity='low')
    reset_init.add(
        Delay(10000000),
        Systask('finish'),
    )

    return tb
Пример #4
0
def run(
    act_dtype=ng.int8,
    weight_dtype=ng.int8,
    bias_dtype=ng.int32,
    scale_dtype=ng.int8,
    par_ich=2,
    par_och=2,
    chunk_size=64,
    axi_datawidth=32,
    silent=False,
    weight_filename='cnn.npy',
    verilog_filename=None,
    sim_filename=None,
    # simtype='iverilog',
    simtype='verilator',
    # simtype=None,  # no RTL simulation
):
    """Run the full NNgen flow for a small quantized CNN and simulate it.

    Steps: build the dataflow, assign random weights, quantize, set hardware
    attributes, evaluate the model in software, generate IP-XACT / Veriloggen
    hardware, export the parameters to ``weight_filename``, and finally run an
    RTL simulation that compares the hardware output with the software one.

    Args:
        act_dtype, weight_dtype, bias_dtype, scale_dtype: NNgen dtypes used
            for activations, weights, biases and scales.
        par_ich, par_och: Parallelism attributes applied to every conv2d and
            matmul; ``par_och`` is also used as ``par`` of the max-pool.
        chunk_size: Alignment used for ``ng.export_ndarray`` and for the
            address layout of the memory image. (Previously this argument
            was silently overwritten with 64 inside the function.)
        axi_datawidth: Passed as ``maxi_datawidth`` in the generator config
            and as the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_ipxact``.
        weight_filename: Destination of ``np.save`` for the exported
            parameters.
        verilog_filename: If not ``None``, the testbench is written there.
        sim_filename: Simulator output file; defaults to ``<this file>.out``.
        simtype: Simulator name; ``None`` terminates the process via
            ``sys.exit()`` after the parameters have been saved.

    Returns:
        The simulator's text output. For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is removed.
    """

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # input
    input_layer = ng.placeholder(
        dtype=act_dtype,
        shape=(1, 32, 32, 3),  # N, H, W, C
        name='input_layer')

    # layer 0: conv2d (with bias and scale (= batchnorm)), relu, max_pool
    w0 = ng.variable(
        dtype=weight_dtype,
        shape=(64, 3, 3, 3),  # Och, Ky, Kx, Ich
        name='w0')
    b0 = ng.variable(dtype=bias_dtype, shape=(w0.shape[0], ), name='b0')
    s0 = ng.variable(dtype=scale_dtype, shape=(w0.shape[0], ), name='s0')

    a0 = ng.conv2d(input_layer,
                   w0,
                   strides=(1, 1, 1, 1),
                   bias=b0,
                   scale=s0,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a0p = ng.max_pool_serial(a0, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1))

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(weight_dtype, shape=(64, 3, 3, a0.shape[-1]), name='w1')
    b1 = ng.variable(bias_dtype, shape=(w1.shape[0], ), name='b1')
    s1 = ng.variable(scale_dtype, shape=(w1.shape[0], ), name='s1')

    a1 = ng.conv2d(a0p,
                   w1,
                   strides=(1, 1, 1, 1),
                   bias=b1,
                   scale=s1,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a1r = ng.reshape(a1, [1, -1])

    # layer 2: full-connection, relu
    w2 = ng.variable(weight_dtype, shape=(256, a1r.shape[-1]), name='w2')
    b2 = ng.variable(bias_dtype, shape=(w2.shape[0], ), name='b2')
    s2 = ng.variable(scale_dtype, shape=(w2.shape[0], ), name='s2')

    a2 = ng.matmul(a1r,
                   w2,
                   bias=b2,
                   scale=s2,
                   transposed_b=True,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    # layer 3: full-connection (no activation function)
    w3 = ng.variable(weight_dtype, shape=(10, a2.shape[-1]), name='w3')
    b3 = ng.variable(bias_dtype, shape=(w3.shape[0], ), name='b3')
    s3 = ng.variable(scale_dtype, shape=(w3.shape[0], ), name='s3')

    # output
    output_layer = ng.matmul(a2,
                             w3,
                             bias=b3,
                             scale=s3,
                             transposed_b=True,
                             name='output_layer',
                             dtype=act_dtype,
                             sum_dtype=ng.int32)

    # --------------------
    # (2) Assign weights to the NNgen operators
    # --------------------

    # In this example, random floating-point values are assigned.
    # In a real case, you should assign actual weight values
    # obtained by training on a DNN framework.

    # If you don't use NNgen's quantizer, you can assign integer weights to
    # each tensor.

    # Weights and biases are clipped standard-normal samples; scales are all
    # ones. The draw order (w0, b0, w1, b1, ...) matches the per-layer order
    # so a seeded run produces the same values as before.
    for weight, bias, scale in ((w0, b0, s0), (w1, b1, s1), (w2, b2, s2),
                                (w3, b3, s3)):
        weight_value = np.random.normal(size=weight.length).reshape(
            weight.shape)
        weight.set_value(np.clip(weight_value, -3.0, 3.0))

        bias_value = np.random.normal(size=bias.length).reshape(bias.shape)
        bias.set_value(np.clip(bias_value, -3.0, 3.0))

        scale.set_value(np.ones(scale.shape))

    # Quantizing the floating-point weights by the NNgen quantizer.
    # Alternatively, you can assign integer weights by yourself to each tensor.

    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'input_layer': act_scale_factor}
    input_means = {'input_layer': imagenet_mean * act_scale_factor}
    input_stds = {'input_layer': imagenet_std * act_scale_factor}

    ng.quantize([output_layer], input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    # conv2d, matmul
    # par_ich: parallelism in input-channel
    # par_och: parallelism in output-channel
    # par_col: parallelism in pixel column
    # par_row: parallelism in pixel row

    for operator in (a0, a1, a2, output_layer):
        operator.attribute(par_ich=par_ich, par_och=par_och)

    # cshamt_out: right shift amount after applying bias/scale
    # If you assign integer weights by yourself to each tensor,
    # cshamt (constant shift amount) must be assigned to each operator,
    # e.g. a0.attribute(cshamt_out=weight_dtype.width + 1).

    # max_pool
    # par: parallelism in in/out channel

    par = par_och

    a0p.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    # In this example, random integer values are assigned.
    # In a real case, you should assign actual integer activation values,
    # such as an image.

    input_layer_value = np.random.normal(size=input_layer.length).reshape(
        input_layer.shape)
    input_layer_value = input_layer_value * imagenet_std + imagenet_mean
    input_layer_value = np.clip(input_layer_value, -5.0, 5.0)
    input_layer_value = input_layer_value * act_scale_factor
    # Clamp to the signed range of act_dtype: [-2^(w-1), 2^(w-1) - 1].
    # (The previous bounds were one too wide on both sides.)
    input_layer_value = np.clip(input_layer_value,
                                -1 * 2**(act_dtype.width - 1),
                                2**(act_dtype.width - 1) - 1)
    input_layer_value = np.round(input_layer_value).astype(np.int64)

    eval_outs = ng.eval([output_layer], input_layer=input_layer_value)
    output_layer_value = eval_outs[0]

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([output_layer], 'cnn', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([output_layer],
                        'cnn',
                        silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([output_layer], 'cnn', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Save the quantized weights
    # --------------------

    # convert weight values to a memory image:
    # on a real FPGA platform, this image will be used as a part of the model definition.

    # chunk_size comes from the caller; it is no longer overwritten here.
    param_data = ng.export_ndarray([output_layer], chunk_size)
    np.save(weight_filename, param_data)

    # --------------------
    # (7) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        # No RTL simulation requested: terminate, as the original flow does.
        sys.exit()

    param_bytes = len(param_data)

    # Lay out input, parameters, expected output and temporary area one
    # after another, each start address rounded up to a chunk_size multiple.
    variable_addr = int(
        math.ceil((input_layer.addr + input_layer.memory_size) /
                  chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(
        math.ceil(
            (check_addr + output_layer.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # Every word starts as 100. np.full avoids the second full-size
    # temporary that ``zeros + [100]`` would allocate for this large image.
    mem = np.full([1024 * 1024 * 256 // (memimg_datawidth // 8)],
                  100,
                  dtype=np.int64)

    # placeholder
    axi.set_memory(
        mem, input_layer_value, memimg_datawidth, act_dtype.width,
        input_layer.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, output_layer_value, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if sim_filename is None:
        sim_filename = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + sim_filename

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control thread handed to vthread.Thread below; its statements are left
    # exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(output_layer.shape[0]):
            for x in range(output_layer.shape[1]):
                orig = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x, output_layer.addr,
                    act_dtype.width)
                check = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x, check_addr,
                    act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x, ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # hard stop in case the control thread never reaches vthread.finish()
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if verilog_filename is not None:
        m.to_verilog(verilog_filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=sim_filename)
    lines = rslt.splitlines()
    # drop verilator's trailing '-' line; tolerate empty simulator output
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Пример #5
0
def mkTest(n_input=784, n_classes=10):
    """Assemble a simulation testbench module around a three-layer int32 MLP.

    The dataflow is matmul -> relu -> matmul -> relu -> matmul, with every
    weight used transposed. It is converted with ``ng.to_veriloggen`` and
    instantiated as ``uut`` inside a ``test`` module together with an AXI
    memory model, an AXI-Lite controller, a cycle counter and a control
    thread that starts the design, waits for it and prints the elapsed cycle
    count.

    Args:
        n_input: Length of the input vector and size of both hidden layers.
        n_classes: Row count of the output-layer weight.

    Returns:
        The testbench ``Module``.
    """
    # input vector and the three weight matrices (all created up front)
    features = ng.placeholder(ng.int32, shape=[n_input])

    hidden1_weight = ng.variable(ng.int32,
                                 shape=(n_input, n_input),
                                 name='h1')
    hidden2_weight = ng.variable(ng.int32,
                                 shape=(n_input, n_input),
                                 name='h2')
    output_weight = ng.variable(ng.int32,
                                shape=(n_classes, n_input),
                                name='out')

    # two hidden layers with relu, then a linear output layer
    hidden1 = ng.relu(ng.matmul(features, hidden1_weight, transposed_b=True))
    hidden2 = ng.relu(ng.matmul(hidden1, hidden2_weight, transposed_b=True))
    logits = ng.matmul(hidden2, output_weight, transposed_b=True)

    # hardware under test (ng.to_ipxact is the IP-XACT alternative)
    dut = ng.to_veriloggen([logits], 'mlp')

    # testbench shell: mirror the DUT's parameters and simulation ports
    tb = Module('test')
    tb.copy_params(dut)
    sim_ports = tb.copy_sim_ports(dut)
    clock = sim_ports['CLK']
    reset_n = sim_ports['RESETN']
    reset = tb.Wire('RST')
    reset.assign(Not(reset_n))

    # AXI memory model on the master interface
    axi_mem = axi.AxiMemoryModel(tb, 'memory', clock, reset)
    axi_mem.connect(sim_ports, 'maxi')

    # AXI-Lite controller on the slave interface
    _saxi = vthread.AXIMLite(tb, '_saxi', clock, reset, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # free-running cycle counter
    time_counter = tb.Reg('time_counter', 32, initval=0)
    counter_seq = Seq(tb, 'seq', clock, reset)
    counter_seq(time_counter.inc())

    # Control thread handed to vthread.Thread; statements kept as-is.
    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    vthread.Thread(tb, 'th_ctrl', clock, reset, ctrl).start()

    tb.Instance(dut,
                'uut',
                params=tb.connect_params(dut),
                ports=tb.connect_ports(dut))

    # clock / reset generation, plus a hard stop after a fixed delay
    simulation.setup_clock(tb, clock, hperiod=5)
    reset_init = simulation.setup_reset(tb,
                                        reset_n,
                                        tb.make_reset(),
                                        period=100,
                                        polarity='low')
    reset_init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return tb