def run(act_shape=(1, 7, 7, 15), weight_shape=(7, 3, 3, 15),
        bias_shape=None, scale_shape=None,
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        conv2d_stride=(1, 1, 1, 1),
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        input_ram_size=None, filter_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        ksize=(1, 2, 2, 1), pool_stride=(1, 2, 2, 1), par=1,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    weight = ng.variable(weight_dtype, shape=weight_shape, name='weight')

    if bias_shape is not None:
        bias = ng.variable(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.variable(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    tmp = ng.conv2d(act, weight, conv2d_stride,
                    bias, scale,
                    rshift_mul, rshift_sum, rshift_out,
                    act_func, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d',
                    par_ich, par_och, par_col, par_row,
                    concur_och, stationary,
                    input_ram_size, filter_ram_size,
                    bias_ram_size, scale_ram_size,
                    None, None, None,
                    out_ram_size)

    out = ng.avg_pool(tmp, ksize=ksize,
                      strides=pool_stride,
                      sum_dtype=ng.int32, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]
    vweight = np.arange(weight.length,
                        dtype=np.int64).reshape(weight.shape) % [32] - [16]

    if bias is not None:
        vbias = np.arange(bias.length,
                          dtype=np.int64).reshape(bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length,
                           dtype=np.int64).reshape(scale.shape) % [6]
    else:
        vscale = None

    eval_outs = ng.eval([out], act=vact, weight=vweight, bias=vbias, scale=vscale)
    vout = eval_outs[0]

    # to memory image
    size_max = int(math.ceil(max(act.memory_size, weight.memory_size,
                                 bias.memory_size if bias is not None else 0,
                                 scale.memory_size if scale is not None else 0,
                                 out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, weight.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    axi.set_memory(mem, vweight, memimg_datawidth,
                   weight_dtype.width, weight.addr,
                   max(int(math.ceil(axi_datawidth / weight_dtype.width)), par_ich))

    if bias is not None:
        axi.set_memory(mem, vbias, memimg_datawidth,
                       bias_dtype.width, bias.addr,
                       max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_och))

    if scale is not None:
        axi.set_memory(mem, vscale, memimg_datawidth,
                       scale_dtype.width, scale.addr,
                       max(int(math.ceil(axi_datawidth / scale_dtype.width)), par_och))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                + x * out.aligned_shape[3] + ch,
                                                out.addr, out_dtype.width)
                        check = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + x * out.aligned_shape[3] + ch,
                                                 check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
示例#2
0
# data types
act_dtype = ng.int8
weight_dtype = ng.int8
bias_dtype = ng.int32
scale_dtype = ng.int8
batchsize = 1

# input
input_layer = ng.placeholder(
    dtype=act_dtype,
    shape=(batchsize, 32, 32, 3),  # N, H, W, C
    name='input_layer')

# layer 0: conv2d (with bias and scale (= batchnorm)), relu, max_pool
w0 = ng.variable(
    dtype=weight_dtype,
    shape=(64, 3, 3, 3),  # Och, Ky, Kx, Ich
    name='w0')
b0 = ng.variable(dtype=bias_dtype, shape=(w0.shape[0], ), name='b0')
s0 = ng.variable(dtype=scale_dtype, shape=(w0.shape[0], ), name='s0')

a0 = ng.conv2d(input_layer,
               w0,
               strides=(1, 1, 1, 1),
               bias=b0,
               scale=s0,
               act_func=ng.relu,
               dtype=act_dtype,
               sum_dtype=ng.int32)

a0p = ng.max_pool_serial(a0, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1))
def run(act_shape=(1, 7, 7, 15),
        weight1_shape=(7, 3, 3, 15), bias1_shape=None, scale1_shape=None,
        weight2_shape=(9, 3, 3, 7), bias2_shape=None, scale2_shape=None,
        act_dtype=ng.int32,
        weight1_dtype=ng.int32, bias1_dtype=ng.int32, scale1_dtype=ng.int32,
        weight2_dtype=ng.int32, bias2_dtype=ng.int32, scale2_dtype=ng.int32,
        tmp_dtype=ng.int32,
        out_dtype=ng.int32,
        stride1=(1, 1, 1, 1), stride2=(1, 1, 1, 1),
        rshift_mul1=None, rshift_sum1=None, rshift_out1=None,
        rshift_mul2=None, rshift_sum2=None, rshift_out2=None,
        act_func1=None, act_func2=None,
        par_ich1=1, par_och1=1, par_col1=1, par_row1=1,
        concur_och1=None, stationary1='filter',
        par_ich2=1, par_och2=1, par_col2=1, par_row2=1,
        concur_och2=None, stationary2='filter',
        input_ram_size1=None, filter_ram_size1=None,
        bias_ram_size1=None, scale_ram_size1=None,
        out_ram_size1=None,
        input_ram_size2=None, filter_ram_size2=None,
        bias_ram_size2=None, scale_ram_size2=None,
        out_ram_size2=None,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')

    weight1 = ng.variable(weight1_dtype, shape=weight1_shape,
                          name='weight1')

    if bias1_shape is not None:
        bias1 = ng.variable(bias1_dtype, bias1_shape, name='bias1')
    else:
        bias1 = None

    if scale1_shape is not None:
        scale1 = ng.variable(scale1_dtype, scale1_shape, name='scale1')
    else:
        scale1 = None

    weight2 = ng.variable(weight2_dtype, shape=weight2_shape,
                          name='weight2')

    if bias2_shape is not None:
        bias2 = ng.variable(bias2_dtype, bias2_shape, name='bias2')
    else:
        bias2 = None

    if scale2_shape is not None:
        scale2 = ng.variable(scale2_dtype, scale2_shape, name='scale2')
    else:
        scale2 = None

    tmp = ng.conv2d(act, weight1, stride1,
                    bias1, scale1,
                    rshift_mul1, rshift_sum1, rshift_out1,
                    act_func1, 'SAME',
                    tmp_dtype, ng.int32, ng.int32,
                    'conv2d_1',
                    par_ich1, par_och1, par_col1, par_row1,
                    concur_och1, stationary1,
                    input_ram_size1, filter_ram_size1,
                    bias_ram_size1, scale_ram_size1,
                    None, None, None,
                    out_ram_size1)

    out = ng.conv2d(tmp, weight2, stride2,
                    bias2, scale2,
                    rshift_mul2, rshift_sum2, rshift_out2,
                    act_func2, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d_2',
                    par_ich2, par_och2, par_col2, par_row2,
                    concur_och2, stationary2,
                    input_ram_size2, filter_ram_size2,
                    bias_ram_size2, scale_ram_size2,
                    None, None, None,
                    out_ram_size2)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_conv2d_variable', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'offchipram_chunk_bytes': chunk_size})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]

    vweight1 = np.arange(weight1.length,
                         dtype=np.int64).reshape(weight1_shape) % [32] - [16]

    if bias1 is not None:
        vbias1 = np.arange(bias1.length,
                           dtype=np.int64).reshape(bias1.shape) % [16]
    else:
        vbias1 = None

    if scale1 is not None:
        vscale1 = np.arange(scale1.length,
                            dtype=np.int64).reshape(scale1.shape) % [8]
    else:
        vscale1 = None

    vweight2 = np.arange(weight2.length,
                         dtype=np.int64).reshape(weight2_shape) % [32] - [16]

    if bias2 is not None:
        vbias2 = np.arange(bias2.length,
                           dtype=np.int64).reshape(bias2.shape) % [16]
    else:
        vbias2 = None

    if scale2 is not None:
        vscale2 = np.arange(scale2.length,
                            dtype=np.int64).reshape(scale2.shape) % [8]
    else:
        vscale2 = None

    vtmp = ng.verify.conv2d(vact, vweight1, stride1,
                            vbias1, vscale1,
                            rshift_mul1, rshift_sum1, rshift_out1,
                            act_func1, 'SAME',
                            tmp_dtype, ng.int32, ng.int32,
                            'conv2d_1',
                            par_ich1, par_och1, par_col1, par_row1,
                            concur_och1, stationary1,
                            input_ram_size1, filter_ram_size1,
                            bias_ram_size1, scale_ram_size1,
                            None, None, None,
                            out_ram_size1,
                            False,
                            act_dtype, weight1_dtype)

    vout = ng.verify.conv2d(vtmp, vweight2, stride2,
                            vbias2, vscale2,
                            rshift_mul2, rshift_sum2, rshift_out2,
                            act_func2, 'SAME',
                            out_dtype, ng.int32, ng.int32,
                            'conv2d_2',
                            par_ich2, par_och2, par_col2, par_row2,
                            concur_och2, stationary2,
                            input_ram_size2, filter_ram_size2,
                            bias_ram_size2, scale_ram_size2,
                            None, None, None,
                            out_ram_size2,
                            False,
                            tmp_dtype, weight2_dtype)

    # to memory image
    size_max = int(math.ceil(max(act.memory_size, weight1.memory_size,
                                 bias1.memory_size if bias1 is not None else 0,
                                 scale1.memory_size if scale1 is not None else 0,
                                 weight2.memory_size,
                                 bias2.memory_size if bias2 is not None else 0,
                                 scale2.memory_size if scale2 is not None else 0,
                                 out.memory_size) / chunk_size)) * chunk_size

    # assign custom addresses
    variable_addr = max(act.addr, out.addr) + size_max

    weight1_addr = variable_addr
    bias1_addr = weight1_addr + int(math.ceil(weight1.memory_size / chunk_size)) * chunk_size
    scale1_addr = (bias1_addr + int(math.ceil(bias1.memory_size / chunk_size)) * chunk_size
                   if bias1 is not None else weight1_addr)

    weight2_addr = (scale1_addr + int(math.ceil(scale1.memory_size / chunk_size)) * chunk_size
                    if scale1 is not None else bias1_addr)
    bias2_addr = weight2_addr + int(math.ceil(weight2.memory_size / chunk_size)) * chunk_size
    scale2_addr = (bias2_addr + int(math.ceil(bias2.memory_size / chunk_size)) * chunk_size
                   if bias2 is not None else weight2_addr)

    check_addr = scale2_addr + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich1))

    axi.set_memory(mem, vweight1, memimg_datawidth,
                   weight1_dtype.width, weight1_addr,
                   max(int(math.ceil(axi_datawidth / weight1_dtype.width)), par_ich1))
    if bias1_shape is not None:
        axi.set_memory(mem, vbias1, memimg_datawidth,
                       bias1_dtype.width, bias1_addr,
                       max(int(math.ceil(axi_datawidth / bias1_dtype.width)), par_och1))
    if scale1_shape is not None:
        axi.set_memory(mem, vscale1, memimg_datawidth,
                       scale1_dtype.width, scale1_addr,
                       max(int(math.ceil(axi_datawidth / scale1_dtype.width)), par_och1))

    axi.set_memory(mem, vweight2, memimg_datawidth,
                   weight2_dtype.width, weight2_addr,
                   max(int(math.ceil(axi_datawidth / weight2_dtype.width)), par_ich2))
    if bias2_shape is not None:
        axi.set_memory(mem, vbias2, memimg_datawidth,
                       bias2_dtype.width, bias2_addr,
                       max(int(math.ceil(axi_datawidth / bias2_dtype.width)), par_och2))
    if scale2_shape is not None:
        axi.set_memory(mem, vscale2, memimg_datawidth,
                       scale2_dtype.width, scale2_addr,
                       max(int(math.ceil(axi_datawidth / scale2_dtype.width)), par_och2))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och2))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    def ctrl():
        for i in range(100):
            pass

        # set custom addresses
        ng.sim.set_global_addrs(_saxi, tmp_addr, out.addr, act.addr, variable_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                            + y * out.aligned_shape[2] * out.aligned_shape[3]
                            + x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                            + y * out.aligned_shape[2] * out.aligned_shape[3]
                            + x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)
                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
示例#4
0
def run(
    act_dtype=ng.int8,
    weight_dtype=ng.int8,
    bias_dtype=ng.int32,
    scale_dtype=ng.int8,
    par_ich=2,
    par_och=2,
    chunk_size=64,
    axi_datawidth=32,
    silent=False,
    weight_filename='cnn.npy',
    verilog_filename=None,
    sim_filename=None,
    # simtype='iverilog',
    simtype='verilator',
    # simtype=None,  # no RTL simulation
):

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # input
    input_layer = ng.placeholder(
        dtype=act_dtype,
        shape=(1, 32, 32, 3),  # N, H, W, C
        name='input_layer')

    # layer 0: conv2d (with bias and scale (= batchnorm)), relu, max_pool
    w0 = ng.variable(
        dtype=weight_dtype,
        shape=(64, 3, 3, 3),  # Och, Ky, Kx, Ich
        name='w0')
    b0 = ng.variable(dtype=bias_dtype, shape=(w0.shape[0], ), name='b0')
    s0 = ng.variable(dtype=scale_dtype, shape=(w0.shape[0], ), name='s0')

    a0 = ng.conv2d(input_layer,
                   w0,
                   strides=(1, 1, 1, 1),
                   bias=b0,
                   scale=s0,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a0p = ng.max_pool_serial(a0, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1))

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(weight_dtype, shape=(64, 3, 3, a0.shape[-1]), name='w1')
    b1 = ng.variable(bias_dtype, shape=(w1.shape[0], ), name='b1')
    s1 = ng.variable(scale_dtype, shape=(w1.shape[0], ), name='s1')

    a1 = ng.conv2d(a0p,
                   w1,
                   strides=(1, 1, 1, 1),
                   bias=b1,
                   scale=s1,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a1r = ng.reshape(a1, [1, -1])

    # layer 2: full-connection, relu
    w2 = ng.variable(weight_dtype, shape=(256, a1r.shape[-1]), name='w2')
    b2 = ng.variable(bias_dtype, shape=(w2.shape[0], ), name='b2')
    s2 = ng.variable(scale_dtype, shape=(w2.shape[0], ), name='s2')

    a2 = ng.matmul(a1r,
                   w2,
                   bias=b2,
                   scale=s2,
                   transposed_b=True,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    # layer 3: full-connection, relu
    w3 = ng.variable(weight_dtype, shape=(10, a2.shape[-1]), name='w3')
    b3 = ng.variable(bias_dtype, shape=(w3.shape[0], ), name='b3')
    s3 = ng.variable(scale_dtype, shape=(w3.shape[0], ), name='s3')

    # output
    output_layer = ng.matmul(a2,
                             w3,
                             bias=b3,
                             scale=s3,
                             transposed_b=True,
                             name='output_layer',
                             dtype=act_dtype,
                             sum_dtype=ng.int32)

    # --------------------
    # (2) Assign weights to the NNgen operators
    # --------------------

    # In this example, random floating-point values are assigned.
    # In a real case, you should assign actual weight values
    # obtianed by a training on DNN framework.

    # If you don't you NNgen's quantizer, you can assign integer weights to each tensor.

    w0_value = np.random.normal(size=w0.length).reshape(w0.shape)
    w0_value = np.clip(w0_value, -3.0, 3.0)
    w0.set_value(w0_value)

    b0_value = np.random.normal(size=b0.length).reshape(b0.shape)
    b0_value = np.clip(b0_value, -3.0, 3.0)
    b0.set_value(b0_value)

    s0_value = np.ones(s0.shape)
    s0.set_value(s0_value)

    w1_value = np.random.normal(size=w1.length).reshape(w1.shape)
    w1_value = np.clip(w1_value, -3.0, 3.0)
    w1.set_value(w1_value)

    b1_value = np.random.normal(size=b1.length).reshape(b1.shape)
    b1_value = np.clip(b1_value, -3.0, 3.0)
    b1.set_value(b1_value)

    s1_value = np.ones(s1.shape)
    s1.set_value(s1_value)

    w2_value = np.random.normal(size=w2.length).reshape(w2.shape)
    w2_value = np.clip(w2_value, -3.0, 3.0)
    w2.set_value(w2_value)

    b2_value = np.random.normal(size=b2.length).reshape(b2.shape)
    b2_value = np.clip(b2_value, -3.0, 3.0)
    b2.set_value(b2_value)

    s2_value = np.ones(s2.shape)
    s2.set_value(s2_value)

    w3_value = np.random.normal(size=w3.length).reshape(w3.shape)
    w3_value = np.clip(w3_value, -3.0, 3.0)
    w3.set_value(w3_value)

    b3_value = np.random.normal(size=b3.length).reshape(b3.shape)
    b3_value = np.clip(b3_value, -3.0, 3.0)
    b3.set_value(b3_value)

    s3_value = np.ones(s3.shape)
    s3.set_value(s3_value)

    # Quantizing the floating-point weights by the NNgen quantizer.
    # Alternatively, you can assign integer weights by yourself to each tensor.

    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'input_layer': act_scale_factor}
    input_means = {'input_layer': imagenet_mean * act_scale_factor}
    input_stds = {'input_layer': imagenet_std * act_scale_factor}

    ng.quantize([output_layer], input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    # conv2d, matmul
    # par_ich: parallelism in input-channel
    # par_och: parallelism in output-channel
    # par_col: parallelism in pixel column
    # par_row: parallelism in pixel row

    a0.attribute(par_ich=par_ich, par_och=par_och)
    a1.attribute(par_ich=par_ich, par_och=par_och)
    a2.attribute(par_ich=par_ich, par_och=par_och)
    output_layer.attribute(par_ich=par_ich, par_och=par_och)

    # cshamt_out: right shift amount after applying bias/scale
    # If you assign integer weights by yourself to each tensor,
    # cshamt (constant shift amount) must be assigned to each operator.

    # a0.attribute(cshamt_out=weight_dtype.width + 1)
    # a1.attribute(cshamt_out=weight_dtype.width + 1)
    # a2.attribute(cshamt_out=weight_dtype.width + 1)
    # output_layer.attribute(cshamt_out=weight_dtype.width + 1)

    # max_pool
    # par: parallelism in in/out channel

    par = par_och

    a0p.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    # In this example, random integer values are assigned.
    # In real case, you should assign actual integer activation values, such as an image.

    input_layer_value = np.random.normal(size=input_layer.length).reshape(
        input_layer.shape)
    input_layer_value = input_layer_value * imagenet_std + imagenet_mean
    input_layer_value = np.clip(input_layer_value, -5.0, 5.0)
    input_layer_value = input_layer_value * act_scale_factor
    input_layer_value = np.clip(input_layer_value,
                                -1 * 2**(act_dtype.width - 1) - 1,
                                2**(act_dtype.width - 1))
    input_layer_value = np.round(input_layer_value).astype(np.int64)

    eval_outs = ng.eval([output_layer], input_layer=input_layer_value)
    output_layer_value = eval_outs[0]

    # print(output_layer_value)
    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([output_layer], 'cnn', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([output_layer],
                        'cnn',
                        silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([output_layer], 'cnn', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Save the quantized weights
    # --------------------

    # convert weight values to a memory image:
    # on a real FPGA platform, this image will be used as a part of the model definition.

    param_filename = 'hello_nngen.npy'
    chunk_size = 64

    param_data = ng.export_ndarray([output_layer], chunk_size)
    np.save(weight_filename, param_data)

    # --------------------
    # (7) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    param_bytes = len(param_data)

    variable_addr = int(
        math.ceil((input_layer.addr + input_layer.memory_size) /
                  chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(
        math.ceil(
            (check_addr + output_layer.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, input_layer_value, memimg_datawidth, act_dtype.width,
        input_layer.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, output_layer_value, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if sim_filename is None:
        sim_filename = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + sim_filename

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(output_layer.shape[0]):
            for x in range(output_layer.shape[1]):
                orig = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x, output_layer.addr,
                    act_dtype.width)
                check = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x, check_addr,
                    act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x, ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if verilog_filename is not None:
        m.to_verilog(verilog_filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=sim_filename)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
示例#5
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):

    # create target hardware
    #a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    a = ng.variable(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    c = ng.sub(a, b, dtype=c_dtype, par=par, name='c')

    targ = ng.to_veriloggen([c],
                            'matrix_sub',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6]

    vc = ng.verify.sub(va, vb, dtype=c_dtype, x_dtype=a_dtype, y_dtype=b_dtype)

    # to memory image
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
示例#6
0
def mkTest(ich=3, och=10, ch=64, ksize=3, stride=1, col=28, row=28):
    # create target hardware

    # layer 0: conv2d, max_pool_serial, relu
    input_layer = ng.placeholder(ng.int32,
                                 shape=(1, row, col, ich),
                                 name='input_layer')
    w0 = ng.variable(ng.int32, shape=(ch, ksize, ksize, ich), name='w0')
    a0 = ng.conv2d(input_layer, w0, strides=(1, stride, stride, 1))
    a0 = ng.max_pool_serial(a0, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1))
    a0 = ng.relu(a0)

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(ng.int32,
                     shape=(ch, ksize, ksize, a0.shape[-1]),
                     name='w1')
    a1 = ng.conv2d(a0, w1, strides=(1, stride, stride, 1))
    a1 = ng.relu(a1)
    a1 = ng.reshape(a1, [-1])

    # layer 2: full-connection
    w2 = ng.variable(ng.int32, shape=(16, a1.shape[-1]), name='w2')
    a2 = ng.matmul(a1, w2, transposed_b=True)
    a2 = ng.relu(a2)

    # layer 3: full-connection
    w3 = ng.variable(ng.int32, shape=(och, a2.shape[-1]), name='w3')
    output_layer = ng.matmul(a2, w3, transposed_b=True, name='output_layer')

    targ = ng.to_veriloggen([output_layer], 'cnn')
    #targ = ng.to_ipxact([output_layer], 'cnn')

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, mem_addrwidth=23)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    return m
示例#7
0
    'header3': 264196,
}

IPNAME = 'dlaip'

operators = OrderedDict()
placeholders = OrderedDict()
variables = OrderedDict()

# graph

L002_Scale_name = 'L002_Scale'
L003_layer1_conv_cv_cbs_name = 'L003_layer1_conv_cv_cbs'
L003_layer1_conv_cv_cbs_name_filter = 'L003_layer1_conv_cv_cbs_convolution_weight'
L003_layer1_conv_cv_cbs_filter = ng.variable(
    ng.dtype_int(width=8),
    shape=[8, 3, 3, 3],
    name=L003_layer1_conv_cv_cbs_name_filter)
# - bias & scale -
L003_layer1_conv_cv_cbs_name_bias = 'L003_layer1_conv_cv_cbs_convolution_bias'
L003_layer1_conv_cv_cbs_name_scale = 'L003_layer1_conv_cv_cbs_i2f_i2f_scale'
L003_layer1_conv_cv_cbs_bias = ng.variable(
    dtype=ng.int32, shape=[8], name=L003_layer1_conv_cv_cbs_name_bias)
L003_layer1_conv_cv_cbs_scale = ng.variable(
    dtype=ng.int16, shape=[1], name=L003_layer1_conv_cv_cbs_name_scale)
# - rshift_out -
L003_layer1_conv_cv_cbs_name_rshift_out = 'L003_layer1_conv_cv_cbs_i2f_rshift_out'
L003_layer1_conv_cv_cbs_rshift_out = ng.variable(
    ng.dtype_int(width=8),
    shape=[8],
    name=L003_layer1_conv_cv_cbs_name_rshift_out)
variables[
示例#8
0
def mkTest(n_input=784, n_classes=10):
    # create target hardware
    x = ng.placeholder(ng.int32, shape=[n_input])

    w1 = ng.variable(ng.int32, shape=(n_input, n_input), name='h1')
    w2 = ng.variable(ng.int32, shape=(n_input, n_input), name='h2')
    w3 = ng.variable(ng.int32, shape=(n_classes, n_input), name='out')

    l1 = ng.matmul(x, w1, transposed_b=True)
    l1 = ng.relu(l1)

    l2 = ng.matmul(l1, w2, transposed_b=True)
    l2 = ng.relu(l2)

    out = ng.matmul(l2, w3, transposed_b=True)

    targ = ng.to_veriloggen([out], 'mlp')
    #targ = ng.to_ipxact([model], 'mlp')

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m