Exemplo n.º 1
0
def run(a_shape=(7, 15),
        b_shape=(7, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Run the ONNX ``MatrixMul`` flow end to end and return the simulator log.

    The PyTorch ``MatrixMul`` model is exported to ``onnx_matrix_mul.onnx``,
    imported with ``ng.from_onnx``, quantized, evaluated in software with
    ``ng.eval``, converted with ``ng.to_veriloggen`` and finally simulated in
    a generated test bench whose control thread compares the hardware output
    with the software result word by word.

    Args:
        a_shape: Shape of the random dummy tensor used for input ``'a'``
            during ONNX export.
        b_shape: Shape of the random dummy tensor used for input ``'b'``
            during ONNX export.
        a_dtype: NNgen dtype assigned to ONNX value ``'a'``; its ``width`` is
            also used when writing ``a`` into the memory image.
        b_dtype: Same as ``a_dtype`` for ONNX value ``'b'``.
        c_dtype: Same as ``a_dtype`` for ONNX value ``'c'``; its ``width`` is
            also used when reading results back in the control thread.
        par: Passed as ``par`` to every ``ng.scaled_multiply`` operator and
            used as a lower bound for the last ``axi.set_memory`` argument.
        axi_datawidth: Used as the ``maxi_datawidth`` config entry and as the
            ``datawidth`` of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test bench is written with
            ``m.to_verilog(filename)``.
        simtype: Simulator name handed to ``simulation.Simulator``. If
            ``None``, the process exits via ``sys.exit()`` right after the
            hardware description has been generated.
        outputfile: Forwarded to ``sim.run``; defaults to this file's base
            name with a ``.out`` suffix. Also used to derive the memory image
            name.

    Returns:
        The text returned by ``sim.run``. For ``simtype == 'verilator'`` a
        final line starting with ``'-'`` is removed.
    """

    def align_up(value, unit=4096):
        # Round ``value`` up to the next multiple of ``unit``.
        return int(math.ceil(value / unit)) * unit

    def set_memory_arg(width):
        # Last positional argument of axi.set_memory.
        # NOTE(review): presumably a word-alignment count -- confirm against
        # the veriloggen axi.set_memory signature.
        return max(int(math.ceil(axi_datawidth / width)), par)

    def saturate_round(values, width):
        # Clip to the symmetric signed range of ``width`` bits, then round to
        # int64.
        limit = 1.0 * (2**(width - 1) - 1)
        return np.round(np.clip(values, -limit, limit)).astype(np.int64)

    # pytorch model
    model = MatrixMul()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_mul.onnx'
    dummy_inputs = (torch.randn(*a_shape), torch.randn(*b_shape))
    model.eval()
    torch.onnx.export(model,
                      dummy_inputs,
                      onnx_filename,
                      input_names=['a', 'b'],
                      output_names=['c'])

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype, 'b': b_dtype, 'c': c_dtype}

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=ng.int32,
                               default_variable_dtype=ng.int32,
                               default_constant_dtype=ng.int32,
                               default_operator_dtype=ng.int32,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'a': 10.0, 'b': 15.0}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.scaled_multiply):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    # verification data: deterministic ramps folded into a small value range
    input_a = np.arange(a.length, dtype=np.int64).reshape(a.shape) % 17
    input_b = (np.arange(b.length, dtype=np.int64).reshape(b.shape) +
               100) % 13

    # execution on pytorch
    model_a = input_a.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)

    model_b = input_b.astype(np.float32)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a),
                    torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # software-based verification
    va = saturate_round(input_a * input_scale_factors['a'], a.dtype.width)
    vb = saturate_round(input_b * input_scale_factors['b'], b.dtype.width)

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # Diagnostics only: nothing below reads them; they are kept so they can be
    # inspected from a debugger at this point.
    mean_square_error = np.sum((vc - scaled_model_c)**2) / vc.size
    corrcoef = np.corrcoef(model_c.reshape([-1]), vc.reshape([-1]))

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([c],
                            'onnx_matrix_mul',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        # NOTE(review): this terminates the whole interpreter rather than
        # returning; kept because callers may rely on it.
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # Regions are laid out back to back, each starting on a 4096 boundary:
    # placeholders, parameters, expected output, then temporaries.
    variable_addr = align_up(
        max(a.addr + a.memory_size, b.addr + b.memory_size))
    check_addr = align_up(variable_addr + param_bytes)
    tmp_addr = align_up(check_addr + c.memory_size)

    memimg_datawidth = 32
    # Every word starts as 100 so untouched memory is distinguishable from
    # zero-valued results.
    mem = np.full([1024 * 1024 * 8 // (memimg_datawidth // 8)], 100,
                  dtype=np.int64)

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   set_memory_arg(a_dtype.width))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   set_memory_arg(b_dtype.width))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   set_memory_arg(c_dtype.width))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Number of rows when every leading dimension of ``c`` is flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # The body of ``ctrl`` is handed to vthread.Thread below, so it is kept
    # exactly as written and refers to the enclosing locals by name.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    m.Instance(targ,
               'uut',
               params=m.connect_params(targ),
               ports=m.connect_ports(targ))

    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Hard stop so a hung design cannot run forever.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # ``lines`` is empty when the simulator printed nothing; guard the index.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 2
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Build and simulate ``relu(add(a, b))`` and return the simulator log.

    Two placeholders are combined with ``ng.add`` followed by ``ng.relu``.
    The expected output is computed with ``ng.eval``, stored in the memory
    image next to the inputs, and compared word by word against the hardware
    result inside the generated test bench.

    Args:
        a_shape: Shape of placeholder ``a``.
        b_shape: Shape of placeholder ``b``.
        a_dtype: NNgen dtype of ``a``; its ``width`` is also used when
            writing ``a`` into the memory image.
        b_dtype: Same as ``a_dtype`` for ``b``.
        c_dtype: NNgen dtype of both the add and the relu result; its
            ``width`` is also used when reading results back.
        par: Passed as ``par`` to ``ng.add`` and ``ng.relu`` and used as a
            lower bound for the last ``axi.set_memory`` argument.
        axi_datawidth: Used as the ``maxi_datawidth`` config entry and as the
            ``datawidth`` of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test bench is written with
            ``m.to_verilog(filename)``.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Forwarded to ``sim.run``; defaults to this file's base
            name with a ``.out`` suffix. Also used to derive the memory image
            name.

    Returns:
        The text returned by ``sim.run``. For ``simtype == 'verilator'`` a
        final line starting with ``'-'`` is removed.
    """

    def set_memory_arg(width):
        # Last positional argument of axi.set_memory.
        # NOTE(review): presumably a word-alignment count -- confirm against
        # the veriloggen axi.set_memory signature.
        return max(int(math.ceil(axi_datawidth / width)), par)

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    t = ng.add(a, b, dtype=c_dtype, par=par)
    c = ng.relu(t, dtype=c_dtype, par=par)

    targ = ng.to_veriloggen([c],
                            'matrix_add_relu',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data: small ramps shifted negative so relu has to clamp
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % 5 - 10
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) +
          100) % 6 - 10

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # The expected-output region is placed one 4096-rounded tensor size after
    # the highest tensor address; temporaries follow it.
    largest = max(a.memory_size, b.memory_size, c.memory_size)
    size_max = int(math.ceil(largest / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    # Every word starts as 100 so untouched memory is distinguishable from
    # zero-valued results.
    mem = np.full([1024 * 1024 * 8 // (memimg_datawidth // 8)], 100,
                  dtype=np.int64)

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   set_memory_arg(a_dtype.width))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   set_memory_arg(b_dtype.width))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   set_memory_arg(c_dtype.width))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Number of rows when every leading dimension of ``c`` is flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # The body of ``ctrl`` is handed to vthread.Thread below, so it is kept
    # exactly as written and refers to the enclosing locals by name.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    m.Instance(targ,
               'uut',
               params=m.connect_params(targ),
               ports=m.connect_ports(targ))

    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Hard stop so a hung design cannot run forever.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # ``lines`` is empty when the simulator printed nothing; guard the index.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 4, 4, 3),
        weight0_shape=(9, 3, 3, 3),
        weight1_shape=(9, 36),
        act_dtype=ng.int32,
        weight_dtype=ng.int32,
        stride0=1,
        padding0=0,
        with_batchnorm0=False,
        with_batchnorm1=False,
        act_func0='ReLU',
        act_func1='relu',
        disable_fusion=False,
        par_ich=1,
        par_och=1,
        par_col=1,
        par_row=1,
        concur_och=None,
        stationary='filter',
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Run the ONNX conv2d -> transpose -> linear flow and return the log.

    A PyTorch ``nn.Sequential`` (Conv2d, optional BatchNorm, optional
    activation, Transpose, Flatten, Linear, optional BatchNorm, optional
    activation) is exported to
    ``onnx_matrix_conv2d_transpose_linear.onnx``, imported with
    ``ng.from_onnx``, quantized, evaluated in software with ``ng.eval``,
    converted with ``ng.to_veriloggen`` and simulated in a generated test
    bench that compares hardware and software outputs word by word.

    Args:
        act_shape: Shape of the random dummy input before its axes 1 and 3
            are swapped for ONNX export.
        weight0_shape: Conv2d geometry; index 3 is used as input channels,
            index 0 as output channels and index 1 as kernel size.
        weight1_shape: Linear geometry; index 1 is used as input features and
            index 0 as output features.
        act_dtype: NNgen dtype for ``'act'``, ``'out'`` and operator outputs;
            its ``width`` also selects the input scale factor and is used for
            memory image access.
        weight_dtype: NNgen dtype for variables and constants.
        stride0: Conv2d stride.
        padding0: Conv2d padding.
        with_batchnorm0: Append ``nn.BatchNorm2d`` after the convolution.
        with_batchnorm1: Append ``nn.BatchNorm1d`` after the linear layer.
        act_func0: Name of the ``nn`` activation class after the convolution,
            or ``None`` for no activation.
        act_func1: Name of the ``nn`` activation class after the linear
            layer, or ``None``. Names are matched exactly first and then
            case-insensitively, so the default ``'relu'`` selects
            ``nn.ReLU``.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        par_ich, par_och, par_col, par_row, concur_och: Passed to
            ``attribute(...)`` of every ``ng.conv2d`` operator. ``par_ich``
            and ``par_och`` are also lower bounds for the last
            ``axi.set_memory`` argument of the input and expected output.
        stationary: Accepted but not used by this function.
        chunk_size: Used as the ``chunk_size`` config entry, passed to
            ``ng.export_ndarray`` and used as the address rounding unit.
        axi_datawidth: Used as the ``maxi_datawidth`` config entry and as the
            ``datawidth`` of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test bench is written with
            ``m.to_verilog(filename)``.
        simtype: Simulator name handed to ``simulation.Simulator``. If
            ``None``, the process exits via ``sys.exit()`` right after the
            hardware description has been generated.
        outputfile: Forwarded to ``sim.run``; defaults to this file's base
            name with a ``.out`` suffix. Also used to derive the memory image
            name.

    Returns:
        The text returned by ``sim.run``. For ``simtype == 'verilator'`` a
        final line starting with ``'-'`` is removed.

    Raises:
        AttributeError: If an activation name matches no attribute of ``nn``.
    """

    def activation_layer(name):
        # An exact attribute name behaves as before. Only when that lookup
        # fails is a case-insensitive match tried, which lets the default
        # 'relu' resolve to nn.ReLU instead of raising.
        layer_cls = getattr(nn, name, None)
        if layer_cls is None:
            wanted = name.lower()
            for candidate in dir(nn):
                if candidate.lower() == wanted:
                    layer_cls = getattr(nn, candidate)
                    break
        if layer_cls is None:
            raise AttributeError('unknown activation layer: %r' % (name,))
        return layer_cls()

    def align_up(value):
        # Round ``value`` up to the next multiple of ``chunk_size``.
        return int(math.ceil(value / chunk_size)) * chunk_size

    def set_memory_arg(width, parallelism):
        # Last positional argument of axi.set_memory.
        # NOTE(review): presumably a word-alignment count -- confirm against
        # the veriloggen axi.set_memory signature.
        return max(int(math.ceil(axi_datawidth / width)), parallelism)

    class Transpose(nn.Module):
        """Permute the input axes with a fixed permutation."""

        def __init__(self, perm):
            super(Transpose, self).__init__()
            self.perm = perm

        def forward(self, x):
            return x.permute(*self.perm)

    class Flatten(nn.Module):
        """Collapse every axis after the first into one."""

        def forward(self, x):
            return torch.reshape(x, (x.size(0), -1))

    # pytorch model
    layers = [
        nn.Conv2d(weight0_shape[3],
                  weight0_shape[0],
                  weight0_shape[1],
                  stride=stride0,
                  padding=padding0)
    ]

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 is not None:
        layers.append(activation_layer(act_func0))

    layers.append(Transpose([0, 1, 3, 2]))
    layers.append(Flatten())
    layers.append(nn.Linear(weight1_shape[1], weight1_shape[0]))

    if with_batchnorm1:
        # The linear output is 2-D (batch, features). BatchNorm2d rejects
        # anything but 4-D input, so the 1-D variant is required here.
        layers.append(nn.BatchNorm1d(weight1_shape[0]))

    if act_func1 is not None:
        layers.append(activation_layer(act_func1))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_transpose_linear.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=['act'],
                      output_names=['out'])

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    # NOTE(review): with the layer order built above, index 3 is Flatten, so
    # '3.weight' probably names no tensor; the Linear weight would then get
    # its dtype from default_variable_dtype below -- confirm.
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '3.weight': weight_dtype,
        'out': act_dtype
    }

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=act_dtype,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    # Wide activations use a fixed factor; narrow ones use half of the
    # positive range of the dtype.
    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data: normally distributed values around ``mean``
    std = 0.2
    mean = 0.5
    img = np.random.normal(size=act.length).astype(np.float32).reshape(
        act.shape)
    img = img * std + mean

    # execution on pytorch
    model_input = img

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    limit = 1.0 * (2**(act.dtype.width - 1) - 1)
    vact = np.clip(img * act_scale_factor, -limit, limit)
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # Diagnostics only: nothing below reads them; they are kept so they can be
    # inspected from a debugger at this point.
    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out],
                            'onnx_matrix_conv2d_transpose_linear',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        # NOTE(review): this terminates the whole interpreter rather than
        # returning; kept because callers may rely on it.
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # Regions are laid out back to back, each starting on a chunk_size
    # boundary: input, parameters, expected output, then temporaries.
    variable_addr = align_up(act.addr + act.memory_size)
    check_addr = align_up(variable_addr + param_bytes)
    tmp_addr = align_up(check_addr + out.memory_size)

    memimg_datawidth = 32
    # Every word starts as 100 so untouched memory is distinguishable from
    # zero-valued results.
    mem = np.full([1024 * 1024 * 8 // (memimg_datawidth // 8)], 100,
                  dtype=np.int64)

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth, act_dtype.width, act.addr,
                   set_memory_arg(act_dtype.width, par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth, act_dtype.width, check_addr,
                   set_memory_arg(act_dtype.width, par_och))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # The body of ``ctrl`` is handed to vthread.Thread below, so it is kept
    # exactly as written and refers to the enclosing locals by name.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(out.shape[0]):
            for j in range(out.shape[1]):
                orig = memory.read_word(i * out.aligned_shape[1] + j, out.addr,
                                        act_dtype.width)
                check = memory.read_word(i * out.aligned_shape[1] + j,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', i, j, ') orig: ', orig, 'check: ', check)
                    ok = False
                # else:
                #    print('OK (', i, j, ') orig: ', orig, 'check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    m.Instance(targ,
               'uut',
               params=m.connect_params(targ),
               ports=m.connect_ports(targ))

    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Hard stop so a hung design cannot run forever.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # ``lines`` is empty when the simulator printed nothing; guard the index.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 7, 7, 15), weight_shape=(7, 3, 3, 15),
        bias_shape=None, scale_shape=None,
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        conv2d_stride=(1, 1, 1, 1),
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        input_ram_size=None, filter_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        ksize=(1, 2, 2, 1), pool_stride=(1, 2, 2, 1), par=1,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Build and simulate ``avg_pool(conv2d(act, weight))`` and return the log.

    A ``ng.conv2d`` with ``'SAME'`` padding feeds a ``ng.avg_pool``. The
    expected output is computed with ``ng.eval``, stored in the memory image
    next to the inputs, and compared word by word against the hardware result
    inside the generated test bench.

    Args:
        act_shape: Shape of the ``act`` placeholder.
        weight_shape: Shape of the ``weight`` variable.
        bias_shape: Shape of the optional ``bias`` variable; ``None`` omits
            the bias.
        scale_shape: Shape of the optional ``scale`` variable; ``None`` omits
            the scale.
        act_dtype, weight_dtype, bias_dtype, scale_dtype: NNgen dtypes of the
            corresponding tensors; their ``width`` is also used when writing
            the memory image.
        out_dtype: NNgen dtype of both the conv2d and the avg_pool output;
            its ``width`` is also used when reading results back.
        conv2d_stride: Strides passed to ``ng.conv2d``.
        rshift_mul, rshift_sum, rshift_out, act_func: Passed positionally to
            ``ng.conv2d``.
        par_ich, par_och, par_col, par_row, concur_och, stationary: Passed
            positionally to ``ng.conv2d``. ``par_ich`` and ``par_och`` are
            also lower bounds for the last ``axi.set_memory`` argument.
        input_ram_size, filter_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Passed positionally to ``ng.conv2d``.
        ksize: Window size passed to ``ng.avg_pool``.
        pool_stride: Strides passed to ``ng.avg_pool``.
        par: Passed as ``par`` to ``ng.avg_pool`` and used as a lower bound
            for the last ``axi.set_memory`` argument of the expected output.
        axi_datawidth: Used as the ``maxi_datawidth`` config entry and as the
            ``datawidth`` of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test bench is written with
            ``m.to_verilog(filename)``.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Forwarded to ``sim.run``; defaults to this file's base
            name with a ``.out`` suffix. Also used to derive the memory image
            name.

    Returns:
        The text returned by ``sim.run``. For ``simtype == 'verilator'`` a
        final line starting with ``'-'`` is removed.
    """

    def set_memory_arg(width, parallelism):
        # Last positional argument of axi.set_memory.
        # NOTE(review): presumably a word-alignment count -- confirm against
        # the veriloggen axi.set_memory signature.
        return max(int(math.ceil(axi_datawidth / width)), parallelism)

    def ramp(tensor):
        # Deterministic 0..length-1 ramp laid out in the tensor's shape.
        return np.arange(tensor.length, dtype=np.int64).reshape(tensor.shape)

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    weight = ng.variable(weight_dtype, shape=weight_shape, name='weight')

    bias = None
    if bias_shape is not None:
        bias = ng.variable(bias_dtype, bias_shape, name='bias')

    scale = None
    if scale_shape is not None:
        scale = ng.variable(scale_dtype, scale_shape, name='scale')

    tmp = ng.conv2d(act, weight, conv2d_stride,
                    bias, scale,
                    rshift_mul, rshift_sum, rshift_out,
                    act_func, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d',
                    par_ich, par_och, par_col, par_row,
                    concur_och, stationary,
                    input_ram_size, filter_ram_size,
                    bias_ram_size, scale_ram_size,
                    None, None, None,
                    out_ram_size)

    out = ng.avg_pool(tmp, ksize=ksize,
                      strides=pool_stride,
                      sum_dtype=ng.int32, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = ramp(act) % 16
    vweight = ramp(weight) % 32 - 16
    vbias = ramp(bias) % 4 if bias is not None else None
    vscale = ramp(scale) % 6 if scale is not None else None

    eval_outs = ng.eval([out], act=vact, weight=vweight,
                        bias=vbias, scale=vscale)
    vout = eval_outs[0]

    # to memory image
    # The expected-output region is placed one 4096-rounded tensor size after
    # the highest tensor address; temporaries follow it. Absent optional
    # tensors contribute neutral values (size 0, address -1).
    largest = max(act.memory_size, weight.memory_size,
                  bias.memory_size if bias is not None else 0,
                  scale.memory_size if scale is not None else 0,
                  out.memory_size)
    size_max = int(math.ceil(largest / 4096)) * 4096
    check_addr = max(act.addr, weight.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    # Every word starts as 100 so untouched memory is distinguishable from
    # zero-valued results.
    mem = np.full([1024 * 1024 * 8 // (memimg_datawidth // 8)], 100,
                  dtype=np.int64)

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   set_memory_arg(act_dtype.width, par_ich))

    axi.set_memory(mem, vweight, memimg_datawidth,
                   weight_dtype.width, weight.addr,
                   set_memory_arg(weight_dtype.width, par_ich))

    if bias is not None:
        axi.set_memory(mem, vbias, memimg_datawidth,
                       bias_dtype.width, bias.addr,
                       set_memory_arg(bias_dtype.width, par_och))

    if scale is not None:
        axi.set_memory(mem, vscale, memimg_datawidth,
                       scale_dtype.width, scale.addr,
                       set_memory_arg(scale_dtype.width, par_och))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   set_memory_arg(out_dtype.width, par))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # The body of ``ctrl`` is handed to vthread.Thread below, so it is kept
    # exactly as written and refers to the enclosing locals by name.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                + x * out.aligned_shape[3] + ch,
                                                out.addr, out_dtype.width)
                        check = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + x * out.aligned_shape[3] + ch,
                                                 check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    m.Instance(targ, 'uut',
               params=m.connect_params(targ),
               ports=m.connect_ports(targ))

    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Hard stop so a hung design cannot run forever.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # ``lines`` is empty when the simulator printed nothing; guard the index.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 5
0
def run(act_shape=(1, 7, 7, 3),
        act_dtype=ng.int32,
        ksize=2, stride=2, padding=0,
        par=1,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Build, check and simulate an ONNX-imported average-pooling design.

    The function exports an ``nn.AvgPool2d`` model to ONNX, imports it with
    ``ng.from_onnx``, quantizes it, evaluates it in software with ``ng.eval``,
    converts it to a Veriloggen module and finally runs a testbench that
    compares the hardware output with the software result word by word.

    Args:
        act_shape: Shape given to ``torch.randn`` for the dummy input; axes 1
            and 3 are swapped before the ONNX export.
        act_dtype: NNgen dtype used for both the 'act' and the 'out' values.
        ksize: Forwarded to ``nn.AvgPool2d`` as its first argument.
        stride: Forwarded to ``nn.AvgPool2d(stride=...)``.
        padding: Forwarded to ``nn.AvgPool2d(padding=...)``.
        par: Passed to ``op.attribute(par=...)`` of every ``ng.avg_pool``
            operator and used when placing data in the memory image.
        chunk_size: Region start addresses in the memory image are rounded up
            to a multiple of this value; also passed as the 'chunk_size'
            config entry.
        axi_datawidth: Passed as the 'maxi_datawidth' config entry and as the
            data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name for ``simulation.Simulator``. If None, the
            process exits via ``sys.exit()`` right after HDL conversion.
        outputfile: Passed to ``sim.run``; defaults to this script's base
            name with a '.out' suffix.

    Returns:
        The text returned by the simulator run. For 'verilator', the last
        line is dropped when it starts with '-'.
    """

    # pytorch model
    layers = []
    layers.append(nn.AvgPool2d(ksize, stride=stride, padding=padding))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_avg_pool.onnx'
    # axes 1 and 3 of the dummy input are swapped before export
    # (presumably channels-last -> channels-first for PyTorch; confirm)
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    # input and output share the same dtype
    value_dtypes = {'act': act_dtype,
                    'out': act_dtype}

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=ng.int32,
                                          default_constant_dtype=ng.int32,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'act': 1.0}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    # only the average-pooling operators receive the parallelism attribute
    for op in operators.values():
        if isinstance(op, ng.avg_pool):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # deterministic input in 1..11, or 1..5 when the dtype is 4 bits or narrower
    if act_dtype.width > 4:
        vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [11] + [1]
    else:
        vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [5] + [1]

    # expected output computed by the NNgen software evaluator
    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # software-based verification
    model_input = vact.astype(np.float32)
    # if the placeholder carries an axis permutation, apply its reverse
    # before feeding the PyTorch model
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    # and permute the PyTorch result back so it lines up with vout
    if act.perm is not None:
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # diagnostic metrics only; neither value is used later in this function
    mean_square_error = np.sum((vout - scaled_model_out) ** 2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out], 'onnx_matrix_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'chunk_size': chunk_size})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    # simtype=None means "generate only": terminate the process here
    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out])
    param_bytes = len(param_data)

    # layout: [placeholder] [parameters] [expected output] [temporary area],
    # each region start rounded up to a multiple of chunk_size
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    # every word starts as 100 (presumably so untouched locations are
    # distinguishable from zero-valued results; confirm)
    mem = mem + [100]

    # placeholder
    # last argument: the larger of (elements per AXI word) and par
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))

    # parameters (variable and constant)
    # exported parameter data is stored with an element width of 8 bits
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth,
                   act_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes an active-low reset; derive an active-high one
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    # free-running counter, incremented by the 'seq' block, used to measure
    # execution cycles
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE(review): ctrl is handed to vthread.Thread below and is presumably
    # translated from its source; keep its body to simple statements.
    def ctrl():
        # idle iterations before touching the design
        # (presumably to let reset settle; confirm)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # compare every output element with the expected value; the flat word
        # index uses out.aligned_shape for the strides of axes 1..3
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, act_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, act_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    # instantiate the generated design inside the testbench
    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # fixed-delay 'finish' task so the simulation is bounded even if ctrl
    # never reaches vthread.finish()
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # for verilator, drop a trailing line that starts with '-'
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 7, 7, 15),
        weight1_shape=(7, 3, 3, 15), bias1_shape=None, scale1_shape=None,
        weight2_shape=(9, 3, 3, 7), bias2_shape=None, scale2_shape=None,
        act_dtype=ng.int32,
        weight1_dtype=ng.int32, bias1_dtype=ng.int32, scale1_dtype=ng.int32,
        weight2_dtype=ng.int32, bias2_dtype=ng.int32, scale2_dtype=ng.int32,
        tmp_dtype=ng.int32,
        out_dtype=ng.int32,
        stride1=(1, 1, 1, 1), stride2=(1, 1, 1, 1),
        rshift_mul1=None, rshift_sum1=None, rshift_out1=None,
        rshift_mul2=None, rshift_sum2=None, rshift_out2=None,
        act_func1=None, act_func2=None,
        par_ich1=1, par_och1=1, par_col1=1, par_row1=1,
        concur_och1=None, stationary1='filter',
        par_ich2=1, par_och2=1, par_col2=1, par_row2=1,
        concur_och2=None, stationary2='filter',
        input_ram_size1=None, filter_ram_size1=None,
        bias_ram_size1=None, scale_ram_size1=None,
        out_ram_size1=None,
        input_ram_size2=None, filter_ram_size2=None,
        bias_ram_size2=None, scale_ram_size2=None,
        out_ram_size2=None,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Simulate two chained ``ng.conv2d`` layers with custom parameter addresses.

    Two convolutions are built directly from NNgen operators
    (``act -> conv2d_1 -> conv2d_2 -> out``), converted to a Veriloggen
    module, and run in a testbench that compares the hardware output with
    the result of ``ng.verify.conv2d`` word by word.

    The parameter tensors are placed back-to-back in creation order
    (weight1, bias1, scale1, weight2, bias2, scale2), each region start
    aligned to ``chunk_size``. An absent bias/scale occupies no space, so
    the next tensor starts exactly where the absent one would have.

    Args:
        act_shape: Shape of the input placeholder.
        weight1_shape, weight2_shape: Shapes of the two weight variables.
        bias1_shape, scale1_shape, bias2_shape, scale2_shape: Shapes of the
            optional bias/scale variables; None means the tensor is omitted.
        act_dtype, weight1_dtype, bias1_dtype, scale1_dtype, weight2_dtype,
            bias2_dtype, scale2_dtype, tmp_dtype, out_dtype: NNgen dtypes of
            the corresponding tensors (``tmp`` is the first layer's output).
        stride1, stride2: Stride arguments of the two convolutions.
        rshift_mul*, rshift_sum*, rshift_out*, act_func*, par_ich*, par_och*,
            par_col*, par_row*, concur_och*, stationary*, input_ram_size*,
            filter_ram_size*, bias_ram_size*, scale_ram_size*, out_ram_size*:
            Forwarded positionally to ``ng.conv2d`` and ``ng.verify.conv2d``
            for layer 1 or layer 2 respectively.
        chunk_size: Alignment unit for memory regions; also passed as the
            'offchipram_chunk_bytes' config entry.
        axi_datawidth: Passed as the 'maxi_datawidth' config entry and as the
            data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name for ``simulation.Simulator``.
        outputfile: Passed to ``sim.run``; defaults to this script's base
            name with a '.out' suffix.

    Returns:
        The text returned by the simulator run. For 'verilator', the last
        line is dropped when it starts with '-'.
    """

    def _align(size):
        # Round a size up to the next multiple of chunk_size.
        return int(math.ceil(size / chunk_size)) * chunk_size

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')

    weight1 = ng.variable(weight1_dtype, shape=weight1_shape,
                          name='weight1')

    if bias1_shape is not None:
        bias1 = ng.variable(bias1_dtype, bias1_shape, name='bias1')
    else:
        bias1 = None

    if scale1_shape is not None:
        scale1 = ng.variable(scale1_dtype, scale1_shape, name='scale1')
    else:
        scale1 = None

    weight2 = ng.variable(weight2_dtype, shape=weight2_shape,
                          name='weight2')

    if bias2_shape is not None:
        bias2 = ng.variable(bias2_dtype, bias2_shape, name='bias2')
    else:
        bias2 = None

    if scale2_shape is not None:
        scale2 = ng.variable(scale2_dtype, scale2_shape, name='scale2')
    else:
        scale2 = None

    # first layer: act -> tmp
    tmp = ng.conv2d(act, weight1, stride1,
                    bias1, scale1,
                    rshift_mul1, rshift_sum1, rshift_out1,
                    act_func1, 'SAME',
                    tmp_dtype, ng.int32, ng.int32,
                    'conv2d_1',
                    par_ich1, par_och1, par_col1, par_row1,
                    concur_och1, stationary1,
                    input_ram_size1, filter_ram_size1,
                    bias_ram_size1, scale_ram_size1,
                    None, None, None,
                    out_ram_size1)

    # second layer: tmp -> out
    out = ng.conv2d(tmp, weight2, stride2,
                    bias2, scale2,
                    rshift_mul2, rshift_sum2, rshift_out2,
                    act_func2, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d_2',
                    par_ich2, par_och2, par_col2, par_row2,
                    concur_och2, stationary2,
                    input_ram_size2, filter_ram_size2,
                    bias_ram_size2, scale_ram_size2,
                    None, None, None,
                    out_ram_size2)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_conv2d_variable', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'offchipram_chunk_bytes': chunk_size})

    # verification data
    # deterministic patterns: activations in 0..15, weights in -16..15,
    # biases in 0..15, scales in 0..7
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]

    vweight1 = np.arange(weight1.length,
                         dtype=np.int64).reshape(weight1_shape) % [32] - [16]

    if bias1 is not None:
        vbias1 = np.arange(bias1.length,
                           dtype=np.int64).reshape(bias1.shape) % [16]
    else:
        vbias1 = None

    if scale1 is not None:
        vscale1 = np.arange(scale1.length,
                            dtype=np.int64).reshape(scale1.shape) % [8]
    else:
        vscale1 = None

    vweight2 = np.arange(weight2.length,
                         dtype=np.int64).reshape(weight2_shape) % [32] - [16]

    if bias2 is not None:
        vbias2 = np.arange(bias2.length,
                           dtype=np.int64).reshape(bias2.shape) % [16]
    else:
        vbias2 = None

    if scale2 is not None:
        vscale2 = np.arange(scale2.length,
                            dtype=np.int64).reshape(scale2.shape) % [8]
    else:
        vscale2 = None

    # software reference for layer 1, then layer 2 fed with layer 1's result
    vtmp = ng.verify.conv2d(vact, vweight1, stride1,
                            vbias1, vscale1,
                            rshift_mul1, rshift_sum1, rshift_out1,
                            act_func1, 'SAME',
                            tmp_dtype, ng.int32, ng.int32,
                            'conv2d_1',
                            par_ich1, par_och1, par_col1, par_row1,
                            concur_och1, stationary1,
                            input_ram_size1, filter_ram_size1,
                            bias_ram_size1, scale_ram_size1,
                            None, None, None,
                            out_ram_size1,
                            False,
                            act_dtype, weight1_dtype)

    vout = ng.verify.conv2d(vtmp, vweight2, stride2,
                            vbias2, vscale2,
                            rshift_mul2, rshift_sum2, rshift_out2,
                            act_func2, 'SAME',
                            out_dtype, ng.int32, ng.int32,
                            'conv2d_2',
                            par_ich2, par_och2, par_col2, par_row2,
                            concur_och2, stationary2,
                            input_ram_size2, filter_ram_size2,
                            bias_ram_size2, scale_ram_size2,
                            None, None, None,
                            out_ram_size2,
                            False,
                            tmp_dtype, weight2_dtype)

    # to memory image
    # size of the largest tensor, rounded up to a chunk_size multiple
    size_max = _align(max(act.memory_size, weight1.memory_size,
                          bias1.memory_size if bias1 is not None else 0,
                          scale1.memory_size if scale1 is not None else 0,
                          weight2.memory_size,
                          bias2.memory_size if bias2 is not None else 0,
                          scale2.memory_size if scale2 is not None else 0,
                          out.memory_size))

    # assign custom addresses
    variable_addr = max(act.addr, out.addr) + size_max

    # Parameter regions are packed back-to-back. Each address is the previous
    # address plus the previous tensor's aligned size, or plus zero when that
    # tensor is absent. Falling back to an earlier tensor's address instead
    # would make a present tensor overwrite its predecessor in the image.
    weight1_addr = variable_addr
    bias1_addr = weight1_addr + _align(weight1.memory_size)
    scale1_addr = bias1_addr + (_align(bias1.memory_size)
                                if bias1 is not None else 0)

    weight2_addr = scale1_addr + (_align(scale1.memory_size)
                                  if scale1 is not None else 0)
    bias2_addr = weight2_addr + _align(weight2.memory_size)
    scale2_addr = bias2_addr + (_align(bias2.memory_size)
                                if bias2 is not None else 0)

    # expected-output region; size_max is at least scale2's aligned size,
    # so this never overlaps the parameters
    check_addr = scale2_addr + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # every word starts as 100 (presumably so untouched locations are
    # distinguishable from zero-valued results; confirm)
    mem = mem + [100]

    # last argument of each set_memory call: the larger of
    # (elements per AXI word) and the relevant parallelism
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich1))

    axi.set_memory(mem, vweight1, memimg_datawidth,
                   weight1_dtype.width, weight1_addr,
                   max(int(math.ceil(axi_datawidth / weight1_dtype.width)), par_ich1))
    if bias1_shape is not None:
        axi.set_memory(mem, vbias1, memimg_datawidth,
                       bias1_dtype.width, bias1_addr,
                       max(int(math.ceil(axi_datawidth / bias1_dtype.width)), par_och1))
    if scale1_shape is not None:
        axi.set_memory(mem, vscale1, memimg_datawidth,
                       scale1_dtype.width, scale1_addr,
                       max(int(math.ceil(axi_datawidth / scale1_dtype.width)), par_och1))

    axi.set_memory(mem, vweight2, memimg_datawidth,
                   weight2_dtype.width, weight2_addr,
                   max(int(math.ceil(axi_datawidth / weight2_dtype.width)), par_ich2))
    if bias2_shape is not None:
        axi.set_memory(mem, vbias2, memimg_datawidth,
                       bias2_dtype.width, bias2_addr,
                       max(int(math.ceil(axi_datawidth / bias2_dtype.width)), par_och2))
    if scale2_shape is not None:
        axi.set_memory(mem, vscale2, memimg_datawidth,
                       scale2_dtype.width, scale2_addr,
                       max(int(math.ceil(axi_datawidth / scale2_dtype.width)), par_och2))

    # expected output
    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och2))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes an active-low reset; derive an active-high one
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    # free-running counter, incremented by the 'seq' block, used to measure
    # execution cycles
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE(review): ctrl is handed to vthread.Thread below and is presumably
    # translated from its source; keep its body to simple statements.
    def ctrl():
        for i in range(100):
            pass

        # set custom addresses
        ng.sim.set_global_addrs(_saxi, tmp_addr, out.addr, act.addr, variable_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                            + y * out.aligned_shape[2] * out.aligned_shape[3]
                            + x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                            + y * out.aligned_shape[2] * out.aligned_shape[3]
                            + x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)
                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    # instantiate the generated design inside the testbench
    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # fixed-delay 'finish' task so the simulation is bounded even if ctrl
    # never reaches vthread.finish()
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # for verilator, drop a trailing line that starts with '-'
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 7
0
def run(a_shape=(7, 15),
        b_shape=(7, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Build, check and simulate an ONNX-imported element-wise addition design.

    The function exports a ``MatrixAdd`` model to ONNX, imports it with
    ``ng.from_onnx``, converts it to a Veriloggen module, evaluates it in
    software with ``ng.eval`` and finally runs a testbench that compares the
    hardware output 'c' with the software result word by word.

    Args:
        a_shape: Shape given to ``torch.randn`` for the dummy input 'a'.
        b_shape: Shape given to ``torch.randn`` for the dummy input 'b'.
        a_dtype: NNgen dtype of input 'a'.
        b_dtype: NNgen dtype of input 'b'.
        c_dtype: NNgen dtype of output 'c'.
        par: Passed to ``op.attribute(par=...)`` of every ``ng.add`` operator
            and used when placing data in the memory image.
        axi_datawidth: Passed as the 'maxi_datawidth' config entry and as the
            data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name for ``simulation.Simulator``.
        outputfile: Passed to ``sim.run``; defaults to this script's base
            name with a '.out' suffix.

    Returns:
        The text returned by the simulator run. For 'verilator', the last
        line is dropped when it starts with '-'.
    """

    # model definition
    model = MatrixAdd()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_add.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model,
                      dummy_inputs,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype, 'b': b_dtype, 'c': c_dtype}

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=ng.int32,
                               default_variable_dtype=ng.int32,
                               default_constant_dtype=ng.int32,
                               default_operator_dtype=ng.int32,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=False)

    # set attribute
    # only the add operators receive the parallelism attribute
    for op in operators.values():
        if isinstance(op, ng.add):
            op.attribute(par=par)

    # create target hardware
    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    targ = ng.to_veriloggen([c],
                            'onnx_matrix_add',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    # deterministic inputs: a cycles through 0..4, b through 0..5 (offset by 100)
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6]

    # expected output computed by the NNgen software evaluator
    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # exec on pytorch
    model_a = va.astype(np.float32)
    model_b = vb.astype(np.float32)
    # if a placeholder carries an axis permutation, apply its reverse
    # before feeding the PyTorch model
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a),
                    torch.from_numpy(model_b)).detach().numpy()
    # the result is permuted back using a's permutation
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # relative error against the scaled PyTorch result; the small constant
    # avoids division by zero. max_c_err is only used by the disabled check
    # below.
    c_diff = vc - scaled_model_c
    c_err = c_diff / (scaled_model_c + 0.00000001)
    max_c_err = np.max(np.abs(c_err))

    # if max_c_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_c_err)

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # layout: [placeholders] [parameters] [expected output] [temporary area],
    # each region start rounded up to a multiple of 4096
    variable_addr = int(
        math.ceil(
            max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # every word starts as 100 (presumably so untouched locations are
    # distinguishable from zero-valued results; confirm)
    mem = mem + [100]

    # placeholder
    # last argument: the larger of (elements per AXI word) and par
    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    # exported parameter data is stored with an element width of 8 bits
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes an active-low reset; derive an active-high one
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    # free-running counter, incremented by the 'seq' block, used to measure
    # execution cycles
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # product of every dimension of c except the last: the number of
    # innermost rows to check
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # NOTE(review): ctrl is handed to vthread.Thread below and is presumably
    # translated from its source; keep its body to simple statements.
    def ctrl():
        # idle iterations before touching the design
        # (presumably to let reset settle; confirm)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # rows are spaced by c.aligned_shape[-1] words, but only the first
        # c.shape[-1] words of each row are compared
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    # instantiate the generated design inside the testbench
    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # fixed-delay 'finish' task so the simulation is bounded even if ctrl
    # never reaches vthread.finish()
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # for verilator, drop a trailing line that starts with '-'
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 8
0
def run(act_shape=(1, 7, 7, 3),
        weight0_shape=(9, 3, 3, 3),
        weight1_shape=(9, 3, 3, 9),
        act_dtype=ng.int32,
        weight_dtype=ng.int32,
        out_dtype=ng.int32,
        stride0=1,
        stride1=1,
        padding0=0,
        padding1=0,
        with_batchnorm0=False,
        with_batchnorm1=False,
        act_func0='relu',
        act_func1='relu',
        disable_fusion=False,
        par_ich=1,
        par_och=1,
        par_col=1,
        par_row=1,
        concur_och=None,
        stationary='filter',
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Export a two-layer Conv2d PyTorch model via ONNX and simulate it.

    The function builds ``Conv2d -> [BatchNorm2d] -> [activation]`` twice,
    exports it to ``onnx_matrix_conv2d_conv2d.onnx`` in the current
    directory, imports that file with ``ng.from_onnx``, quantizes it,
    generates hardware with ``ng.to_veriloggen`` and runs a self-checking
    test bench that compares the hardware output in memory against the
    result of ``ng.eval``.

    Args:
        act_shape: Shape of the ``act`` input. Axes 1 and 3 are swapped
            before the tensor is handed to PyTorch.
        weight0_shape: Layer-0 weight shape; index 3 is used as the input
            channel count, index 0 as the output channel count and index 1
            as the kernel size.
        weight1_shape: Same as ``weight0_shape`` for layer 1.
        act_dtype: NNgen dtype of the input placeholder.
        weight_dtype: NNgen dtype of variables and constants.
        out_dtype: Default NNgen dtype of operators; its width is also used
            when storing and reading the verification data.
        stride0, stride1: Stride of each convolution.
        padding0, padding1: Padding of each convolution.
        with_batchnorm0, with_batchnorm1: Append ``BatchNorm2d`` after the
            corresponding convolution.
        act_func0, act_func1: ``'relu'`` or ``'leaky_relu'``; any other
            value adds no activation layer.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        par_ich, par_och, par_col, par_row, concur_och, stationary:
            Hardware attributes applied to every ``ng.conv2d`` operator.
        chunk_size: Alignment used for the parameter array, the address
            layout and the hardware configuration.
        axi_datawidth: Data width of the AXI master interface.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test-bench Verilog is written there.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with a ``.out`` suffix.

    Returns:
        The simulator output text. For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is removed.
    """

    # model definition
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3],
                  weight0_shape[0],
                  weight0_shape[1],
                  stride=stride0,
                  padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func0 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    layers.append(
        nn.Conv2d(weight1_shape[3],
                  weight1_shape[0],
                  weight1_shape[1],
                  stride=stride1,
                  padding=padding1))

    if with_batchnorm1:
        layers.append(nn.BatchNorm2d(weight1_shape[0]))

    if act_func1 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func1 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_conv2d.onnx'
    # act_shape is given with the channel axis last; swap axes 1 and 3 to
    # obtain the layout passed to the PyTorch model.
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # ONNX to NNgen
    # NOTE(review): 'out' is mapped to act_dtype here while the verification
    # data below is stored and read with out_dtype.width -- confirm that the
    # two are meant to be equal for every caller.
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '1.weight': weight_dtype,
        'out': act_dtype
    }

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=out_dtype,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=disable_fusion)

    # default linear quantization
    if act_dtype.width >= 8:
        value_ranges = {'act': (-120, 120)}
    else:
        value_ranges = {
            'act': (-(2**(act_dtype.width - 1)), (2**(act_dtype.width - 1)))
        }

    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            # 'stationary' was previously accepted by run() but never
            # applied to the operators.
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och,
                         stationary=stationary)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    targ = ng.to_veriloggen([out],
                            'onnx_matrix_conv2d_conv2d',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # verification data
    # if act_dtype.width > 4:
    #    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [11] + [1]
    # else:
    #    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [5] + [1]

    #vact = np.ones(act.shape)
    # Random input clipped to [-3, 3] and then scaled so that 3.0 maps to
    # the largest positive value of the signed width derived from the
    # quantization range.
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2**(vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    # reference result from the NNgen software evaluation
    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None:
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # Relative error between the quantized and the float model. The value
    # is only computed; the threshold check below is disabled.
    out_diff = vout - scaled_model_out
    out_err = out_diff / (scaled_model_out + 0.00000001)
    max_out_err = np.max(np.abs(out_err))

    # if max_out_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    # Regions are laid out back to back, each starting on a chunk_size
    # boundary: input | parameters | expected output | temporary area.
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # every word of the image starts out as 100 instead of 0
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, out_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control sequence handed to vthread.Thread below. Its statements are
    # kept unchanged on purpose (presumably veriloggen derives the test
    # controller from this function's source -- confirm before restyling).
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, out.addr,
                            out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, check_addr,
                            out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch, ') orig: ', orig,
                                  ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # hard stop so that a hung design still terminates the simulation
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # 'lines' may be empty when the simulator produced no output at all
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 9
0
def run(act_shape=(1, 7, 7, 15),
        act_dtype=ng.int32,
        out_dtype=ng.int32,
        factors=(1, 2, 2, 1),
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Generate and simulate hardware for a single ``ng.upsampling2d``.

    A placeholder ``act`` is fed into ``ng.upsampling2d``; the expected
    result is computed with ``ng.verify.upsampling2d`` and stored in the
    memory image next to the input. A test bench then starts the generated
    design and compares every output word with the expected word.

    Args:
        act_shape: Shape of the input placeholder.
        act_dtype: NNgen dtype of the input.
        out_dtype: NNgen dtype of the output.
        factors: Upsampling factors passed to ``ng.upsampling2d``.
        par: Parallelism passed to ``ng.upsampling2d``; also used as the
            minimum alignment count when writing the memory image.
        axi_datawidth: Data width of the AXI master interface.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test-bench Verilog is written there.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with a ``.out`` suffix.

    Returns:
        The simulator output text. For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is removed.
    """

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    out = ng.upsampling2d(act, factors=factors, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out],
                            'matrix_upsampling2d',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape)

    vout = ng.verify.upsampling2d(vact, factors=factors, dtype=out_dtype)

    # to memory image
    # The expected output goes one 4096-aligned region past the highest of
    # the input/output addresses; the temporary area follows it.
    size_max = int(math.ceil(
        max(act.memory_size, out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # every word of the image starts out as 100 instead of 0
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth, act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))
    axi.set_memory(mem, vout, memimg_datawidth, out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control sequence handed to vthread.Thread below. Its statements are
    # kept unchanged on purpose (presumably veriloggen derives the test
    # controller from this function's source -- confirm before restyling).
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, out.addr,
                            out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, check_addr,
                            out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch, ') orig: ', orig,
                                  ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # hard stop so that a hung design still terminates the simulation
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # 'lines' may be empty when the simulator produced no output at all
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 10
0
def run(act_shape=(1, 32, 32, 3),
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        with_batchnorm=True, disable_fusion=False,
        conv2d_par_ich=1, conv2d_par_och=1, conv2d_par_col=1, conv2d_par_row=1,
        conv2d_concur_och=None, conv2d_stationary='filter',
        pool_par=1, elem_par=1,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Export torchvision ResNet-18 via ONNX and simulate the NNgen design.

    The model (with its ``fc`` layer replaced by a 10-output ``nn.Linear``)
    is exported to ``resnet18.onnx`` in the current directory, imported
    with ``ng.from_onnx``, quantized, turned into hardware and simulated by
    a self-checking test bench. The per-batch maximum and its index are
    printed for both the NNgen evaluation and the scaled PyTorch output.

    Args:
        act_shape: Shape of the ``act`` input. Axes 1 and 3 are swapped
            before the tensor is handed to PyTorch.
        act_dtype: NNgen dtype of the input placeholder.
        weight_dtype: NNgen dtype of variables and constants.
        bias_dtype: Default NNgen dtype of biases.
        scale_dtype: Default NNgen dtype of scales.
        out_dtype: Default NNgen dtype of operators, including the output.
        with_batchnorm: Must be True.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: Hardware attributes applied
            to every ``ng.conv2d`` operator.
        pool_par: ``par`` attribute of pooling operators.
        elem_par: ``par`` attribute of element-wise operators.
        chunk_size: Alignment used for the parameter array, the address
            layout and the hardware configuration.
        axi_datawidth: Data width of the AXI master interface.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test-bench Verilog is written there.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with a ``.out`` suffix.

    Returns:
        The simulator output text. For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is removed.

    Raises:
        ValueError: If ``with_batchnorm`` is false.
    """

    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=False)

    # NOTE(review): this only overwrites the attribute; nothing here
    # re-creates conv1's weights, so a channel count other than the stock
    # one is presumably unsupported -- confirm.
    model.conv1.in_channels = act_shape[-1]
    model.fc = nn.Linear(in_features=model.fc.in_features,
                         out_features=10, bias=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=out_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # default linear quantization
    value_ranges = {'act': (0, 255)}

    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    # chunk_size is passed to the hardware as well, so that it agrees with
    # the parameter array and the address layout computed below.
    targ = ng.to_veriloggen([out], 'onnx_resnet18', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'chunk_size': chunk_size})

    # verification data
    # Random input clipped to [-3, 3] and then scaled so that 3.0 maps to
    # the largest positive value of the signed width derived from the
    # quantization range.
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2 ** (vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    # reference result from the NNgen software evaluation
    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    # the permutation is only applicable when the output has the same rank
    # as the input
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # report the arg-max of both results for a quick visual comparison
    mout = scaled_model_out.astype(np.int64)
    for bat in range(vout.shape[0]):
        vout_max = np.max(vout[bat])
        vout_max_index = list(vout[bat]).index(vout_max)
        mout_max = np.max(mout[bat])
        mout_max_index = list(mout[bat]).index(mout_max)
        print("# vout[%d]: max = %d, index = %d" % (bat, vout_max, vout_max_index))
        print("# mout[%d]: max = %d, index = %d" % (bat, mout_max, mout_max_index))

    # out_diff = vout - scaled_model_out
    # out_err = out_diff / (scaled_model_out + 0.00000001)
    # max_out_err = np.max(np.abs(out_err))
    # breakpoint()

    # if max_out_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    # Regions are laid out back to back, each starting on a chunk_size
    # boundary: input | parameters | expected output | temporary area.
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 256 // memimg_datawidth], dtype=np.int64)
    # every word of the image starts out as 100 instead of 0
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    # Stored with the output dtype's width: the output operator uses
    # out_dtype and the check below reads words of out_dtype.width.
    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Control sequence handed to vthread.Thread below (presumably veriloggen
    # derives the test controller from this function's source -- confirm
    # before restyling).
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # The output is indexed as (batch, class), matching the 1-D
        # per-batch handling of vout above; a 4-D walk would index
        # out.shape beyond its last axis.
        ok = True
        for bat in range(out.shape[0]):
            for x in range(out.shape[1]):
                orig = memory.read_word(bat * out.aligned_shape[1] + x,
                                        out.addr, out_dtype.width)
                check = memory.read_word(bat * out.aligned_shape[1] + x,
                                         check_addr, out_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False
                # else:
                #    print('OK (', bat, x,
                #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # hard stop so that a hung design still terminates the simulation
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # 'lines' may be empty when the simulator produced no output at all
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 11
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        bias_shape=None,
        scale_shape=None,
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        bias_dtype=ng.int32,
        scale_dtype=ng.int32,
        c_dtype=ng.int32,
        rshift_mul=None,
        rshift_sum=None,
        rshift_out=None,
        act_func=None,
        par_left_col=1,
        par_left_row=1,
        par_out_col=1,
        concur_out_col=None,
        stationary='right',
        left_ram_size=None,
        right_ram_size=None,
        bias_ram_size=None,
        scale_ram_size=None,
        out_ram_size=None,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Generate and simulate hardware for a single ``ng.matmul``.

    Placeholders ``a`` and ``b`` (and optionally ``bias`` and ``scale``)
    feed ``ng.matmul`` with ``transposed_b`` enabled. The expected result
    is computed with ``ng.verify.matmul`` from the same integer test data
    that is written into the memory image, and a test bench compares every
    output word with the expected word.

    Args:
        a_shape: Shape of the left operand.
        b_shape: Shape of the right operand.
        bias_shape: Shape of the bias placeholder, or None for no bias.
        scale_shape: Shape of the scale placeholder, or None for no scale.
        a_dtype, b_dtype, bias_dtype, scale_dtype, c_dtype: NNgen dtypes of
            the operands and of the result.
        rshift_mul, rshift_sum, rshift_out, act_func: Forwarded to
            ``ng.matmul`` and ``ng.verify.matmul``.
        par_left_col, par_left_row, par_out_col, concur_out_col,
        stationary: Hardware parallelism/scheduling options forwarded to
            ``ng.matmul``.
        left_ram_size, right_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: RAM size options forwarded to ``ng.matmul``.
        axi_datawidth: Data width of the AXI master interface.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test-bench Verilog is written there.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with a ``.out`` suffix.

    Returns:
        The simulator output text. For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is removed.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    if bias_shape is not None:
        bias = ng.placeholder(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.placeholder(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    transposed_a = False
    transposed_b = True

    c = ng.matmul(a, b, bias, scale, transposed_a, transposed_b, rshift_mul,
                  rshift_sum, rshift_out, act_func, c_dtype, ng.int32,
                  ng.int32, 'matmul', par_left_col, par_left_row, par_out_col,
                  concur_out_col, stationary, left_ram_size, right_ram_size,
                  bias_ram_size, scale_ram_size, None, None, None,
                  out_ram_size)

    targ = ng.to_veriloggen([c],
                            'matrix_matmul',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [5] - [3]

    if bias is not None:
        vbias = np.arange(bias.length, dtype=np.int64).reshape(
            bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length, dtype=np.int64).reshape(
            scale.shape) % [6]
    else:
        vscale = None

    # The software reference must see the same numeric bias/scale data that
    # is written to memory for the hardware (vbias/vscale), not the
    # placeholder objects, and the same transpose flags as ng.matmul above.
    vc = ng.verify.matmul(va, vb, vbias, vscale, transposed_a, transposed_b,
                          rshift_mul, rshift_sum, rshift_out, act_func,
                          c_dtype, ng.int32, ng.int32, 'matmul',
                          par_left_col, par_left_row, par_out_col,
                          concur_out_col, stationary,
                          left_ram_size, right_ram_size, bias_ram_size,
                          scale_ram_size, None, None, None, out_ram_size,
                          False, a_dtype, b_dtype, bias_dtype, scale_dtype)

    # to memory image
    # Absent operands contribute size 0 and address -1 so that they never
    # win the max() below.
    bias_size = bias.memory_size if bias is not None else 0
    scale_size = scale.memory_size if scale is not None else 0
    bias_addr = bias.addr if bias is not None else -1
    scale_addr = scale.addr if scale is not None else -1

    # The expected output goes one 4096-aligned region past the highest
    # operand/result address; the temporary area follows it.
    size_max = int(math.ceil(
        max(a.memory_size, b.memory_size, bias_size, scale_size,
            c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, bias_addr, scale_addr,
                     c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # every word of the image starts out as 100 instead of 0
    mem = mem + [100]

    axi.set_memory(
        mem, va, memimg_datawidth, a_dtype.width, a.addr,
        max(int(math.ceil(axi_datawidth / a_dtype.width)), par_left_col))

    axi.set_memory(
        mem, vb, memimg_datawidth, b_dtype.width, b.addr,
        max(int(math.ceil(axi_datawidth / b_dtype.width)), par_left_col))

    if bias is not None:
        axi.set_memory(
            mem, vbias, memimg_datawidth, bias_dtype.width, bias.addr,
            max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_out_col))

    if scale is not None:
        axi.set_memory(
            mem, vscale, memimg_datawidth, scale_dtype.width, scale.addr,
            max(int(math.ceil(axi_datawidth / scale_dtype.width)),
                par_out_col))

    axi.set_memory(
        mem, vc, memimg_datawidth, c_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / c_dtype.width)), par_out_col))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control sequence handed to vthread.Thread below. Its statements are
    # kept unchanged on purpose (presumably veriloggen derives the test
    # controller from this function's source -- confirm before restyling).
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(c.shape[0]):
            for j in range(c.shape[1]):
                orig = memory.read_word(i * c.aligned_shape[1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print(i, j, orig, check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # hard stop so that a hung design still terminates the simulation
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # 'lines' may be empty when the simulator produced no output at all
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 12
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        interrupt_name='irq',
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Simulate a design with two ``ng.extern`` ops, driven by interrupts.

    The dataflow is ``c = ((a + b) + (b + a) - a) - b`` where the middle
    addition (opcode 0x1) and an identity copy (opcode 0x2) are ``ng.extern``
    operators whose work is done by the test controller through the memory
    model.  The controller runs the design twice and issues software resets
    at three points: while idle, while an extern request is pending, and
    while a master-AXI write is in flight.  After each run the output ``c``
    is compared word by word against the result of ``ng.eval``.

    Args:
        a_shape: Shape of placeholder ``a`` (also used as the shape of both
            extern operators).
        b_shape: Shape of placeholder ``b``.
        a_dtype: NNgen dtype of ``a``.
        b_dtype: NNgen dtype of ``b``.
        c_dtype: NNgen dtype passed to the two ``ng.add`` operators and used
            as the word width for every memory access in the controller.
        par: ``par`` argument of the two ``ng.add`` operators; also used when
            laying out the memory image.
        axi_datawidth: Value of the ``maxi_datawidth`` config entry and data
            width of the AXI memory model.
        interrupt_name: Value of the ``interrupt_name`` config entry; the
            port with this name is polled as the interrupt line.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test bench is written there as Verilog.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Passed to ``Simulator.run``.  When None it defaults to
            ``<basename of this file>.out``.  The memory image name is
            ``'memimg_' + outputfile``.

    Returns:
        The simulator output as a string.  For ``simtype == 'verilator'`` a
        final line starting with ``'-'`` is dropped.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    d = ng.add(a, b, dtype=c_dtype, par=par)
    e = ng.add(b, a, dtype=c_dtype, par=par)

    # SW returns ng.add(x, y)
    f = ng.extern([d, e], shape=a_shape, opcode=0x1, func=lambda x, y: x + y)
    g = ng.sub(f, a)

    # SW returns g as-is
    h = ng.extern([g], shape=a_shape, opcode=0x2, func=lambda x: x)
    c = ng.sub(h, b)

    targ = ng.to_veriloggen([c],
                            'matrix_extern',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'interrupt_name': interrupt_name
                            })

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [16]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [32] + [16]

    # reference result computed in software from the same dataflow
    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # Largest of the a/b/c footprints, rounded up to a multiple of 4096.
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    # The expected values go right after the highest of a/b/c, and the
    # address handed to ng.sim.set_global_addrs follows the expected values.
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # every word of the image starts out as 100 instead of 0
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    irq = ports[interrupt_name]
    # the design has an active-low reset; the test bench uses active-high
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg_datawidth=memimg_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # number of rows when all leading dimensions of c are flattened
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # NOTE(review): irq_join and ctrl are not called from Python here; ctrl is
    # handed to vthread.Thread below.  They are documented with comments only
    # and their statements are left in place -- confirm the supported syntax
    # against the vthread documentation before restructuring them.

    def irq_join(saxi, irq_bit):
        # Spin until the interrupt line is non-zero, then read the interrupt
        # status register.  Any status other than `irq_bit` is reported as a
        # failure and the simulation is finished.  Otherwise `irq_bit` is
        # written to the IAR register (presumably acknowledging it -- confirm).
        while irq == 0:
            pass
        araddr = ng.control_reg_interrupt_isr * 4
        irq_stat = saxi.read(araddr)

        if irq_stat != irq_bit:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()

        print('# irq stat = %d' % irq_stat)
        awaddr = ng.control_reg_interrupt_iar * 4
        saxi.write(awaddr, irq_bit)

    def ctrl():
        # idle for a while before touching the design
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        # byte addresses of the control registers used below
        araddr_ext_snd = ng.control_reg_extern_send * 4
        awaddr_ext_rcv = ng.control_reg_extern_recv * 4
        awaddr_irq_ier = ng.control_reg_interrupt_ier * 4
        araddr_irq_isr = ng.control_reg_interrupt_isr * 4
        _saxi.write(awaddr_irq_ier, 3)  # irq enable

        ng.sim.sw_rst(_saxi)

        print('# 0st software reset (during idle)')

        for i in range(100):
            pass

        irq_stat = _saxi.read(araddr_irq_isr)
        if irq_stat != 0:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()
        print('# irq stat = %d' %
              irq_stat)  # no irq busy by software reset when idle

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 1st test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software side of the first extern op: f = d + e, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software reset
        # (issued while the second extern request is still unanswered)
        ng.sim.sw_rst(_saxi)

        print('# 1st software reset (before resume)')

        # interrupt expected after the software reset
        irq_join(_saxi, 1)

        # restart
        ng.sim.start(_saxi)

        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software side of the first extern op: f = d + e, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software side of the second extern op: h = g, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # termination (interrupt used in place of ng.sim.wait)
        irq_join(_saxi, 1)
        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # compare the design output at c.addr with the expected values that
        # were stored at check_addr
        ok_1st = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_1st = False
                # else:
                #    print('OK', i, j, orig, check)

        # 2nd test

        # start
        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 2nd test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software side of the first extern op: f = d + e, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # wait until the memory model sees a write-address request, so the
        # reset below lands in the middle of a master-AXI transaction
        while (memory.waddr.awvalid) == 0:
            pass

        ng.sim.sw_rst(_saxi)

        print('# 2nd software reset (during Master AXI transaction)')

        irq_join(_saxi, 1)  # irq busy by software reset

        # restart
        ng.sim.start(_saxi)

        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        # Re-read the opcode here as every other extern-send site does;
        # previously the value printed was left over from the read made
        # before the reset.
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software side of the first extern op: f = d + e, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software side of the second extern op: h = g, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # termination
        irq_join(_saxi, 1)
        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok_2nd = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_2nd = False
                # else:
                #    print('OK', i, j, orig, check)

        # the overall verdict requires both runs to match
        if ok_1st and ok_2nd:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # watchdog: end the simulation even if the controller never finishes
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a final line starting with '-' from verilator output; `lines` is
    # checked first so that empty output does not raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 13
0
def mkTest(ich=3, och=10, ch=64, ksize=3, stride=1, col=28, row=28):
    """Build the simulation test bench for a four-layer CNN design.

    The network is conv2d/max_pool_serial/relu, conv2d/relu/reshape, and two
    matmul layers (the first followed by relu), all on ``ng.int32``.  The
    design is wrapped in a ``Module('test')`` together with an AXI memory
    model, an AXI-Lite controller, a cycle counter and a control thread that
    starts the design, waits for it and prints the elapsed cycle count.

    Args:
        ich: Channel count of the input placeholder.
        och: Number of rows of the last weight matrix ``w3``.
        ch: Number of filters of both convolution layers.
        ksize: Kernel height and width of both convolution layers.
        stride: Stride of both convolution layers in the two middle axes.
        col: Third dimension of the input placeholder shape.
        row: Second dimension of the input placeholder shape.

    Returns:
        The test bench ``Module``.
    """
    conv_strides = (1, stride, stride, 1)
    pool_window = (1, 2, 2, 1)

    # layer 0: conv2d, max_pool_serial, relu
    input_layer = ng.placeholder(
        ng.int32, shape=(1, row, col, ich), name='input_layer')
    w0 = ng.variable(ng.int32, shape=(ch, ksize, ksize, ich), name='w0')
    conv0 = ng.conv2d(input_layer, w0, strides=conv_strides)
    pool0 = ng.max_pool_serial(conv0, ksize=pool_window, strides=pool_window)
    act0 = ng.relu(pool0)

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(
        ng.int32, shape=(ch, ksize, ksize, act0.shape[-1]), name='w1')
    conv1 = ng.conv2d(act0, w1, strides=conv_strides)
    act1 = ng.relu(conv1)
    flat1 = ng.reshape(act1, [-1])

    # layer 2: full-connection
    w2 = ng.variable(ng.int32, shape=(16, flat1.shape[-1]), name='w2')
    fc2 = ng.matmul(flat1, w2, transposed_b=True)
    act2 = ng.relu(fc2)

    # layer 3: full-connection
    w3 = ng.variable(ng.int32, shape=(och, act2.shape[-1]), name='w3')
    output_layer = ng.matmul(act2, w3, transposed_b=True, name='output_layer')

    targ = ng.to_veriloggen([output_layer], 'cnn')
    #targ = ng.to_ipxact([output_layer], 'cnn')

    # test controller
    tb = Module('test')
    tb.copy_params(targ)
    sim_ports = tb.copy_sim_ports(targ)
    clk = sim_ports['CLK']
    resetn = sim_ports['RESETN']
    # active-high view of the design's active-low reset
    rst = tb.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    memory = axi.AxiMemoryModel(tb, 'memory', clk, rst, mem_addrwidth=23)
    memory.connect(sim_ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(tb, '_saxi', clk, rst, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # timer: free-running counter incremented by the 'seq' sequencer
    time_counter = tb.Reg('time_counter', 32, initval=0)
    tick = Seq(tb, 'seq', clk, rst)
    tick(time_counter.inc())

    # ctrl is not called from Python here; it is handed to vthread.Thread.
    # Its statements are therefore kept exactly as they were.
    def ctrl():
        # idle for a while before starting the design
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    ctrl_thread = vthread.Thread(tb, 'th_ctrl', clk, rst, ctrl)
    ctrl_thread.start()

    # unit under test, wired to the copied parameters and ports
    tb.Instance(
        targ,
        'uut',
        params=tb.connect_params(targ),
        ports=tb.connect_ports(targ),
    )

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(tb, clk, hperiod=5)
    reset_seq = simulation.setup_reset(
        tb, resetn, tb.make_reset(), period=100, polarity='low')

    # watchdog: finish the simulation after a fixed delay
    reset_seq.add(Delay(10000000), Systask('finish'))

    return tb
Exemplo n.º 14
0
def run(a_shape=(15, 15), b_shape=(15, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Simulate a design with two ``ng.extern`` ops, serviced by polling.

    The dataflow is ``c = ((a + b) + (b + a) - a) - b`` where the middle
    addition (opcode 0x1) and an identity copy (opcode 0x2) are ``ng.extern``
    operators.  The test controller polls the extern-send register until it
    is non-zero, performs the operation through the memory model, and writes
    1 to the extern-recv register.  Finally the output ``c`` is compared word
    by word with expected values built from the ``ng.verify`` functions.

    Args:
        a_shape: Shape of placeholder ``a`` (also used as the shape of both
            extern operators).
        b_shape: Shape of placeholder ``b``.
        a_dtype: NNgen dtype of ``a``.
        b_dtype: NNgen dtype of ``b``.
        c_dtype: NNgen dtype passed to the two ``ng.add`` operators and used
            as the word width for every memory access in the controller.
        par: ``par`` argument of the two ``ng.add`` operators; also used when
            laying out the memory image.
        axi_datawidth: Value of the ``maxi_datawidth`` config entry and data
            width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test bench is written there as Verilog.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Passed to ``Simulator.run``.  When None it defaults to
            ``<basename of this file>.out``.  The memory image name is
            ``'memimg_' + outputfile``.

    Returns:
        The simulator output as a string.  For ``simtype == 'verilator'`` a
        final line starting with ``'-'`` is dropped.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    d = ng.add(a, b, dtype=c_dtype, par=par)
    e = ng.add(b, a, dtype=c_dtype, par=par)

    # SW returns ng.add(x, y)
    f = ng.extern([d, e], shape=a_shape, opcode=0x1)
    g = ng.sub(f, a)

    # SW returns g as-is
    h = ng.extern([g], shape=a_shape, opcode=0x2)
    c = ng.sub(h, b)

    targ = ng.to_veriloggen([c], 'matrix_extern', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [16]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [32] + [16]

    # expected values: the same dataflow evaluated with ng.verify, with the
    # two extern ops given their software functions explicitly
    vd = ng.verify.add(va, vb, dtype=c_dtype, par=par,
                       x_dtype=a_dtype, y_dtype=b_dtype)
    ve = ng.verify.add(vb, va, dtype=c_dtype, par=par,
                       x_dtype=b_dtype, y_dtype=a_dtype)

    vf = ng.verify.extern([vd, ve], shape=a_shape, opcode=0x1,
                          func=lambda x, y: x + y)
    vg = ng.verify.sub(vf, va,
                       x_dtype=c_dtype, y_dtype=c_dtype)

    vh = ng.verify.extern([vg], shape=a_shape, opcode=0x2,
                          func=lambda x: x)
    vc = ng.verify.sub(vh, vb,
                       x_dtype=c_dtype, y_dtype=c_dtype)

    # to memory image
    # Largest of the a/b/c footprints, rounded up to a multiple of 4096.
    size_max = int(math.ceil(max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    # The expected values go right after the highest of a/b/c, and the
    # address handed to ng.sim.set_global_addrs follows the expected values.
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # every word of the image starts out as 100 instead of 0
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design has an active-low reset; the test bench uses active-high
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg_datawidth=memimg_datawidth,
                                memimg=mem, memimg_name=memimg_name)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # number of rows when all leading dimensions of c are flattened
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # NOTE(review): ctrl is not called from Python here; it is handed to
    # vthread.Thread below.  It is documented with comments only and its
    # statements are left in place -- confirm the supported syntax against
    # the vthread documentation before restructuring it.
    def ctrl():
        # idle for a while before touching the design
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        # from extern-send
        # poll until the register reads non-zero; the value is printed as the
        # opcode
        araddr = ng.control_reg_extern_send * 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        print('# opcode = %d' % v)

        # software side of the first extern op: f = d + e, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        awaddr = ng.control_reg_extern_recv * 4
        _saxi.write(awaddr, 1)

        # from extern-send
        araddr = ng.control_reg_extern_send * 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        print('# opcode = %d' % v)

        # software side of the second extern op: h = g, word by word
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        awaddr = ng.control_reg_extern_recv * 4
        _saxi.write(awaddr, 1)

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # compare the design output at c.addr with the expected values that
        # were stored at check_addr
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # watchdog: end the simulation even if the controller never finishes
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a final line starting with '-' from verilator output; `lines` is
    # checked first so that empty output does not raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Exemplo n.º 15
0
def mkTest(n_input=784, n_classes=10):
    """Build the simulation test bench for a three-layer MLP design.

    The network is three ``ng.matmul`` layers on ``ng.int32`` with
    ``transposed_b=True``; the first two are followed by ``ng.relu``.  The
    design is wrapped in a ``Module('test')`` together with an AXI memory
    model, an AXI-Lite controller, a cycle counter and a control thread that
    starts the design, waits for it and prints the elapsed cycle count.

    Args:
        n_input: Length of the input placeholder; also both dimensions of the
            first two weight matrices and the second dimension of the third.
        n_classes: Number of rows of the last weight matrix.

    Returns:
        The test bench ``Module``.
    """
    # create target hardware
    x = ng.placeholder(ng.int32, shape=[n_input])

    hidden_shape = (n_input, n_input)
    w1 = ng.variable(ng.int32, shape=hidden_shape, name='h1')
    w2 = ng.variable(ng.int32, shape=hidden_shape, name='h2')
    w3 = ng.variable(ng.int32, shape=(n_classes, n_input), name='out')

    # two hidden layers, each matmul followed by relu
    fc1 = ng.matmul(x, w1, transposed_b=True)
    act1 = ng.relu(fc1)

    fc2 = ng.matmul(act1, w2, transposed_b=True)
    act2 = ng.relu(fc2)

    # output layer has no activation
    out = ng.matmul(act2, w3, transposed_b=True)

    targ = ng.to_veriloggen([out], 'mlp')
    #targ = ng.to_ipxact([model], 'mlp')

    # test controller
    tb = Module('test')
    tb.copy_params(targ)
    sim_ports = tb.copy_sim_ports(targ)
    clk = sim_ports['CLK']
    resetn = sim_ports['RESETN']
    # active-high view of the design's active-low reset
    rst = tb.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    memory = axi.AxiMemoryModel(tb, 'memory', clk, rst)
    memory.connect(sim_ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(tb, '_saxi', clk, rst, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # timer: free-running counter incremented by the 'seq' sequencer
    time_counter = tb.Reg('time_counter', 32, initval=0)
    tick = Seq(tb, 'seq', clk, rst)
    tick(time_counter.inc())

    # ctrl is not called from Python here; it is handed to vthread.Thread.
    # Its statements are therefore kept exactly as they were.
    def ctrl():
        # idle for a while before starting the design
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    ctrl_thread = vthread.Thread(tb, 'th_ctrl', clk, rst, ctrl)
    ctrl_thread.start()

    # unit under test, wired to the copied parameters and ports
    tb.Instance(
        targ,
        'uut',
        params=tb.connect_params(targ),
        ports=tb.connect_ports(targ),
    )

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(tb, clk, hperiod=5)
    reset_seq = simulation.setup_reset(
        tb, resetn, tb.make_reset(), period=100, polarity='low')

    # watchdog: finish the simulation after a fixed delay
    reset_seq.add(Delay(1000000), Systask('finish'))

    return tb