示例#1
0
def test_depthwise():
    """Test a depthwise operation that is preceded by a DMA operation.

    The DMA copies 96 bytes from region 0 to region 1 and the depthwise
    operation uses the DMA destination as its weights, so the generated
    command stream is checked for the DMA setup, a DMA wait and the
    depthwise operation itself.
    """
    dma_src = NpuAddressRange(region=0, address=0x40, length=96)
    dma_dst = NpuAddressRange(region=1, address=0x10000, length=96)
    dma_op = NpuDmaOperation(dma_src, dma_dst)

    op = NpuConvDepthWiseOperation()
    input_quant = NpuQuantization(scale_f32=0.007843138, zero_point=128)
    op.ifm = create_feature_map(
        NpuShape3D(height=64, width=64, depth=8), 1, 0x0, quant=input_quant
    )
    output_quant = NpuQuantization(scale_f32=0.062745101749897, zero_point=128)
    op.ofm = create_feature_map(
        NpuShape3D(height=64, width=64, depth=8), 1, 0x8000, quant=output_quant
    )
    op.kernel = NpuKernel(3, 3)
    op.padding = NpuPadding(top=1, left=1, right=1, bottom=1)
    # The weights are read from the address range the DMA writes to.
    op.weights = [dma_dst]
    op.biases = [NpuAddressRange(region=0, address=0, length=80)]
    # Width and height of -1 leave the choice of block size to vela.
    op.block_config = NpuShape3D(height=-1, width=-1, depth=8)

    cmds = npu_generate_register_command_stream(
        [dma_op, op], NpuAccelerator.Ethos_U55_128
    )

    # (checker, command, expected value), verified in this order.
    expected = (
        (check_cmd0, cmd0.NPU_SET_DMA0_SRC_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_DMA0_SRC, 0x40),
        (check_cmd0, cmd0.NPU_SET_DMA0_DST_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_DMA0_DST, 0x10000),
        (check_cmd1, cmd1.NPU_SET_DMA0_LEN, 96),
        (check_cmd0, cmd0.NPU_OP_DMA_START, 0),
        # A DMA WAIT should have been inserted
        (check_cmd0, cmd0.NPU_OP_DMA_WAIT, 0),
        (check_cmd0, cmd0.NPU_OP_DEPTHWISE, 0),
    )
    for check, command, value in expected:
        check(cmds, command, value)

    # A block height and width must have been chosen by vela.
    chosen_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    chosen_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    assert chosen_height > 0
    assert chosen_width > 0
示例#2
0
def translate(tir_module, params):
    """Compile a TIR module for the NPU into a command stream.

    The accelerator configuration is not passed in; it is obtained from
    ``vela_api.get_accelerator_config()``.

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map

    Returns
    -------
    cs : str
        A hex string of the bytes of the command stream
    encoded_constants : str
        The constant data returned by ``assign_addresses``; described
        upstream as a hex string of the concatenated encoded weights,
        encoded biases and scales.
    base_addresses : List[util.BaseAddress]
        base addresses to be used by the driver
    """
    # Regions 0, 3 and 4 are already used for input, output and constants
    # respectively (see _get_regions()), which leaves 5, 2 and 1 as the
    # candidates for scratch memory.
    # NOTE(review): the original comment says the NPU has 6 usable regions
    # "ranging from 0-6"; that range holds 7 values -- confirm the count.
    scratch_candidates = [5, 2, 1]
    scratch_analysis = analyze_scratch_memory_acesses(tir_module, scratch_candidates)
    scratch_region_map, dynamic_allocation_size, dynamic_allocation_region = scratch_analysis

    buffer_info = extract_buffer_info(tir_module, params)
    npu_ops = [
        translate_ethosu_tir_call_extern(call_extern)
        for call_extern in extract_call_extern_list(tir_module)
    ]
    npu_ops, constant_data = assign_addresses(buffer_info, npu_ops, scratch_region_map)
    base_addresses = extract_param_base_addresses(
        tir_module, buffer_info, scratch_region_map
    )

    # Only add a runtime-allocated entry when the analysis reported a
    # non-zero dynamic allocation size.
    if dynamic_allocation_size:
        dynamic_entry = util.BaseAddress(
            name="dynamic_allocation",
            primfunc_param_idx=None,
            region=dynamic_allocation_region,
            size=dynamic_allocation_size,
            is_runtime_allocation=True,
        )
        base_addresses.append(dynamic_entry)

    accel_config = vela_api.get_accelerator_config()
    command_stream = vapi.npu_generate_register_command_stream(npu_ops, accel_config)
    driver_payload = vapi.npu_create_driver_payload(command_stream, accel_config)
    return driver_payload.hex(), constant_data, base_addresses
示例#3
0
def test_two_operations():
    """Test code generation for a list of two operations.

    A fully connected operation followed by an average pool operation is
    compiled for Ethos-U55-64 and the resulting commands are checked.
    """
    ops = [create_fully_connected_op(), create_avg_pool_op()]
    cmds = npu_generate_register_command_stream(ops, NpuAccelerator.Ethos_U55_64)

    # (command, expected value), verified in this order.
    expected = (
        (cmd0.NPU_OP_POOL, 1),
        (cmd0.NPU_OP_CONV, 0),
        (cmd0.NPU_SET_BLOCKDEP, 0),
        # The operations are not dependent, so expect a blockdep 3
        (cmd0.NPU_SET_BLOCKDEP, 3),
    )
    for command, value in expected:
        check_cmd0(cmds, command, value)

    assert len(cmds) > 10
示例#4
0
def test_dma_op():
    """Test a DMA operation followed by an average pool operation.

    The DMA destination is the first address range of the pool operation's
    feature map, so a DMA wait is expected before the pool operation.

    NOTE(review): the original description says the DMA provides the
    contents of the average pool's IFM, but the destination below is taken
    from ``pool_op.ofm`` -- confirm which feature map is intended.
    """
    pool_op = create_avg_pool_op()
    assert pool_op.ofm is not None
    dma_dst = get_address_ranges(pool_op.ofm)[0]
    assert dma_dst is not None
    # The source has the same length as the destination range.
    dma_src = NpuAddressRange(0, 0x24000, dma_dst.length)
    dma_op = NpuDmaOperation(dma_src, dma_dst)

    cmds = npu_generate_register_command_stream(
        [dma_op, pool_op], NpuAccelerator.Ethos_U55_64
    )

    # (command, expected value), verified in this order.
    expected = (
        (cmd0.NPU_OP_DMA_START, 0),
        # A DMA WAIT should have been inserted
        (cmd0.NPU_OP_DMA_WAIT, 0),
        (cmd0.NPU_OP_POOL, 1),
    )
    for command, value in expected:
        check_cmd0(cmds, command, value)
示例#5
0
def translate(tir_module, params):
    """Compile a TIR module for the NPU into a command stream.

    The accelerator configuration is not passed in; it is obtained from
    ``vela_api.get_accelerator_config()``.

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map

    Returns
    -------
    cs : str
        A hex string of the bytes of the command stream
    encoded_constants : str
        The constant data returned by ``assign_addresses``; described
        upstream as a hex string of the concatenated encoded weights,
        encoded biases and scales.
    base_addresses : List[util.BaseAddress]
        base addresses to be used by the driver
    """
    buffer_info = extract_buffer_info(tir_module, params)
    npu_ops = [
        translate_ethosu_tir_call_extern(call_extern)
        for call_extern in extract_call_extern_list(tir_module)
    ]
    npu_ops, constant_data, scratch_size = assign_addresses(buffer_info, npu_ops)
    base_addresses = extract_param_base_addresses(tir_module, buffer_info)

    # A scratch entry is only added when scratch memory is actually needed.
    if scratch_size > 0:
        scratch_entry = util.BaseAddress(
            "scratch",
            None,
            _REGION_MAP[BufferType.scratch],
            scratch_size,
            True,
        )
        base_addresses.append(scratch_entry)

    accel_config = vela_api.get_accelerator_config()
    command_stream = vapi.npu_generate_register_command_stream(npu_ops, accel_config)
    driver_payload = vapi.npu_create_driver_payload(command_stream, accel_config)
    return driver_payload.hex(), constant_data, base_addresses
示例#6
0
def translate(tir_module, params):
    """Compile a TIR module for the NPU into a command stream.

    The accelerator configuration is not passed in; it is obtained from
    ``vela_api.get_accelerator_config()``.

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map

    Returns
    -------
    cs : str
        A hex string of the bytes of the command stream
    encoded_constants : str
        A hex string of the bytes of the constant tensor returned by
        ``assign_addresses`` (described upstream as the concatenated
        encoded weights, encoded biases and scales), or an empty string
        when there is no constant tensor.
    scratch_size : int
        The size of the scratch buffer needed.
    """
    buffer_info = extract_buffer_info(tir_module, params)
    npu_ops = [
        translate_ethosu_tir_call_extern(call_extern)
        for call_extern in extract_call_extern_list(tir_module)
    ]
    npu_ops, constant_tensor, scratch_size = assign_addresses(buffer_info, npu_ops)

    accel_config = vela_api.get_accelerator_config()
    command_stream = vapi.npu_generate_register_command_stream(npu_ops, accel_config)
    driver_payload = vapi.npu_create_driver_payload(command_stream, accel_config)

    # A missing constant tensor is reported as an empty hex string.
    if constant_tensor is None:
        encoded_constants = ""
    else:
        encoded_constants = constant_tensor.tobytes().hex()
    return driver_payload.hex(), encoded_constants, scratch_size
示例#7
0
def test_conv2d():
    """Test command stream generation for a conv2d operation.

    A single NpuConv2DOperation is compiled for Ethos-U55-128 and the
    register commands for IFM, OFM, kernel, weights, scales, activation and
    block configuration are checked against fixed expected values.
    """
    op = NpuConv2DOperation()

    ifm_shape = NpuShape3D(height=30, width=62, depth=46)
    ifm_quant = NpuQuantization(scale_f32=0.007843138, zero_point=128)
    op.ifm = create_feature_map(ifm_shape, 1, 512, quant=ifm_quant)

    ofm_shape = NpuShape3D(height=30, width=31, depth=46)
    ofm_quant = NpuQuantization(scale_f32=0.20392157, zero_point=128)
    op.ofm = create_feature_map(ofm_shape, 1, 0x14E40, quant=ofm_quant)

    op.kernel = NpuKernel(3, 2, 2, 1)
    op.weights = [NpuAddressRange(region=0, address=0, length=7696)]
    op.biases = [NpuAddressRange(region=0, address=32000, length=464)]
    op.padding = NpuPadding(top=0, left=0, right=1, bottom=1)
    op.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
    # In this example we assume that the weights were compressed with ofm
    # depth 16; width and height are set to -1 so that vela chooses a
    # suitable block width and height.
    op.block_config = NpuShape3D(height=-1, width=-1, depth=16)

    cmds = npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U55_128)

    # (checker, command, expected value), verified in this order.
    expected = (
        # IFM
        (check_cmd0, cmd0.NPU_SET_IFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE0, 512),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT0_M1, 29),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT1_M1, 29),
        (check_cmd0, cmd0.NPU_SET_IFM_WIDTH0_M1, 61),
        (check_cmd0, cmd0.NPU_SET_IFM_DEPTH_M1, 45),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_Y, 2852),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_X, 46),
        (check_cmd0, cmd0.NPU_SET_IFM_ZERO_POINT, 128),
        (check_cmd0, cmd0.NPU_SET_IFM_PRECISION, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_UPSCALE, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_TOP, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_LEFT, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_BOTTOM, 1),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_RIGHT, 1),
        # OFM (base address 85568 == 0x14E40)
        (check_cmd0, cmd0.NPU_SET_OFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE0, 85568),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT0_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT1_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH0_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_DEPTH_M1, 45),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_Y, 1426),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_X, 46),
        (check_cmd0, cmd0.NPU_SET_OFM_ZERO_POINT, 128),
        (check_cmd0, cmd0.NPU_SET_OFM_PRECISION, 0),
        # Kernel
        (check_cmd0, cmd0.NPU_SET_KERNEL_HEIGHT_M1, 1),
        (check_cmd0, cmd0.NPU_SET_KERNEL_WIDTH_M1, 2),
        (check_cmd0, cmd0.NPU_SET_KERNEL_STRIDE, 5),
        # Weights and scales
        (check_cmd0, cmd0.NPU_SET_WEIGHT_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_WEIGHT_BASE, 0),
        (check_cmd1, cmd1.NPU_SET_WEIGHT_LENGTH, 7696),
        (check_cmd0, cmd0.NPU_SET_SCALE_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_SCALE_BASE, 32000),
        (check_cmd1, cmd1.NPU_SET_SCALE_LENGTH, 464),
        # Activation
        (check_cmd0, cmd0.NPU_SET_ACTIVATION, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MIN, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MAX, 255),
        # Block configuration and buffers
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 15),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 15),
        (check_cmd0, cmd0.NPU_SET_IFM_IB_END, 14),
        (check_cmd0, cmd0.NPU_SET_AB_START, 14),
        (check_cmd0, cmd0.NPU_SET_ACC_FORMAT, 0),
        (check_cmd0, cmd0.NPU_SET_BLOCKDEP, 0),
        # The operation itself
        (check_cmd0, cmd0.NPU_OP_CONV, 0),
    )
    for check, command, value in expected:
        check(cmds, command, value)

    # Check that block width/height were generated that fit
    chosen_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    chosen_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    assert chosen_height > 0
    assert chosen_width > 0
    assert (chosen_height + 1) * (chosen_width + 1) <= 64
示例#8
0
def test_avg_pool():
    """Test command stream generation for an average pool operation.

    The operation is compiled on its own for Ethos-U55-128; the stream must
    contain the pool command and more than 10 commands overall.
    """
    pool_op = create_avg_pool_op()
    cmds = npu_generate_register_command_stream(
        [pool_op], NpuAccelerator.Ethos_U55_128
    )
    check_cmd0(cmds, cmd0.NPU_OP_POOL, 1)
    command_count = len(cmds)
    assert command_count > 10
示例#9
0
def test_mul_with_broadcast_and_relu():
    """Test an elementwise multiplication with a broadcasted IFM2 and RELU.

    IFM2 has height 1 and depth 1 while IFM and OFM are 31x22x31. No block
    config is set, so the chosen block dimensions are checked as well.
    """
    op = NpuElementWiseOperation(NpuElementWiseOp.MUL)
    op.ifm = create_feature_map(NpuShape3D(height=31, width=22, depth=31), 1, 0x20)
    op.ifm2 = create_feature_map(NpuShape3D(height=1, width=22, depth=1), 1, 0)
    op.ofm = create_feature_map(NpuShape3D(height=31, width=22, depth=31), 1, 0x52C0)
    op.activation = NpuActivation(NpuActivationOp.NONE_OR_RELU)
    op.activation.min = 0  # RELU
    # Do not set a block config, let vela choose one
    cmds = npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U55_32)

    # (checker, command, expected value[s]), verified in this order.
    expected = (
        # OFM scale takes two expected values.
        (check_cmd1, cmd1.NPU_SET_OFM_SCALE, 1073741824, 30),
        # IFM
        (check_cmd0, cmd0.NPU_SET_IFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE0, 32),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT0_M1, 30),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT1_M1, 30),
        (check_cmd0, cmd0.NPU_SET_IFM_WIDTH0_M1, 21),
        (check_cmd0, cmd0.NPU_SET_IFM_DEPTH_M1, 30),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_Y, 682),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_X, 31),
        (check_cmd0, cmd0.NPU_SET_IFM_ZERO_POINT, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PRECISION, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_UPSCALE, 0),
        # OFM (base address 21184 == 0x52C0)
        (check_cmd0, cmd0.NPU_SET_OFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE0, 21184),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT0_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT1_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH0_M1, 21),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH_M1, 21),
        (check_cmd0, cmd0.NPU_SET_OFM_DEPTH_M1, 30),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_Y, 682),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_X, 31),
        (check_cmd0, cmd0.NPU_SET_OFM_ZERO_POINT, 0),
        (check_cmd0, cmd0.NPU_SET_OFM_PRECISION, 256),
        # Activation
        (check_cmd0, cmd0.NPU_SET_ACTIVATION, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MIN, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MAX, 255),
        # IFM2
        (check_cmd0, cmd0.NPU_SET_IFM2_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_IFM2_BASE0, 0),
        (check_cmd1, cmd1.NPU_SET_IFM2_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_IFM2_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_IFM2_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_IFM2_HEIGHT0_M1, 0),
        (check_cmd0, cmd0.NPU_SET_IFM2_HEIGHT1_M1, 0),
        (check_cmd0, cmd0.NPU_SET_IFM2_WIDTH0_M1, 21),
        (check_cmd1, cmd1.NPU_SET_IFM2_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_IFM2_STRIDE_Y, 22),
        (check_cmd1, cmd1.NPU_SET_IFM2_STRIDE_X, 1),
        (check_cmd0, cmd0.NPU_SET_IFM2_ZERO_POINT, 0),
        (check_cmd0, cmd0.NPU_SET_IFM2_PRECISION, 0),
        (check_cmd0, cmd0.NPU_SET_IFM2_BROADCAST, 5),
        # Block configuration and buffers
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 23),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 31),
        (check_cmd0, cmd0.NPU_SET_IFM_IB_END, 16),
        (check_cmd0, cmd0.NPU_SET_AB_START, 16),
        (check_cmd0, cmd0.NPU_SET_IFM2_IB_START, 9),
        (check_cmd0, cmd0.NPU_SET_ACC_FORMAT, 0),
        (check_cmd0, cmd0.NPU_SET_BLOCKDEP, 0),
        # The operation itself
        (check_cmd0, cmd0.NPU_OP_ELEMENTWISE, 0),
    )
    for check, *check_args in expected:
        check(cmds, *check_args)

    # Check that block width/height were generated that fit
    chosen_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    chosen_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    chosen_depth = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1)
    assert chosen_height >= 0
    assert chosen_width >= 0
    assert chosen_depth >= 0
    # NOTE(review): the depth term is added to, not multiplied with, the
    # height * width product -- confirm this is the intended bound.
    assert (chosen_height + 1) * (chosen_width + 1) + (chosen_depth + 1) <= 3072
示例#10
0
def test_fully_connected():
    """Test command stream generation for a fully connected operation.

    The operation is compiled on its own for Ethos-U55-128; the stream must
    contain the conv command and more than 20 commands overall.
    """
    fc_op = create_fully_connected_op()
    cmds = npu_generate_register_command_stream(
        [fc_op], NpuAccelerator.Ethos_U55_128
    )
    check_cmd0(cmds, cmd0.NPU_OP_CONV, 0)
    command_count = len(cmds)
    assert command_count > 20