def _check_buffer(address, region, length, buffer_var):
    """Checks whether the buffer information is consistent with the
    original TIR buffers.
    - If it is constant, this checks that the slice of the constant
      tensor holds the expected values.
    - If it is scratch, this checks that the slice lies within the
      scratch area and does not conflict with other scratch tensors.
    - If it is input/output, this checks that the address is zero.
    """
    inverse_region_map = {
        0: tir_to_cs_translator.BufferType.constant,
        1: tir_to_cs_translator.BufferType.scratch,
        3: tir_to_cs_translator.BufferType.input,
        4: tir_to_cs_translator.BufferType.output,
    }
    buffer_type = inverse_region_map[region]
    if buffer_type == tir_to_cs_translator.BufferType.constant:
        ref = buffer_info[buffer_var].values
        assert (constant_tensor[address : address + length] == ref).all()
        # Every buffer is adjusted to align to 16 bytes
        length = util.round_up(length, 16)
        # Mark these constants as read at least once
        constant_tensor_read_mask[address : address + length] = np.ones(length, dtype="uint8")
    elif buffer_type == tir_to_cs_translator.BufferType.scratch:
        shape = list(buffer_info[buffer_var].shape)
        assert length == np.prod(shape)
        assert address < scratch_size
        # Every buffer is adjusted to align to 16 bytes
        length = util.round_up(length, 16)
        assert address + length <= scratch_size
        # The scratch area should not be used by any other buffer
        assert not scratch_allocation_mask[address : address + length].any()
        # Mark this scratch area as used
        scratch_allocation_mask[address : address + length] = np.ones(length, dtype="uint8")
    elif buffer_type == tir_to_cs_translator.BufferType.input:
        assert address == 0
    else:
        assert buffer_type == tir_to_cs_translator.BufferType.output
        assert address == 0
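
For context, the mask bookkeeping in _check_buffer reduces to plain numpy slicing. Below is a minimal self-contained sketch of the scratch-overlap check; round_up is a local stand-in for util.round_up, and the addresses and sizes are illustrative, not taken from a real run:

import numpy as np

def round_up(value, align):
    # Round value up to the next multiple of align, e.g. round_up(10, 16) == 16
    return ((value + align - 1) // align) * align

scratch_size = 64
scratch_allocation_mask = np.zeros(scratch_size, dtype="uint8")

for address, length in [(0, 10), (16, 20)]:
    length = round_up(length, 16)  # every NPU buffer is 16-byte aligned
    assert address + length <= scratch_size
    # No byte in this slice may already be claimed by another buffer
    assert not scratch_allocation_mask[address : address + length].any()
    scratch_allocation_mask[address : address + length] = 1

assert scratch_allocation_mask.sum() == 48  # 16 + 32 bytes claimed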
Example #2
def calculate_block_traversal_mode(
        is_depthwise: bool, weights_shape_ohwi: List[int],
        ifm_bitdepth: int) -> vapi.NpuBlockTraversal:
    """Calculate a block traversal mode given whether the op is depthwise convolution,
    shape of weights and bit-depth of the ifm.
    """

    if is_depthwise:
        return vapi.NpuBlockTraversal.DEPTH_FIRST
    # Determine which block traversal strategy has better DPU utilization
    kernel_size = weights_shape_ohwi[1] * weights_shape_ohwi[2]
    depth_utilization = weights_shape_ohwi[3] / util.round_up(
        weights_shape_ohwi[3], 32 if ifm_bitdepth == 8 else 16)
    part_kernel_utilization = (weights_shape_ohwi[3] / util.round_up(
        weights_shape_ohwi[3], 8)) * (kernel_size / util.round_up(
            kernel_size, 4 if ifm_bitdepth == 8 else 2))
    if part_kernel_utilization >= depth_utilization or weights_shape_ohwi[3] <= 8:
        # Part-kernel first is always better for ifm depths <= 8
        return vapi.NpuBlockTraversal.PART_KERNEL_FIRST
    return vapi.NpuBlockTraversal.DEPTH_FIRST
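
To make the utilization arithmetic concrete, here is a hand-worked instance with illustrative values (weights_shape_ohwi = [16, 3, 3, 16], ifm_bitdepth = 8); round_up is again a local stand-in for util.round_up:

def round_up(value, align):
    return ((value + align - 1) // align) * align

kernel_size = 3 * 3                                   # H * W = 9
depth_utilization = 16 / round_up(16, 32)             # 16 / 32 = 0.5
part_kernel_utilization = (16 / round_up(16, 8)) * (  # 16 / 16 = 1.0
    kernel_size / round_up(kernel_size, 4))           # 9 / 12  = 0.75
assert part_kernel_utilization == 0.75 > depth_utilization == 0.5
# Part-kernel utilization wins here, so the result would be PART_KERNEL_FIRST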
Example #3
def analyze_remaining_allocates(stmt):
    nonlocal dynamic_allocation_size
    if isinstance(stmt, tvm.tir.stmt.Allocate):
        allocate = stmt
        pointer_type = allocate.buffer_var.type_annotation
        storage_scope = pointer_type.storage_scope
        if storage_scope == "global":
            dtype_bytes = np.iinfo(np.dtype(allocate.dtype)).bits // 8
            size_in_bytes = int(dtype_bytes * np.prod(list(allocate.extents)))
            # Every memory address the NPU accesses has to be 16-byte aligned
            size_in_bytes = util.round_up(size_in_bytes, 16)
            address = dynamic_allocation_size
            dynamic_allocation_size += size_in_bytes
            scratch_region_map[allocate.buffer_var] = RegionOffset(
                region=dynamic_allocation_region, offset=address)
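
In TVM, a visitor like this is typically driven by tvm.tir.stmt_functor.post_order_visit over a PrimFunc body. A hedged sketch of that pattern (it assumes a TVM install and an IRModule `mod` containing a "main" PrimFunc; the visitor below only prints what the original accumulates):

import tvm

def visit(stmt):
    # Allocate nodes carry the buffer var, dtype and extents used above
    if isinstance(stmt, tvm.tir.Allocate):
        print(stmt.buffer_var, stmt.dtype, list(stmt.extents))

# `mod` is an assumption here: any IRModule with a "main" PrimFunc would do
tvm.tir.stmt_functor.post_order_visit(mod["main"].body, visit)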
Example #4
def assign_addresses(buffer_info, npu_ops, scratch_region_map):
    """This function will assign addresses to tensors
    within two buffers : scratch and constants.
    The scratch is the buffer created to hold all intermediary data
    The constants is the buffer created via unifying all the constant data
    (post-encoding).
    Parameters
    ----------
    buffer_info : dict
        This is the dictionary obtained via calling extract_buffer_info.
        The key is the buffer name to BufferInfo
    npu_ops : list
        A list of Vela NpuOps with tir.BufferLoads for addresses
    scratch_region_map : Dict[tvm.tir.Var, RegionOffset]
        A buffer_var to region and offset map.
    Returns
    -------
    npu_ops : list
        A list of Vela NpuOps with addresses within scratch and constant buffers
    constant_data : str
        The unified constant data as a hex string (two characters per byte),
        forming the constant buffer
    """
    def replace_npu_fm_with_address(npu_fm):
        assert isinstance(npu_fm.tiles.addresses[0], tvm.tir.BufferLoad)
        buffer = npu_fm.tiles.addresses[0].buffer.data
        if buffer in scratch_region_map.keys():
            address = scratch_region_map[buffer].offset
            region = scratch_region_map[buffer].region
        else:
            assert buffer in buffer_addresses.keys()
            address, buffer_type = buffer_addresses[buffer]
            region = _get_region(buffer_type)
        assert (len(npu_fm.tiles.addresses[0].indices) == 1
                ), "Ethos-U translation expects flattened buffers"
        index = npu_fm.tiles.addresses[0].indices[0] * (
            np.iinfo(np.dtype(npu_fm.tiles.addresses[0])).bits // 8)
        npu_fm.tiles.addresses[0] = address + int(index)
        npu_fm.tiles.addresses[1] = (address if isinstance(
            npu_fm.tiles.addresses[1], tvm.tir.BufferLoad) else 0)
        npu_fm.tiles.addresses[2] = (address if isinstance(
            npu_fm.tiles.addresses[2], tvm.tir.BufferLoad) else 0)
        npu_fm.tiles.addresses[3] = 0
        npu_fm.region = region
        return npu_fm

    def replace_npu_address_range_with_address(npu_addr_range):
        assert isinstance(npu_addr_range.address, tvm.tir.BufferLoad)
        buffer = npu_addr_range.address.buffer.data
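        # As above, np.dtype() reads the load's .dtype attribute to get the
        # element type; the index is scaled to a byte offset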
        index = int(npu_addr_range.address.indices[0] *
                    (np.iinfo(np.dtype(npu_addr_range.address)).bits // 8))
        if buffer in scratch_region_map.keys():
            return vapi.NpuAddressRange(
                scratch_region_map[buffer].region,
                scratch_region_map[buffer].offset + index,
                npu_addr_range.length,
            )
        assert buffer in buffer_addresses.keys(), \
            f"searching for buffer : {buffer}, but not found"
        address, buffer_type = buffer_addresses[buffer]
        address = address + int(npu_addr_range.address.indices[0].value)
        return vapi.NpuAddressRange(_get_region(buffer_type), address,
                                    npu_addr_range.length)

    def replace_tir_loads(npu_object):
        if isinstance(npu_object, vapi.NpuFeatureMap):
            return replace_npu_fm_with_address(npu_object)
        if isinstance(npu_object, vapi.NpuAddressRange):
            return replace_npu_address_range_with_address(npu_object)
        return npu_object

    def classify_io(buffer):
        for _npu_op in npu_ops:
            if issubclass(type(_npu_op), vapi.NpuBlockOperation):
                if _npu_op.ifm and _npu_op.ifm.tiles.addresses[0].buffer.data == buffer:
                    return BufferType.input
                if _npu_op.ifm2 and _npu_op.ifm2.tiles.addresses[0].buffer.data == buffer:
                    return BufferType.input
                if _npu_op.ofm and _npu_op.ofm.tiles.addresses[0].buffer.data == buffer:
                    return BufferType.output

        raise ValueError(f"Unused IO : {buffer} in tir module.")

    constant_hex_data = []
    total_constant_len = 0
    buffer_addresses = dict()
    for _buffer, info in buffer_info.items():
        if info.values is not None:
            assert info.btype == BufferType.constant
            assert len(info.shape) == 1
            buffer_addresses[_buffer] = ((total_constant_len,
                                          info.btype) if constant_hex_data else
                                         (0, info.btype))
            dtype_bytes = np.iinfo(np.dtype(info.dtype)).bits // 8
            size_in_bytes = dtype_bytes * np.prod(list(info.shape))
            # Every memory address the NPU accesses has to be 16-byte aligned
            size_in_bytes = util.round_up(size_in_bytes, 16)
            constant_tensor = np.resize(info.values,
                                        size_in_bytes // dtype_bytes)
            constant_tensor = constant_tensor.tobytes().hex()
            constant_hex_data.append(constant_tensor)
            total_constant_len += len(constant_tensor) // 2
        else:
            if info.btype == BufferType.input_or_output or info.btype == BufferType.input:
                buffer_type = info.btype
                if info.btype == BufferType.input_or_output:
                    buffer_type = classify_io(_buffer)
                assert buffer_type in (BufferType.input, BufferType.output)
                address = 0
                buffer_addresses[_buffer] = (address, buffer_type)
                buffer_info[_buffer] = BufferInfo(values=None,
                                                  shape=info.shape,
                                                  dtype=info.dtype,
                                                  btype=buffer_type)
            elif info.btype == BufferType.shram:
                accl_config = util.get_accelerator_config()
                arch_config = get_accelerator_arch_config(accl_config)
                address = arch_config.lut_start_address
                buffer_addresses[_buffer] = (address, info.btype)
            else:
                # These buffer_vars are already updated in scratch_region_map
                assert info.btype == BufferType.scratch

    for npu_op in npu_ops:
        for attr_name, attr in npu_op.__dict__.items():
            if isinstance(attr, list):
                new_attr = list()
                for attr_ in attr:
                    new_attr.append(replace_tir_loads(attr_))
                setattr(npu_op, attr_name, new_attr)
            else:
                setattr(npu_op, attr_name, replace_tir_loads(attr))

    constant_data = "".join(constant_hex_data)
    return (npu_ops, constant_data)
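
The unified constant buffer above is kept as a hex string, so two characters encode one byte; that is why total_constant_len advances by len(chunk) // 2. A numpy-only sketch of that round trip, with round_up standing in for util.round_up:

import numpy as np

def round_up(value, align):
    return ((value + align - 1) // align) * align

values = np.array([1, 2, 3, 4, 5], dtype="uint8")
size_in_bytes = round_up(values.size, 16)  # pad up to 16-byte alignment
padded = np.resize(values, size_in_bytes)  # np.resize repeats values as fill
hex_chunk = padded.tobytes().hex()         # two hex characters per byte
assert len(hex_chunk) // 2 == size_in_bytes == 16
# Decoding recovers the padded constant buffer byte-for-byte
assert np.array_equal(np.frombuffer(bytes.fromhex(hex_chunk), dtype="uint8"), padded)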
Example #5
def assign_addresses(buffer_info, npu_ops):
    """This function will assign addresses to tensors
    within two buffers : scratch and constants.
    The scratch is the buffer created to hold all intermediary data
    The constants is the buffer created via unifying all the constant data
    (post-encoding).
    Parameters
    ----------
    buffer_info : dict
        This is the dictionary obtained via calling extract_buffer_info.
        The key is the buffer name to BufferInfo
    npu_ops : list
        A list of Vela NpuOps with tir.Loads for addresses
    Returns
    -------
    npu_ops : list
        A list of Vela NpuOps with addresses within scratch and constant buffers
    constant_tensor : NDArray
        A unified constant data array of uint8 as the constant buffer
    scratch_size : int
        The size of the scratch tensor.
    """
    def replace_npu_fm_with_address(npu_fm):
        assert isinstance(npu_fm.tiles.addresses[0], tvm.tir.Load)
        # We currently do not support tiles
        # Change this when tiles are needed
        # (i.e. when using rolling buffers)
        assert npu_fm.tiles.addresses[1:] == [0, 0, 0]
        buffer = npu_fm.tiles.addresses[0].buffer_var
        assert buffer in buffer_addresses.keys()
        address, buffer_type = buffer_addresses[buffer]
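        # np.dtype() resolves the Load's element type through its .dtype
        # attribute, so `.bits // 8` below is the element size in bytes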
        index = npu_fm.tiles.addresses[0].index * (
            np.iinfo(np.dtype(npu_fm.tiles.addresses[0])).bits // 8)
        npu_fm.tiles.addresses[0] = address + int(index)
        npu_fm.region = _REGION_MAP[buffer_type]
        return npu_fm

    def replace_npu_address_range_with_address(npu_addr_range):
        assert isinstance(npu_addr_range.address, tvm.tir.Load)
        buffer = npu_addr_range.address.buffer_var
        assert buffer in buffer_addresses.keys(), \
            f"searching for buffer : {buffer}, but not found"
        address, buffer_type = buffer_addresses[buffer]
        return vapi.NpuAddressRange(_REGION_MAP[buffer_type], address,
                                    npu_addr_range.length)

    def replace_tir_loads(npu_object):
        if isinstance(npu_object, vapi.NpuFeatureMap):
            return replace_npu_fm_with_address(npu_object)
        if isinstance(npu_object, vapi.NpuAddressRange):
            return replace_npu_address_range_with_address(npu_object)
        return npu_object

    def classify_io(buffer):
        for _npu_op in npu_ops:
            if issubclass(type(_npu_op), vapi.NpuBlockOperation):
                if _npu_op.ifm and _npu_op.ifm.tiles.addresses[0].buffer_var == buffer:
                    return BufferType.input
                if _npu_op.ifm2 and _npu_op.ifm2.tiles.addresses[0].buffer_var == buffer:
                    return BufferType.input
                if _npu_op.ofm and _npu_op.ofm.tiles.addresses[0].buffer_var == buffer:
                    return BufferType.output

        raise ValueError(f"Unused IO : {buffer} in tir module.")

    scratch_size = 0
    constant_tensor = None
    buffer_addresses = dict()
    for _buffer, info in buffer_info.items():
        if info.values is not None:
            assert np.dtype(info.dtype) == np.uint8
            assert info.btype == BufferType.constant
            assert len(info.shape) == 1
            if constant_tensor is None:
                buffer_addresses[_buffer] = (0, info.btype)
                assert info.values.dtype == np.uint8
                size_in_bytes = info.values.size
                # Every memory address the NPU accesses has to be 16-byte aligned
                size_in_bytes = util.round_up(size_in_bytes, 16)
                constant_tensor = np.resize(info.values, size_in_bytes)
            else:
                buffer_addresses[_buffer] = (constant_tensor.size, info.btype)
                assert info.values.dtype == np.uint8
                size_in_bytes = info.values.size
                # Every memory address the NPU accesses has to be 16-byte aligned
                size_in_bytes = util.round_up(size_in_bytes, 16)
                constant_tensor = np.append(
                    constant_tensor, np.resize(info.values, size_in_bytes))
        else:
            if info.btype == BufferType.input_or_output:
                buffer_type = classify_io(_buffer)
                assert buffer_type in (BufferType.input, BufferType.output)
                address = 0
                buffer_addresses[_buffer] = (address, buffer_type)
            elif info.btype == BufferType.shram:
                accl_config = util.get_accelerator_config()
                arch_config = get_accelerator_arch_config(accl_config)
                address = arch_config.lut_start_address
                buffer_addresses[_buffer] = (address, info.btype)
            else:
                size_in_bytes = int(
                    (np.iinfo(np.dtype(info.dtype)).bits // 8) *
                    np.prod(list(info.shape)))
                # Every memory address the NPU accesses has to be 16-byte aligned
                size_in_bytes = util.round_up(size_in_bytes, 16)
                assert info.btype == BufferType.scratch
                address = scratch_size
                scratch_size += size_in_bytes
                buffer_addresses[_buffer] = (address, info.btype)

    for npu_op in npu_ops:
        for attr_name, attr in npu_op.__dict__.items():
            if isinstance(attr, list):
                new_attr = list()
                for attr_ in attr:
                    new_attr.append(replace_tir_loads(attr_))
                setattr(npu_op, attr_name, new_attr)
            else:
                setattr(npu_op, attr_name, replace_tir_loads(attr))

    return npu_ops, constant_tensor, scratch_size
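
Unlike Example #4, this older variant accumulates the constant buffer as a uint8 ndarray via np.resize/np.append, and each later constant's address is simply the tensor size at the moment it is appended. A numpy-only sketch of that scheme, with round_up standing in for util.round_up and illustrative values:

import numpy as np

def round_up(value, align):
    return ((value + align - 1) // align) * align

constant_tensor = None
addresses = []
for values in (np.arange(5, dtype="uint8"), np.arange(20, dtype="uint8")):
    chunk = np.resize(values, round_up(values.size, 16))  # 16-byte aligned
    if constant_tensor is None:
        addresses.append(0)
        constant_tensor = chunk
    else:
        addresses.append(constant_tensor.size)  # next constant starts here
        constant_tensor = np.append(constant_tensor, chunk)

assert addresses == [0, 16] and constant_tensor.size == 48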