def test_small_graph():
    subgraph = TESubgraph([], None)
    part_a = InlinePart(
        subgraph,
        [
            Propagator(
                [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                [0, 0],
            ),
            Propagator(
                [[0, 1, 0], [1, 0, 0], [0, 0, 1]],
                [-1, -1],
            ),
        ],
    )
    part_b = InlinePart(
        subgraph,
        [
            Propagator(
                [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                [0, 0],
            ),
        ],
    )
    tensor_1 = Tensor([10, 10], "uint8")
    tensor_2 = Tensor([9, 9], "uint8")
    tensor_3 = Tensor([10, 10], "uint8")
    tensor_4 = Tensor([10, 10], "uint8")

    part_a.set_input(0, tensor_1)
    part_a.set_input(1, tensor_2)
    part_a.set_output(tensor_3)
    tensor_1.add_consumer(part_a)
    tensor_2.add_consumer(part_a)
    tensor_3.add_producer(part_a)
    part_b.set_input(0, tensor_3)
    part_b.set_output(tensor_4)
    tensor_3.add_consumer(part_b)
    tensor_4.add_producer(part_b)

    assert part_a.input_tensors == [tensor_1, tensor_2]
    assert part_a.output_tensor == tensor_3
    assert part_b.input_tensors == [tensor_3]
    assert part_b.output_tensor == tensor_4

    assert tensor_1.producers == []
    assert tensor_1.consumers == [part_a]
    assert tensor_2.producers == []
    assert tensor_2.consumers == [part_a]
    assert tensor_3.producers == [part_a]
    assert tensor_3.consumers == [part_b]
    assert tensor_4.producers == [part_b]
    assert tensor_4.consumers == []

    graph = CascaderGraph([tensor_1, tensor_2], [tensor_4])
    assert graph.input_tensors == [tensor_1, tensor_2]
    assert graph.output_tensors == [tensor_4]
    assert graph.part_order == [part_b, part_a]
    for i, part in enumerate(graph.part_order):
        assert graph.get_part_id(part) == i

def test_inline_part():
    subgraph = TESubgraph([], None)
    part = InlinePart(
        subgraph,
        [
            Propagator(
                [[0, 1, 0], [1, 0, 0], [0, 0, 1]],
                [0, 0],
            ),
        ],
    )
    output_stripe_config = StripeConfig([2, 4], [8, 8], [2, 4], [1, 2], [4, 2], [0, 0])
    input_stripe_config = StripeConfig([4, 2], [8, 8], [4, 2], [2, 1], [2, 4], [0, 0])

    assert part.input_tensors == [None]
    assert part.output_tensor is None
    assert len(part.propagators) == 1
    assert part.in_line
    assert part.get_stripe_align_hint() == [1, 1]
    performance_info = part.get_performance_info(output_stripe_config, is_rolling=False)
    assert performance_info.compute_cycles == 0
    assert performance_info.read_bytes == [0]
    assert performance_info.write_bytes == 0
    input_stripe_configs = part.calculate_input_stripe_configs(output_stripe_config)
    assert len(input_stripe_configs) == 1
    assert input_stripe_configs[0] == input_stripe_config

def test_propagator():
    transform = [
        [1, 0, 0, 0],
        [0, 1 / 2, 0, 0],
        [0, 0, -1, 0],
        [0, 0, 0, 1],
    ]
    offset = [-1, 1, 2]
    propagator = Propagator(
        transform=transform,
        offset=offset,
    )
    assert list(propagator.offset) == offset
    for i, row in enumerate(transform):
        for j, value in enumerate(row):
            assert isclose(propagator.transform[i][j], value)

def match_ethosu_inline(output_tensor, device_config):
    """Match a Tensor Expression corresponding to an operator that will be inlined.

    If the Tensor Expression matches, an InlinePart will be created that models the
    matched Tensor Expression. Otherwise, None will be returned. This matcher is
    naive and assumes nothing about the compute of the Tensor Expression. Therefore,
    the resulting InlinePart will have full-tensor dependencies (i.e. each output
    element depends on every input element).

    Parameters
    ----------
    output_tensor : tvm.te.Tensor
        The tensor to attempt to match with.
    device_config : EthosuDeviceConfig
        Target device configuration.

    Returns
    -------
    Union[None, InlinePart]
        The created InlinePart if there was a match, otherwise None.

    """
    if output_tensor.op.name not in INLINE_OPS:
        return None

    input_tensors = output_tensor.op.input_tensors
    propagators = []
    output_dims = len(output_tensor.shape)
    for input_tensor in input_tensors:
        input_dims = len(input_tensor.shape)
        transform_matrix = np.zeros((input_dims + 1, output_dims + 1))
        for i, axis in enumerate(input_tensor.shape):
            transform_matrix[i, output_dims] = int(axis)
        transform_matrix[input_dims, output_dims] = 1
        offset_vector = np.zeros(input_dims, dtype="int64")
        propagators.append(
            Propagator(
                transform_matrix.tolist(),
                offset_vector.tolist(),
            )
        )
    subgraph = TESubgraph(input_tensors, output_tensor)
    return InlinePart(
        subgraph,
        propagators,
    )

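# Illustrative sketch (not part of the original source): for a hypothetical 2-D
# input of shape (4, 5) feeding a 2-D output, the matcher above builds a
# transform whose only non-zero column is the constant (homogeneous) one, so
# any output stripe maps to the full input extent -- the "full-tensor
# dependency" described in the docstring.
import numpy as np

_transform = np.array(
    [
        [0, 0, 4],  # input axis 0 always needs all 4 elements
        [0, 0, 5],  # input axis 1 always needs all 5 elements
        [0, 0, 1],  # homogeneous row
    ]
)
_output_stripe = np.array([2, 3, 1])  # an arbitrary (2, 3) output stripe
assert list(_transform @ _output_stripe) == [4, 5, 1]  # whole input is required
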
def unary_elementwise_compute(
    ifm: te.Tensor,
    lut: te.Tensor,
    operator_type: str,
    ifm_scale: float,
    ifm_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    ofm_channels: int,
    activation: str,
    clip_min: int,
    clip_max: int,
    rounding_mode: str,
    ifm_layout: str,
    ofm_layout: str,
) -> te.Tensor:
    """A compute operator representing the capabilities of unary_elementwise for the NPU.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    lut : te.Tensor
        The look-up table values to use if activation = "LUT".
    operator_type : str
        The type of the unary elementwise operator.
            "ABS"
            "CLZ"
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    ofm_channels : int
        The number of OFM channels.
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "CLIP" - clip the output between clip_min and clip_max.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.
    clip_min : int
        The minimum clipping value if activation = "CLIP".
    clip_max : int
        The maximum clipping value if activation = "CLIP".
    rounding_mode : str
        The rounding mode to apply to the Output Feature Map tensor.
            "TFL" - Tensorflow Lite rounding scheme.
            "TRUNCATE" - Truncate towards zero.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    ifm_layout : str
        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ofm_layout : str
        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".

    Returns
    -------
    te.Tensor
        The OFM tensor.
""" assert ifm.shape[0] == 1 assert ifm_layout in {"NHWC", "NHCWB16"} assert ofm_layout in {"NHWC", "NHCWB16"} # Changing the ifm and ofm scale to conform with that expected by Vela API ofm_scale = ifm_scale / ofm_scale ifm_scale = 1.0 # Compute operation for the IFM DMA pipeline dmaed_ifm = dma_ifm_compute(ifm, ifm_layout, ifm_zero_point, ifm_scale, ofm_channels, (0, 0, 0, 0)) # Unary elementwise compute operation ofm_height = dmaed_ifm.shape[1] ofm_width = dmaed_ifm.shape[2] unary_elementwise_attrs = { "op": "ethosu_unary_elementwise", "operator_type": operator_type, "activation": activation, "clip_min": clip_min, "clip_max": clip_max, "rounding_mode": rounding_mode, } def clz_imp(inp): # Assuming that it's a 32 bit int return 32 - te.log2(inp) operators = {"ABS": te.abs, "CLZ": clz_imp} unary_elementwise = te.compute( (1, ofm_height, ofm_width, ofm_channels), lambda nn, hh, ww, cc: operators[operator_type] (dmaed_ifm(nn, hh, ww, cc).astype(ifm.dtype)), name="ethosu_unary_elementwise", attrs=unary_elementwise_attrs, ) nhwc_to_nhcwb16 = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1 / 16, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 16], [0, 0, 0, 0, 1], ] nhcwb16_to_nhwc = [ [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 16, 0, 1, -16], [0, 0, 0, 0, 0, 1], ] ifm_matrix = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], ] if ofm_layout == "NHCWB16": ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() if ifm_layout == "NHCWB16": ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() ifm_propagator = Propagator( ifm_matrix, [0, 0, 0, 0] if ifm_layout == "NHWC" else [0, 0, 0, 0, 0], ) propagator_attrs = {"ifm_propagator": ifm_propagator} # Compute operation for the OFM DMA pipeline return dma_ofm_compute( unary_elementwise, ofm_layout, ofm_zero_point, ofm_scale, ofm_channels, attrs=propagator_attrs, )
def conv2d_compute(
    ifm: te.Tensor,
    weight: te.Tensor,
    scale_bias: te.Tensor,
    lut: te.Tensor,
    ifm_scale: float,
    ifm_zero_point: int,
    weight_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    strides: Tuple[int, int],
    padding: Tuple[int, int, int, int],
    dilation: Union[Tuple[int, int], List[int]],
    activation: str,
    clip_min: int,
    clip_max: int,
    rounding_mode: str,
    upscale: str,
    ifm_layout: str,
    ofm_layout: str,
) -> te.Tensor:
    """A compute operator representing the capabilities of a 2D convolution for the NPU.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    weight : te.Tensor
        The weight tensor.
    scale_bias : te.Tensor
        The packed per-channel weight scale and bias tensor.
    lut : te.Tensor
        The look-up table of values to use if activation = "LUT".
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    weight_zero_point : int
        The quantization zero point for the weight tensor.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    strides : Tuple[int, int]
        The 2 dimensional strides as (stride_height, stride_width).
    padding : Tuple[int, int, int, int]
        The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right).
    dilation : Union[Tuple[int, int], List[int]]
        The 2 dimensional dilation as (dilation_height, dilation_width).
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "CLIP" - clip the output between clip_min and clip_max.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.
    clip_min : int
        The minimum clipping value if activation = "CLIP".
    clip_max : int
        The maximum clipping value if activation = "CLIP".
    rounding_mode : str
        The rounding mode to apply to the Output Feature Map tensor.
            "TFL" - Tensorflow Lite rounding scheme.
            "TRUNCATE" - Truncate towards zero.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    upscale : str
        The 2x2 upscaling mode to apply to the Input Feature Map tensor.
            "NONE" - no upscaling.
            "NEAREST" - upscale using nearest neighbour.
            "ZEROS" - upscale using zeros.
    ifm_layout : str
        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ofm_layout : str
        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".

    Returns
    -------
    te.Tensor
        The OFM tensor.
""" assert ifm.shape[0] == 1 assert ifm_layout in {"NHWC", "NHCWB16"} assert ofm_layout in {"NHWC", "NHCWB16"} padding = [int(v) for v in padding] stride_h, stride_w = [int(v) for v in strides] dilation_h, dilation_w = [int(v) for v in dilation] ofm_channels, kernel_h, kernel_w, ifm_channels = [ int(v) for v in weight.shape ] upscale_factor = 2 if upscale != "NONE" else 1 # Compute operation for the IFM DMA pipeline dmaed_ifm = dma_ifm_compute( ifm, ifm_layout, ifm_zero_point, ifm_scale, weight.shape[3], padding, upscale_factor, ) # 2D Convolution compute operation dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 ofm_height = (dmaed_ifm.shape[1] - dilated_kernel_h) // stride_h + 1 ofm_width = (dmaed_ifm.shape[2] - dilated_kernel_w) // stride_w + 1 rc = te.reduce_axis((0, ifm_channels), name="rc") rh = te.reduce_axis((0, kernel_h), name="ry") rw = te.reduce_axis((0, kernel_w), name="rx") conv2d_attrs = { "op": "ethosu_conv2d", "weight_zero_point": weight_zero_point, "activation": activation, "upscale": upscale, "clip_min": clip_min, "clip_max": clip_max, "rounding_mode": rounding_mode, "stride_h": stride_h, "stride_w": stride_w, "dilation_h": dilation_h, "dilation_w": dilation_w, } has_lut = activation in ("TANH", "LUT", "SIGMOID") # This is a trick to insert the LUT tensor into the TE graph if LUT is present lut_expr = (lut[0] + lut[255]).astype(ifm.dtype) if has_lut else 0 # Add the LUT tensor to the attributes to be able to later tell which tensor is the LUT if has_lut: conv2d_attrs["lut"] = lut conv = te.compute( (1, ofm_height, ofm_width, ofm_channels), lambda nn, hh, ww, cc: te.sum( dmaed_ifm(nn, hh * stride_h + rh * dilation_h, ww * stride_w + rw * dilation_w, rc).astype(ifm.dtype) * weight[ cc, rh, rw, rc].astype(ifm.dtype) # This is a trick to load 10 elements of the scale_bias at once, not accurate maths + (scale_bias[cc, 0] * scale_bias[cc, 9] + lut_expr).astype(ifm. dtype), axis=[rh, rw, rc], ), name="ethosu_conv2d", attrs=conv2d_attrs, ) nhwc_to_nhcwb16 = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1 / 16, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 16], [0, 0, 0, 0, 1], ] nhcwb16_to_nhwc = [ [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 16, 0, 1, -16], [0, 0, 0, 0, 0, 1], ] ifm_matrix = [ [1, 0, 0, 0, 0], [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], [0, 0, 0, 0, ifm_channels], [0, 0, 0, 0, 1], ] weights_matrix = [ [0, 0, 0, 1, 0], [0, 0, 0, 0, kernel_h], [0, 0, 0, 0, kernel_w], [0, 0, 0, 0, ifm_channels], [0, 0, 0, 0, 1], ] bias_matrix = [ [0, 0, 0, 1, 0], [0, 0, 0, 0, 10], [0, 0, 0, 0, 1], ] if ofm_layout == "NHCWB16": ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() weights_matrix = np.matmul(weights_matrix, nhcwb16_to_nhwc).tolist() bias_matrix = np.matmul(bias_matrix, nhcwb16_to_nhwc).tolist() if ifm_layout == "NHCWB16": ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() ifm_propagator = Propagator( ifm_matrix, [0, -padding[0], -padding[1], 0] if ifm_layout == "NHWC" else [0, -padding[0], 0, -padding[1], 0], ) weights_propagator = Propagator( weights_matrix, [0, 0, 0, 0], ) bias_propagator = Propagator( bias_matrix, [0, 0], ) propagator_attrs = { "ifm_propagator": ifm_propagator, "weights_propagator": weights_propagator, "bias_propagator": bias_propagator, } # Compute operation for the OFM DMA pipeline dma_ofm = dma_ofm_compute(conv, ofm_layout, ofm_zero_point, ofm_scale, ofm_channels, attrs=propagator_attrs) return dma_ofm
def pooling_compute(
    ifm: te.Tensor,
    lut: te.Tensor,
    pooling_type: str,
    ifm_scale: float,
    ifm_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    pool_shape: Tuple[int, int],
    ofm_channels: int,
    strides: Tuple[int, int],
    padding: Tuple[int, int, int, int],
    activation: str,
    clip_min: int,
    clip_max: int,
    rounding_mode: str,
    upscale: str,
    ifm_layout: str,
    ofm_layout: str,
) -> te.Tensor:
    """A compute operator representing the capabilities of pooling for the NPU.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    lut : te.Tensor
        The look-up table of values to use if activation = "LUT".
    pooling_type : str
        The type of the pooling. "AVG" - average pool, "MAX" - max pool.
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    pool_shape : Tuple[int, int]
        The 2 dimensional pool shape as (pool_shape_height, pool_shape_width).
    ofm_channels : int
        The number of the Output Feature Map channels.
    strides : Tuple[int, int]
        The 2 dimensional strides as (stride_height, stride_width).
    padding : Tuple[int, int, int, int]
        The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right).
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "CLIP" - clip the output between clip_min and clip_max.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.
    clip_min : int
        The minimum clipping value if activation = "CLIP".
    clip_max : int
        The maximum clipping value if activation = "CLIP".
    rounding_mode : str
        The rounding mode to apply to the Output Feature Map tensor.
            "TFL" - Tensorflow Lite rounding scheme.
            "TRUNCATE" - Truncate towards zero.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    upscale : str
        The 2x2 upscaling mode to apply to the Input Feature Map tensor.
            "NONE" - no upscaling.
            "NEAREST" - upscale using nearest neighbour.
            "ZEROS" - upscale using zeros.
    ifm_layout : str
        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ofm_layout : str
        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".

    Returns
    -------
    te.Tensor
        The OFM tensor.
""" assert ifm.shape[0] == 1 assert ifm_layout in {"NHWC", "NHCWB16"} assert ofm_layout in {"NHWC", "NHCWB16"} padding = [int(v) for v in padding] stride_h, stride_w = [int(v) for v in strides] pool_shape_h, pool_shape_w = [int(v) for v in pool_shape] # Compute operation for the IFM DMA pipeline dmaed_ifm = dma_ifm_compute(ifm, ifm_layout, ifm_zero_point, ifm_scale, ofm_channels, padding) # Pooling compute operation ofm_height = (dmaed_ifm.shape[1] - pool_shape_h) // stride_h + 1 ofm_width = (dmaed_ifm.shape[2] - pool_shape_w) // stride_w + 1 rh = te.reduce_axis((0, pool_shape_h), name="ry") rw = te.reduce_axis((0, pool_shape_w), name="rx") pooling_attrs = { "op": "ethosu_pooling", "pooling_type": pooling_type, "pool_shape_h": pool_shape_h, "pool_shape_w": pool_shape_w, "stride_h": stride_h, "stride_w": stride_w, "activation": activation, "clip_min": clip_min, "clip_max": clip_max, "rounding_mode": rounding_mode, "upscale": upscale, } has_lut = activation in ("TANH", "LUT", "SIGMOID") # This is a trick to insert the LUT tensor into the TE graph if LUT is present lut_expr = (lut[0] + lut[255]).astype(ifm.dtype) if has_lut else 0 # Add the LUT tensor to the attributes to be able to later tell which tensor is the LUT if has_lut: pooling_attrs["lut"] = lut pooling = te.compute( (1, ofm_height, ofm_width, ofm_channels), lambda nn, hh, ww, cc: te.max( (dmaed_ifm(nn, hh * stride_h + rh, ww * stride_w + rw, cc) + lut_expr).astype( ifm.dtype ), axis=[rh, rw], ), name="ethosu_pooling", attrs=pooling_attrs, ) nhwc_to_nhcwb16 = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1 / 16, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 16], [0, 0, 0, 0, 1], ] nhcwb16_to_nhwc = [ [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 16, 0, 1, -16], [0, 0, 0, 0, 0, 1], ] ifm_matrix = [ [1, 0, 0, 0, 0], [0, stride_h, 0, 0, (pool_shape_h - stride_h)], [0, 0, stride_w, 0, (pool_shape_w - stride_w)], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], ] if ofm_layout == "NHCWB16": ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() if ifm_layout == "NHCWB16": ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() ifm_propagator = Propagator( ifm_matrix, [0, -padding[0], -padding[1], 0] if ifm_layout == "NHWC" else [0, -padding[0], 0, -padding[1], 0], ) propagator_attrs = { "ifm_propagator": ifm_propagator, } # Compute operation for the OFM DMA pipeline return dma_ofm_compute( pooling, ofm_layout, ofm_zero_point, ofm_scale, ofm_channels, attrs=propagator_attrs )
def binary_elementwise_compute(
    ifm: te.Tensor,
    ifm2: te.Tensor,
    lut: te.Tensor,
    operator_type: str,
    ifm_scale: float,
    ifm_zero_point: int,
    ifm2_scale: float,
    ifm2_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    ifm_channels: int,
    ifm2_channels: int,
    reversed_operands: bool,
    activation: str,
    clip_min: int,
    clip_max: int,
    rounding_mode: str,
    ifm_layout: str,
    ifm2_layout: str,
    ofm_layout: str,
    ofm_dtype: str,
) -> te.Tensor:
    """A compute operator representing the capabilities of binary_elementwise for the NPU.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    ifm2 : te.Tensor
        The Input Feature Map tensor 2 (IFM2).
    lut : te.Tensor
        The look-up table values to use if activation = "LUT".
    operator_type : str
        The type of the binary elementwise operator.
            "ADD"
            "SUB"
            "MUL"
            "MIN"
            "MAX"
            "SHR"
            "SHL"
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    ifm2_scale : float
        The quantization scale for the Input Feature Map tensor 2.
    ifm2_zero_point : int
        The quantization zero point for the Input Feature Map tensor 2.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    ifm_channels : int
        The number of the Input Feature Map channels.
    ifm2_channels : int
        The number of the Input Feature Map 2 channels.
    reversed_operands : bool
        True if IFM2 is the first operand and IFM is the second operand.
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "CLIP" - clip the output between clip_min and clip_max.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.
        Available activations for activation type:
            {int8, uint8}: "NONE", "CLIP", "TANH", "SIGMOID", "LUT"
            {int32}: "NONE"
    clip_min : int
        The minimum clipping value if activation = "CLIP".
    clip_max : int
        The maximum clipping value if activation = "CLIP".
    rounding_mode : str
        The rounding mode to apply to the Output Feature Map tensor.
            "TFL" - Tensorflow Lite rounding scheme.
            "TRUNCATE" - Truncate towards zero.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    ifm_layout : str
        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ifm2_layout : str
        The layout of the Input Feature Map tensor 2. Can be "NHWC" or "NHCWB16".
    ofm_layout : str
        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ofm_dtype : str
        The Output Feature Map tensor type.
            MUL, ADD, SUB {IFM}->{OFM}:
                {uint8, int8, int32} -> {uint8, int8, int32}, any pairing
            MAX, MIN:
                IFM and OFM must be of the same type, one of: {int8, uint8}
            SHR {IFM}->{OFM}:
                {int32} -> {int8, uint8, int32}, any pairing
            SHL:
                {int32} -> {int32} only

    Returns
    -------
    te.Tensor
        The Output Feature Map tensor.
""" assert ifm.shape[0] == 1 assert ifm2.shape[0] == 1 assert ifm_layout in {"NHWC", "NHCWB16"} assert ifm2_layout in {"NHWC", "NHCWB16"} assert ofm_layout in {"NHWC", "NHCWB16"} # Compute operation for the IFM DMA pipeline dmaed_ifm = dma_ifm_compute( ifm, ifm_layout, ifm_zero_point, ifm_scale, ifm_channels, (0, 0, 0, 0) ) dmaed_ifm2 = dma_ifm_compute( ifm2, ifm2_layout, ifm2_zero_point, ifm2_scale, ifm2_channels, (0, 0, 0, 0) ) # Binary elementwise compute operation ofm_height = dmaed_ifm.shape[1] ofm_width = dmaed_ifm.shape[2] binary_elementwise_attrs = { "op": "ethosu_binary_elementwise", "operator_type": operator_type, "reversed_operands": reversed_operands, "activation": activation, "clip_min": clip_min, "clip_max": clip_max, "rounding_mode": rounding_mode, } operators = { "ADD": operator.add, "SUB": operator.sub, "MUL": operator.mul, "MIN": te.min, "MAX": te.max, "SHR": operator.add, "SHL": operator.add, } broadcast = [value == 1 for value in dmaed_ifm2.shape] if reversed_operands: binary_elementwise = te.compute( (1, ofm_height, ofm_width, ifm_channels), lambda nn, hh, ww, cc: operators[operator_type]( dmaed_ifm2( 0 if broadcast[0] else nn, 0 if broadcast[1] else hh, 0 if broadcast[2] else ww, 0 if broadcast[3] else cc, ).astype(ifm.dtype), dmaed_ifm(nn, hh, ww, cc).astype(ifm.dtype), ).astype(ofm_dtype), name="ethosu_binary_elementwise", attrs=binary_elementwise_attrs, ) else: binary_elementwise = te.compute( (1, ofm_height, ofm_width, ifm_channels), lambda nn, hh, ww, cc: operators[operator_type]( dmaed_ifm(nn, hh, ww, cc).astype(ifm.dtype), dmaed_ifm2( 0 if broadcast[0] else nn, 0 if broadcast[1] else hh, 0 if broadcast[2] else ww, 0 if broadcast[3] else cc, ).astype(ifm.dtype), ).astype(ofm_dtype), name="ethosu_binary_elementwise", attrs=binary_elementwise_attrs, ) nhwc_to_nhcwb16 = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1 / 16, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 16], [0, 0, 0, 0, 1], ] nhcwb16_to_nhwc = [ [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 16, 0, 1, -16], [0, 0, 0, 0, 0, 1], ] ifm_matrix = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], ] ifm2_matrix = [ [1, 0, 0, 0, 0], [0, (1 - int(broadcast[1])), 0, 0, int(broadcast[1])], [0, 0, (1 - int(broadcast[2])), 0, int(broadcast[2])], [0, 0, 0, (1 - int(broadcast[3])), int(broadcast[3])], [0, 0, 0, 0, 1], ] if ofm_layout == "NHCWB16": ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() ifm2_matrix = np.matmul(ifm2_matrix, nhcwb16_to_nhwc).tolist() if ifm_layout == "NHCWB16": ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() if ifm2_layout == "NHCWB16": ifm2_matrix = np.matmul(nhwc_to_nhcwb16, ifm2_matrix).tolist() ifm_propagator = Propagator( ifm_matrix, [0, 0, 0, 0] if ifm_layout == "NHWC" else [0, 0, 0, 0, 0], ) ifm2_propagator = Propagator( ifm2_matrix, [0, 0, 0, 0] if ifm2_layout == "NHWC" else [0, 0, 0, 0, 0], ) propagator_attrs = { "ifm_propagator": ifm_propagator, "ifm2_propagator": ifm2_propagator, } # Compute operation for the OFM DMA pipeline return dma_ofm_compute( binary_elementwise, ofm_layout, ofm_zero_point, ofm_scale, ifm_channels, attrs=propagator_attrs, )
@pytest.mark.parametrize(
    ["propagator", "input_stripe_config", "output_stripe_config"],
    [
        (
            Propagator(
                transform=[
                    [1, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0],
                    [0, 0, 0, 1 / 16, 0],
                    [0, 0, 1, 0, 0],
                    [0, 0, 0, 0, 16],
                    [0, 0, 0, 0, 1],
                ],
                offset=[0, 0, 0, 0, 0],
            ),
            StripeConfig(
                shape=[1, 12, 14, 36],
                extent=[1, 24, 18, 72],
                strides=[1, 12, 14, 36],
                order=[1, 2, 3, 4],
                stripes=[1, 2, 2, 2],
                offset=[0, 0, 0, 0],
            ),
            StripeConfig(
                shape=[1, 12, 3, 14, 16],
def identity_compute(
    ifm: te.Tensor,
    lut: te.Tensor,
    ifm_scale: float,
    ifm_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    activation: str,
) -> te.Tensor:
    """A compute operator for the NPU identity operator.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    lut : te.Tensor
        The look-up table values to use if activation is "LUT", "TANH" or "SIGMOID".
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.

    Returns
    -------
    te.Tensor
        The Output Feature Map tensor.
    """
    dmaed_ifm = read_compute(ifm, ifm_zero_point, ifm_scale)
    id_attrs = {"op": "ethosu_identity", "activation": activation}

    has_lut = activation in ("TANH", "LUT", "SIGMOID")

    # This is a trick to insert the LUT tensor into the TE graph if LUT is present
    lut_expr = (lut[0] + lut[255]).astype(ifm.dtype) if has_lut else 0

    # Add the LUT tensor to the attributes to be able to later tell which tensor is the LUT
    if has_lut:
        id_attrs["lut"] = lut

    identity = te.compute(
        ifm.shape,
        lambda *i: (dmaed_ifm(*i) + lut_expr).astype(ifm.dtype),
        name="ethosu_identity",
        attrs=id_attrs,
    )

    length = len(ifm.shape)
    ifm_matrix = np.identity(length + 1)
    offset = np.zeros(length, dtype="int64")
    ifm_propagator = Propagator(
        ifm_matrix,
        offset.tolist(),
    )
    propagator_attrs = {
        "ifm_propagator": ifm_propagator,
    }

    return write_compute(identity, ofm_zero_point, ofm_scale, attrs=propagator_attrs)

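# Illustrative sketch (not part of the original source): the identity
# operator's propagator is the (length + 1) x (length + 1) identity matrix with
# a zero offset, so any output stripe maps to the identical input stripe.
# A hypothetical 4-D stripe is shown below.
import numpy as np

_ifm_matrix = np.identity(4 + 1)
_output_stripe = np.array([1, 8, 8, 16, 1])
assert list(_ifm_matrix @ _output_stripe) == [1, 8, 8, 16, 1]
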