def _create_npu_op_pooling(serial_pooling: spec.SerialPooling):
    pooling_type = serial_pooling.pooling_type
    if pooling_type == "AVG":
        npu_pooling_op = vapi.NpuPoolingOp.AVERAGE
    elif pooling_type == "MAX":
        npu_pooling_op = vapi.NpuPoolingOp.MAX

    npu_pooling_op = vapi.NpuPoolingOperation(npu_pooling_op)
    npu_pooling_op.ifm = _create_npu_feature_map(serial_pooling.ifm)
    npu_pooling_op.ofm = _create_npu_feature_map(serial_pooling.ofm)
    npu_pooling_op.kernel = _create_npu_kernel(serial_pooling.pool_shape)
    npu_pooling_op.padding = _create_npu_padding(serial_pooling.padding)

    npu_pooling_op.activation = _create_npu_activation(serial_pooling.activation)
    if (
        npu_pooling_op.activation
        and npu_pooling_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU
    ):
        _convert_clip_bounds(npu_pooling_op)

    npu_pooling_op.rounding_mode = _create_npu_rounding_mode(serial_pooling.rounding_mode)
    npu_pooling_op.ifm_upscale = _create_npu_resampling_mode(serial_pooling.upscale)

    npu_pooling_op.block_config = _create_npu_block_config(serial_pooling.block_config)
    if not npu_pooling_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(npu_pooling_op, target_accel_config)
        npu_pooling_op.block_config = block_config

    return npu_pooling_op
def _create_npu_op_depthwise_conv2d(serial_2d_depthwise):
    npu_depthwise_conv2d_op = vapi.NpuConvDepthWiseOperation()

    npu_depthwise_conv2d_op.ifm = _create_npu_feature_map(serial_2d_depthwise.ifm)
    npu_depthwise_conv2d_op.ofm = _create_npu_feature_map(serial_2d_depthwise.ofm)
    npu_depthwise_conv2d_op.kernel = _create_npu_kernel(serial_2d_depthwise.kernel)
    npu_depthwise_conv2d_op.weights = [_create_npu_address_range(serial_2d_depthwise.weight)]
    weights_zero_point = np.int64(serial_2d_depthwise.weight_zero_point.value)
    npu_depthwise_conv2d_op.biases = [_create_npu_address_range(serial_2d_depthwise.scale_bias)]
    npu_depthwise_conv2d_op.padding = _create_npu_padding(serial_2d_depthwise.padding)

    npu_depthwise_conv2d_op.activation = _create_npu_activation(serial_2d_depthwise.activation)
    if (
        npu_depthwise_conv2d_op.activation
        and npu_depthwise_conv2d_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU
    ):
        _convert_clip_bounds(npu_depthwise_conv2d_op)

    npu_depthwise_conv2d_op.rounding_mode = _create_npu_rounding_mode(
        serial_2d_depthwise.rounding_mode
    )
    npu_depthwise_conv2d_op.ifm_upscale = _create_npu_resampling_mode(serial_2d_depthwise.upscale)

    npu_depthwise_conv2d_op.block_config = _create_npu_block_config(
        serial_2d_depthwise.block_config
    )
    if not npu_depthwise_conv2d_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_depthwise_conv2d_op, target_accel_config
        )
        npu_depthwise_conv2d_op.block_config = block_config

    return npu_depthwise_conv2d_op, weights_zero_point
def _create_npu_op_conv2d(
    serial_2d_convolution: spec.Serial2DConvolution,
) -> Tuple[vapi.NpuConv2DOperation, int]:
    """This is a helper function to capture a list of arguments
    to create a Vela NpuConv2DOperation object.
    """
    has_two_weights = serial_2d_convolution.weight2.address != -1
    has_two_biases = serial_2d_convolution.scale_bias2.address != -1

    npu_conv2d_op = vapi.NpuConv2DOperation()
    npu_conv2d_op.ifm = _create_npu_feature_map(serial_2d_convolution.ifm)
    npu_conv2d_op.ofm = _create_npu_feature_map(serial_2d_convolution.ofm)
    npu_conv2d_op.kernel = _create_npu_kernel(serial_2d_convolution.kernel)
    npu_conv2d_op.weights = (
        [
            _create_npu_address_range(serial_2d_convolution.weight),
            _create_npu_address_range(serial_2d_convolution.weight2),
        ]
        if has_two_weights
        else [_create_npu_address_range(serial_2d_convolution.weight)]
    )
    weights_zero_point = np.int64(serial_2d_convolution.weight_zero_point.value)
    npu_conv2d_op.biases = (
        [
            _create_npu_address_range(serial_2d_convolution.scale_bias),
            _create_npu_address_range(serial_2d_convolution.scale_bias2),
        ]
        if has_two_biases
        else [_create_npu_address_range(serial_2d_convolution.scale_bias)]
    )
    npu_conv2d_op.padding = _create_npu_padding(serial_2d_convolution.padding)

    npu_conv2d_op.activation = _create_npu_activation(serial_2d_convolution.activation)
    if (
        npu_conv2d_op.activation
        and npu_conv2d_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU
    ):
        _convert_clip_bounds(npu_conv2d_op)

    npu_conv2d_op.rounding_mode = _create_npu_rounding_mode(serial_2d_convolution.rounding_mode)
    npu_conv2d_op.ifm_upscale = _create_npu_resampling_mode(serial_2d_convolution.upscale)

    weights_shape_ohwi = [
        npu_conv2d_op.ofm.shape.depth,
        npu_conv2d_op.kernel.height,
        npu_conv2d_op.kernel.width,
        npu_conv2d_op.ifm.shape.depth,
    ]
    npu_conv2d_op.block_traversal = vela_api.calculate_block_traversal_mode(
        is_depthwise=False,
        weights_shape_ohwi=weights_shape_ohwi,
        ifm_bitdepth=npu_conv2d_op.ifm.data_type.size_in_bits(),
    )

    npu_conv2d_op.block_config = _create_npu_block_config(serial_2d_convolution.block_config)
    if not npu_conv2d_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(npu_conv2d_op, target_accel_config)
        npu_conv2d_op.block_config = block_config

    return npu_conv2d_op, weights_zero_point
def _create_npu_op_unary_elementwise(serial_unary_elementwise):
    operator_type = serial_unary_elementwise.operator_type
    if operator_type == "ABS":
        op = vapi.NpuElementWiseOp.ABS
    elif operator_type == "CLZ":
        op = vapi.NpuElementWiseOp.CLZ

    npu_unary_elementwise_op = vapi.NpuElementWiseOperation(op)
    npu_unary_elementwise_op.ifm = _create_npu_feature_map(serial_unary_elementwise.ifm)
    npu_unary_elementwise_op.ofm = _create_npu_feature_map(serial_unary_elementwise.ofm)

    npu_unary_elementwise_op.activation = _create_npu_activation(
        serial_unary_elementwise.activation
    )
    if (
        npu_unary_elementwise_op.activation
        and npu_unary_elementwise_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU
    ):
        _convert_clip_bounds(npu_unary_elementwise_op)

    npu_unary_elementwise_op.rounding_mode = _create_npu_rounding_mode(
        serial_unary_elementwise.rounding_mode
    )

    npu_unary_elementwise_op.block_config = _create_npu_block_config(
        serial_unary_elementwise.block_config
    )
    if not npu_unary_elementwise_op.block_config:
        target_accel_type = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_unary_elementwise_op, target_accel_type
        )
        npu_unary_elementwise_op.block_config = block_config

    return npu_unary_elementwise_op
def translate(tir_module, params):
    """This will take a TIR module for the NPU and compile it to a command stream.

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map

    Returns
    -------
    cs : str
        A hex string of the bytes of the command stream
    encoded_constants : str
        A hex string of the bytes that includes concat'd
        encoded weights, encoded biases and scales.
    base_addresses : List[util.BaseAddress]
        base addresses to be used by the driver
    """
    # The NPU has 6 usable regions, ranging from 0 to 5.
    # Regions 0, 3 and 4 are already used for input,
    # output and constants, respectively (see _get_regions()).
    # Thus, for scratch we are left with regions 5, 2 and 1.
    candidate_regions_for_scratch = [5, 2, 1]
    (
        scratch_region_map,
        dynamic_allocation_size,
        dynamic_allocation_region,
    ) = analyze_scratch_memory_acesses(tir_module, candidate_regions_for_scratch)
    buffer_info = extract_buffer_info(tir_module, params)
    call_extern_list = extract_call_extern_list(tir_module)
    _npu_ops = list()
    for call_extern in call_extern_list:
        _npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
    _npu_ops, constant_data = assign_addresses(buffer_info, _npu_ops, scratch_region_map)
    base_addresses = extract_param_base_addresses(tir_module, buffer_info, scratch_region_map)
    if dynamic_allocation_size:
        base_addresses.append(
            util.BaseAddress(
                name="dynamic_allocation",
                primfunc_param_idx=None,
                region=dynamic_allocation_region,
                size=dynamic_allocation_size,
                is_runtime_allocation=True,
            )
        )
    target_accel_config = vela_api.get_accelerator_config()
    cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_config)
    payload = vapi.npu_create_driver_payload(cmds, target_accel_config)
    return payload.hex(), constant_data, base_addresses
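# Hedged usage sketch (added for illustration, not part of the original source):
# how a caller might consume the outputs of `translate` above. The argument
# names below are placeholders for a lowered NPU PrimFunc module and its
# param-index -> constant NDArray map.
def _example_translate_usage(lowered_tir_module, const_dict):
    cs_hex, constants_hex, base_addresses = translate(lowered_tir_module, const_dict)
    # Per the docstring, both strings are hex-encoded byte sequences.
    command_stream = bytes.fromhex(cs_hex)
    encoded_constants = bytes.fromhex(constants_hex)
    return command_stream, encoded_constants, base_addresses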
def _create_npu_op_binary_elementwise(serial_binary_elementwise: spec.SerialBinaryElementwise):
    operator_type = serial_binary_elementwise.operator_type
    if operator_type == "ADD":
        op = vapi.NpuElementWiseOp.ADD
    elif operator_type == "SUB":
        op = vapi.NpuElementWiseOp.SUB
    elif operator_type == "MUL":
        op = vapi.NpuElementWiseOp.MUL
    elif operator_type == "MIN":
        op = vapi.NpuElementWiseOp.MIN
    elif operator_type == "MAX":
        op = vapi.NpuElementWiseOp.MAX
    elif operator_type == "SHR":
        op = vapi.NpuElementWiseOp.SHR
    elif operator_type == "SHL":
        op = vapi.NpuElementWiseOp.SHL

    npu_binary_elementwise_op = vapi.NpuElementWiseOperation(op)
    npu_binary_elementwise_op.ifm = _create_npu_feature_map(serial_binary_elementwise.ifm)
    npu_binary_elementwise_op.ifm2 = _create_npu_feature_map(serial_binary_elementwise.ifm2)
    npu_binary_elementwise_op.ofm = _create_npu_feature_map(serial_binary_elementwise.ofm)
    npu_binary_elementwise_op.reversed_operands = serial_binary_elementwise.reversed_operands

    npu_binary_elementwise_op.activation = _create_npu_activation(
        serial_binary_elementwise.activation
    )
    if (
        npu_binary_elementwise_op.activation
        and npu_binary_elementwise_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU
    ):
        _convert_clip_bounds(npu_binary_elementwise_op)

    npu_binary_elementwise_op.rounding_mode = _create_npu_rounding_mode(
        serial_binary_elementwise.rounding_mode
    )

    npu_binary_elementwise_op.block_config = _create_npu_block_config(
        serial_binary_elementwise.block_config
    )
    if not npu_binary_elementwise_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_binary_elementwise_op, target_accel_config
        )
        npu_binary_elementwise_op.block_config = block_config

    return npu_binary_elementwise_op
def translate(tir_module, params):
    """This will take a TIR module for the NPU and compile it to a command stream.

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map

    Returns
    -------
    cs : str
        A hex string of the bytes of the command stream
    encoded_constants : str
        A hex string of the bytes that includes concat'd
        encoded weights, encoded biases and scales.
    base_addresses : List[util.BaseAddress]
        base addresses to be used by the driver
    """
    buffer_info = extract_buffer_info(tir_module, params)
    call_extern_list = extract_call_extern_list(tir_module)
    _npu_ops = list()
    for call_extern in call_extern_list:
        _npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
    _npu_ops, constant_data, scratch_size = assign_addresses(buffer_info, _npu_ops)
    base_addresses = extract_param_base_addresses(tir_module, buffer_info)
    if scratch_size > 0:
        base_addresses.append(
            util.BaseAddress(
                "scratch",
                None,
                _REGION_MAP[BufferType.scratch],
                scratch_size,
                True,
            )
        )
    target_accel_config = vela_api.get_accelerator_config()
    cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_config)
    payload = vapi.npu_create_driver_payload(cmds, target_accel_config)
    return payload.hex(), constant_data, base_addresses
def _visit(stmt):
    new_args = []
    # We don't want to divide the constant that will be executed on two cores in parallel
    is_u65_conv2d = (
        vela_api.get_accelerator_config() == vapi.NpuAccelerator.Ethos_U65_512
        and stmt.args[0] == "ethosu_conv2d"
    )
    for i, arg in enumerate(stmt.args):
        if isinstance(arg, tvm.tir.expr.BufferLoad):
            # If we're trying to load a buffer that maps to a constant
            if arg.buffer.data in buffer_to_const:
                const = buffer_to_const[arg.buffer.data]
                flattened_const_shape = np.prod(const.shape)

                offset = int(arg.indices[0])
                # Note by convention the arg after a constant read is the length of the read
                length = int(stmt.args[i + 1])
                # If it's anything other than a full read, create a new buffer
                if (offset != 0 or flattened_const_shape != length) and not is_u65_conv2d:
                    out_channels = const.shape[0]
                    offset_channels = int((offset * out_channels) / flattened_const_shape)
                    length_channels = int((length * out_channels) / flattened_const_shape)
                    # split the constant up across channels
                    split_const = np.split(const, out_channels, axis=0)
                    # create a new const out of the channels we want to keep
                    new_const = np.concatenate(
                        split_const[offset_channels : offset_channels + length_channels], axis=0
                    )
                    new_consts.append(new_const)
                    new_buffer = tvm.tir.decl_buffer(
                        (length,), arg.dtype, scope=arg.buffer.scope()
                    )
                    new_buffers.append(new_buffer)
                    new_args.append(tvm.tir.expr.BufferLoad(new_buffer, [0]))
                    continue
                keep_buffers.add(arg.buffer.data)

        new_args.append(arg)

    return tvm.tir.Call(stmt.dtype, stmt.op, new_args, stmt.span)
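# Illustrative note (added): the channel arithmetic in `_visit` above maps a
# flattened (offset, length) read back to whole output channels. For example,
# with a constant of shape (8, ...) and 1024 flattened elements, a read at
# offset=256 with length=512 gives offset_channels = 256 * 8 // 1024 = 2 and
# length_channels = 512 * 8 // 1024 = 4, so channels 2..5 make up the new,
# smaller constant buffer.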
def translate(tir_module, params):
    """This will take a TIR module for the NPU and compile it to a command stream.

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map

    Returns
    -------
    cs : str
        A hex string of the bytes of the command stream
    encoded_constants : str
        A hex string of the bytes that includes concat'd
        encoded weights, encoded biases and scales.
    scratch_size : int
        The size of the scratch buffer needed.
    """
    buffer_info = extract_buffer_info(tir_module, params)
    call_extern_list = extract_call_extern_list(tir_module)
    _npu_ops = list()
    for call_extern in call_extern_list:
        _npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
    _npu_ops, constant_tensor, scratch_size = assign_addresses(buffer_info, _npu_ops)
    target_accel_config = vela_api.get_accelerator_config()
    cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_config)
    payload = vapi.npu_create_driver_payload(cmds, target_accel_config)
    hex_value = "" if constant_tensor is None else constant_tensor.tobytes().hex()
    return payload.hex(), hex_value, scratch_size
def EncodeConstants(const_dict):
    """the NPU requires that weights are compressed and bias/scales are 'encoded', both
    of which are performed by this pass.

    This pass modifies both the constant dict to contain the post-encoding values of the
    constants and the IR to adjust buffer types/sizes/accesses so they align with the
    encoded constants. Calls to the Vela API are made to perform the actual compression/
    encoding.

    """
    new_const_dict = {}
    buffer_to_const = {}
    pointer_to_buffer = {}
    rewrite_buffer = {}
    rewrite_pointer = {}
    accel_config = vela_api.get_accelerator_config()

    def _align_scale_bias(tir_extern_call, bias):
        """Align the scale_bias to 16 bytes."""
        value_bytes = bytearray()
        value_bytes.extend(bias.tobytes())
        # Align to 16
        remainder = (len(value_bytes)) % 16
        if remainder > 0:
            value_bytes.extend(bytearray(16 - remainder))
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    def _encode_weights(tir_extern_call, weights):
        """Encode the weights for a TIR extern call."""
        value_bytes = vela_api.encode_weights(tir_extern_call, weights, accel_config)
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    def _new_buffer(old_buffer, new_value):
        """Create a new buffer and add the old buffer and its pointer to the
        rewriting maps."""
        new_buffer = tvm.tir.decl_buffer((len(new_value),), str(new_value.dtype))
        pointer_to_buffer[new_buffer.data] = new_buffer
        rewrite_buffer[old_buffer] = new_buffer
        rewrite_pointer[old_buffer.data] = new_buffer.data
        buffer_to_const[new_buffer] = new_value

    def _visit_encode_pre(stmt):
        if isinstance(stmt, tvm.tir.Call):
            # Handle copies as a special-case by propagating the buffer information
            # from the read to the write pointer.
            if stmt.args[0] == "ethosu_copy":
                read_pointer = stmt.args[1].buffer_var
                if read_pointer in pointer_to_buffer:
                    write_pointer = stmt.args[3].buffer_var
                    # Assert writing to the base of the write_var (pre-StorageRewrite)
                    assert stmt.args[3].index == 0
                    assert stmt.args[1].index == 0
                    pointer_to_buffer[write_pointer] = pointer_to_buffer[read_pointer]
            else:
                # Encode the weights
                weights_pointer = get_weights_pointer(stmt)
                if weights_pointer is not None:
                    assert weights_pointer in pointer_to_buffer
                    weights_buffer = pointer_to_buffer[weights_pointer]
                    weights_value = buffer_to_const[weights_buffer]
                    new_weights_value = _encode_weights(stmt, weights_value)
                    _new_buffer(weights_buffer, new_weights_value)
                # Align the scale_bias to 16 bytes
                scale_bias_pointer = get_scale_bias_pointer(stmt)
                if scale_bias_pointer is not None:
                    assert scale_bias_pointer in pointer_to_buffer
                    scale_bias_buffer = pointer_to_buffer[scale_bias_pointer]
                    scale_bias_value = buffer_to_const[scale_bias_buffer]
                    new_scale_bias_value = _align_scale_bias(stmt, scale_bias_value)
                    _new_buffer(scale_bias_buffer, new_scale_bias_value)

    def _visit_encode_post(stmt):
        # Because encoding may change the data type (e.g. bias to uint8) and type information
        # is stored in pointer vars, it's necessary to rewrite all the pointers which point
        # to encoded data.
        if isinstance(stmt, tvm.tir.Allocate):
            allocate_pointer = stmt.buffer_var
            if allocate_pointer in pointer_to_buffer:
                buffer = pointer_to_buffer[allocate_pointer]
                if buffer in rewrite_buffer:  # If the pointer needs rewriting
                    # Create a new pointer var with the type of the new buffer
                    new_buffer = rewrite_buffer[buffer]
                    storage_type = tvm.ir.PrimType(new_buffer.dtype)
                    new_pointer = tvm.tir.Var(
                        allocate_pointer.name,
                        tvm.ir.PointerType(storage_type, buffer.scope()),
                        allocate_pointer.span,
                    )
                    # Set the new pointer to resolve to the new buffer
                    pointer_to_buffer[new_pointer] = new_buffer
                    # Add the old pointer to the pointer rewriting dict
                    rewrite_pointer[allocate_pointer] = new_pointer

    def _visit_rewrite(stmt):
        if isinstance(stmt, tvm.tir.Call):
            # For extern calls, we need to rewrite pairs of arguments corresponding to
            # base address load and the length of the load.
            new_args = [stmt.args[0]]
            new_buffers = rewrite_buffer.values()
            for i in range(1, len(stmt.args)):
                # If the previous argument was a load, the current should be a length
                if isinstance(stmt.args[i - 1], tvm.tir.Load):
                    load = stmt.args[i - 1]
                    pointer = load.buffer_var
                    if pointer in pointer_to_buffer:
                        buffer = pointer_to_buffer[pointer]
                        # Only rewrite the arguments of buffers that have been encoded
                        if buffer in new_buffers:
                            new_arg = np.prod(list(pointer_to_buffer[pointer].shape))
                            new_args.append(new_arg)
                            continue
                new_args.append(stmt.args[i])

            return tvm.tir.Call(stmt.dtype, stmt.op, new_args, stmt.span)
        if isinstance(stmt, tvm.tir.Allocate):
            # Where a pointer needs rewriting, the allocate for it must be rewritten
            allocate_pointer = stmt.buffer_var
            if allocate_pointer in pointer_to_buffer:
                if pointer_to_buffer[allocate_pointer] in rewrite_buffer:
                    new_buffer = rewrite_buffer[pointer_to_buffer[allocate_pointer]]
                    new_pointer = rewrite_pointer[allocate_pointer]
                    return tvm.tir.Allocate(
                        new_pointer,
                        new_buffer.dtype,
                        new_buffer.shape,
                        stmt.condition,
                        stmt.body,
                        stmt.span,
                    )
        # The following rewrites would be better expressed by just rewriting the Vars, however
        # ir_transform doesn't seem to visit Vars. So instead we do the next best thing and
        # rewrite the nodes which contain the Vars.
        if isinstance(stmt, tvm.tir.Load):
            load_pointer = stmt.buffer_var
            if load_pointer in rewrite_pointer:
                new_pointer = rewrite_pointer[load_pointer]
                element_type = new_pointer.type_annotation.element_type.dtype
                return tvm.tir.Load(
                    element_type, new_pointer, stmt.index, stmt.predicate, stmt.span
                )
        if isinstance(stmt, tvm.tir.AttrStmt):
            node_pointer = stmt.node
            if node_pointer in rewrite_pointer:
                return tvm.tir.AttrStmt(
                    rewrite_pointer[node_pointer], stmt.attr_key, stmt.value, stmt.body, stmt.span
                )
        return None

    def _ftransform(f, mod, ctx):
        for i, param in enumerate(f.params):
            if i in const_dict:
                buffer_to_const[f.buffer_map[param]] = const_dict[i].flatten()
                pointer_to_buffer[f.buffer_map[param].data] = f.buffer_map[param]

        # First analyse what needs to be rewritten
        new_body = tvm.tir.stmt_functor.ir_transform(
            f.body, _visit_encode_pre, _visit_encode_post, ["tir.Call", "tir.Allocate"]
        )
        # Then perform the rewrites
        new_body = tvm.tir.stmt_functor.ir_transform(
            f.body, None, _visit_rewrite, ["tir.Call", "tir.Allocate", "tir.Load", "tir.AttrStmt"]
        )
        new_buffer_map = {}
        # Rewrite the buffer map and const dict to instead use the encoded versions
        for i, param in enumerate(f.params):
            buffer = f.buffer_map[param]
            if buffer in rewrite_buffer:
                new_buffer = rewrite_buffer[buffer]
                new_buffer_map[param] = new_buffer
                new_value = buffer_to_const[new_buffer]
                new_const_dict[i] = new_value
            elif buffer in buffer_to_const:
                new_const_dict[i] = buffer_to_const[buffer]
                new_buffer_map[param] = buffer
            else:
                new_buffer_map[param] = buffer

        new_f = tvm.tir.PrimFunc(f.params, new_body, f.ret_type, new_buffer_map, f.attrs, f.span)
        return new_f

    def _encode_constants(mod):
        mod, divided_const_dict = DivideConstants(const_dict)(mod)
        const_dict.clear()
        for key, value in divided_const_dict.items():
            const_dict[key] = value
        transform_func = tvm.tir.transform.prim_func_pass(
            _ftransform, opt_level=0, name="tir.ethosu.encode_constants"
        )
        new_func = transform_func(mod)
        return new_func, new_const_dict

    return _encode_constants
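# Hedged usage sketch (added for illustration): the pass returned by
# EncodeConstants is applied to a module in the same way DivideConstants is
# applied inside _encode_constants above. `mod` and `const_dict` are
# placeholders for a lowered TIR module and its param-index -> constant map.
def _example_encode_constants_usage(mod, const_dict):
    encoded_mod, encoded_const_dict = EncodeConstants(const_dict)(mod)
    return encoded_mod, encoded_const_dict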
def collect_encoding_definitions(stmt, old_buffer_to_const):
    # Map from copy destination to copy source.
    copy_map = {}
    # List of buffer copies that occurred
    copied_buffers = []
    # List of encoded buffer information
    constant_buffer_replacements = []

    def _align_scale_bias(tir_extern_call, bias):
        """Align the scale_bias to 16 bytes."""
        value_bytes = bytearray()
        value_bytes.extend(bias.tobytes())
        # Align to 16
        remainder = (len(value_bytes)) % 16
        if remainder > 0:
            value_bytes.extend(bytearray(16 - remainder))
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    accel_config = vela_api.get_accelerator_config()

    def _encode_weights(tir_extern_call, weights):
        """Encode the weights for a TIR extern call."""
        value_bytes = vela_api.encode_weights(tir_extern_call, weights, accel_config)
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    def _declare_constant_buffer(old_buffer, encoded_constants):
        """Create a new buffer and add the old buffer and its pointer to the
        rewriting maps."""
        new_buffer = tvm.tir.decl_buffer(
            shape=[len(encoded_constants)],
            dtype=str(encoded_constants.dtype),
            name=old_buffer.name + "_encoded",
            scope=old_buffer.scope(),
        )

        constant_buffer_replacements.append(
            {
                "old_buffer": old_buffer,
                "new_buffer": new_buffer,
                "encoded_constants": encoded_constants,
            }
        )

    def _visit(stmt):
        if isinstance(stmt, tvm.tir.Call):
            # Handle copies as a special-case by propagating the buffer information
            # from the read to the write pointer.
            if stmt.args[0] == "ethosu_copy":
                read_buffer = stmt.args[1].buffer
                write_buffer = stmt.args[3].buffer
                # Assert writing to the base of the write_var (pre-StorageRewrite)
                assert list(stmt.args[3].indices) == [0]
                assert list(stmt.args[1].indices) == [0]
                copied_buffers.append({"source": read_buffer, "dest": write_buffer})
                copy_map[write_buffer] = read_buffer
            else:
                # Encode the weights
                weights_buffer = get_weights_buffer(stmt)
                if weights_buffer is not None:
                    if weights_buffer in copy_map:
                        weights_buffer = copy_map[weights_buffer]
                    unencoded_weights_value = old_buffer_to_const[weights_buffer]
                    encoded_weights_value = _encode_weights(stmt, unencoded_weights_value)
                    _declare_constant_buffer(weights_buffer, encoded_weights_value)

                # Align the scale_bias to 16 bytes
                scale_bias_buffer = get_scale_bias_buffer(stmt)
                if scale_bias_buffer is not None:
                    if scale_bias_buffer in copy_map:
                        scale_bias_buffer = copy_map[scale_bias_buffer]
                    scale_bias_value = old_buffer_to_const[scale_bias_buffer]
                    aligned_scale_bias_value = _align_scale_bias(stmt, scale_bias_value)
                    _declare_constant_buffer(scale_bias_buffer, aligned_scale_bias_value)

    tvm.tir.stmt_functor.post_order_visit(stmt, _visit)

    return {
        "copied_buffers": copied_buffers,
        "constant_buffer_replacements": constant_buffer_replacements,
    }
def collect_encoding_definitions(stmt, old_buffer_to_const):
    # Map from copy destination to copy source.
    copy_map = {}
    # List of buffer copies that occurred
    copied_buffers = []
    # List of encoded buffer information
    constant_buffer_replacements = []

    def _align_scale_bias(tir_extern_call, bias):
        """Align the scale_bias to 16 bytes."""
        value_bytes = bytearray()
        value_bytes.extend(bias.tobytes())
        # Align to 16
        remainder = (len(value_bytes)) % 16
        if remainder > 0:
            value_bytes.extend(bytearray(16 - remainder))
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    accel_config = vela_api.get_accelerator_config()

    def _encode_weights(tir_extern_call, weights):
        """Encode the weights for a TIR extern call."""
        value_bytes = vela_api.encode_weights(tir_extern_call, weights, accel_config)
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    def _declare_constant_buffer(old_buffer, encoded_constants, split_idx):
        """Create a new buffer and add the old buffer and its pointer to the
        rewriting maps."""
        new_buffer = tvm.tir.decl_buffer(
            shape=[len(encoded_constants)],
            dtype=str(encoded_constants.dtype),
            name=old_buffer.name + "_encoded",
            scope=old_buffer.scope(),
        )

        constant_buffer_replacements.append(
            {
                "old_buffer": old_buffer,
                "new_buffer": new_buffer,
                "encoded_constants": encoded_constants,
                "split_idx": split_idx,
            }
        )

    def _encode_weights_or_bias(buffer1, buffer2, stmt, encode_func):
        """Encode the weights or align the bias either for one or two cores,
        depending on the variant."""
        constant = old_buffer_to_const[buffer1]

        # If we have just one core, encode the whole constant
        if buffer2 is None:
            new_const = encode_func(stmt, constant)
            return new_const, None

        # Assume that the constant tensor has not been flattened yet
        assert len(constant.shape) != 1
        channels = constant.shape[0]
        split_const = np.split(constant, channels, axis=0)

        const_list = [split_const[i] for i in range(channels) if i % 2 == 0]
        const_to_encode = np.concatenate(const_list, axis=0)

        new_const = encode_func(stmt, const_to_encode)
        split_idx = len(new_const)

        # Encode half of the constant separately for the other core if it exists
        assert buffer1.same_as(buffer2)
        const2_list = [split_const[i] for i in range(channels) if i % 2 == 1]
        const2_to_encode = np.concatenate(const2_list, axis=0)

        new_const2 = encode_func(stmt, const2_to_encode)
        new_const = np.append(new_const, new_const2).astype("uint8")

        return new_const, split_idx

    def _visit(stmt):
        if isinstance(stmt, tvm.tir.Call):
            op = str(stmt.args[0].value)
            # Handle copies as a special-case by propagating the buffer information
            # from the read to the write pointer.
            if op == "ethosu_copy":
                read_buffer = stmt.args[1].buffer
                write_buffer = stmt.args[3].buffer
                # Assert writing to the base of the write_var (pre-StorageRewrite)
                assert list(stmt.args[3].indices) == [0]
                assert list(stmt.args[1].indices) == [0]
                copied_buffers.append({"source": read_buffer, "dest": write_buffer})
                copy_map[write_buffer] = read_buffer

            ops_with_weights = {
                "ethosu_conv2d": tirtocs.translate_ethosu_conv2d,
                "ethosu_depthwise_conv2d": tirtocs.translate_ethosu_depthwise_conv2d,
            }
            if op in ops_with_weights:
                npu_op, _ = ops_with_weights[op](stmt)

                # Encode the weights
                weights_buffer = npu_op.weights[0].address.buffer
                if weights_buffer in copy_map:
                    weights_buffer = copy_map[weights_buffer]

                # In case of the U65 512 mac variant the weights are split across two cores
                # and need to be encoded separately
                weights2_buffer = (
                    npu_op.weights[1].address.buffer
                    if accel_config == vapi.NpuAccelerator.Ethos_U65_512
                    else None
                )
                if weights2_buffer in copy_map:
                    weights2_buffer = copy_map[weights2_buffer]

                new_weights, split_idx = _encode_weights_or_bias(
                    weights_buffer, weights2_buffer, stmt, _encode_weights
                )
                _declare_constant_buffer(weights_buffer, new_weights, split_idx)

                # Align the scale_bias to 16 bytes
                scale_bias_buffer = npu_op.biases[0].address.buffer
                if scale_bias_buffer in copy_map:
                    scale_bias_buffer = copy_map[scale_bias_buffer]
                scale_bias2_buffer = (
                    npu_op.biases[1].address.buffer
                    if accel_config == vapi.NpuAccelerator.Ethos_U65_512
                    else None
                )
                if scale_bias2_buffer in copy_map:
                    scale_bias2_buffer = copy_map[scale_bias2_buffer]

                new_scale_bias, split_idx = _encode_weights_or_bias(
                    scale_bias_buffer, scale_bias2_buffer, stmt, _align_scale_bias
                )
                _declare_constant_buffer(scale_bias_buffer, new_scale_bias, split_idx)

    tvm.tir.stmt_functor.post_order_visit(stmt, _visit)

    return {
        "copied_buffers": copied_buffers,
        "constant_buffer_replacements": constant_buffer_replacements,
    }
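# Illustrative note (added): for the Ethos-U65 512 mac variant,
# `_encode_weights_or_bias` above splits a constant with, say, 4 output
# channels so that channels [0, 2] are encoded for one core and channels
# [1, 3] for the other; `split_idx` records the byte offset where the second
# core's encoded data begins in the concatenated result. For single-core
# variants the whole constant is encoded once and `split_idx` is None.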