def load(
        checkpoint_file,
        arch,
        fc_layer,
        quantization,
        bias_quantization,
        output_shift,
        kernel_size,
        operator,
        verbose=False,
        no_bias=None,
        conv_groups=None,
):
    """
    Load weights and biases from `checkpoint_file`. If `arch` is not None and does not match
    the architecture in the checkpoint file, abort with an error message.
    If `fc_layer` is `True`, configure a single fully connected classification layer for
    software rather than hardware.
    `quantization` is a list of expected bit widths for the layer weights (always 8 for AI84).
    This value is checked against the weight inputs.
    `bias_quantization` is a list of the expected bit widths for the layer biases
    (always 8 for AI84/AI85).
    In addition to returning weights and biases, this function configures the network output
    channels and the number of layers.
    When `verbose` is set, display the shapes of the weights.
    """
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []

    checkpoint = torch.load(checkpoint_file, map_location='cpu')
    print(f'Reading {checkpoint_file} to configure network weights...')

    if 'state_dict' not in checkpoint or 'arch' not in checkpoint:
        raise RuntimeError("\nNo `state_dict` or `arch` in checkpoint file.")

    if arch and checkpoint['arch'].lower() != arch.lower():
        eprint(f"Network architecture of configuration file ({arch}) does not match "
               f"network architecture of checkpoint file ({checkpoint['arch']}).")
        sys.exit(1)

    checkpoint_state = checkpoint['state_dict']
    layers = 0
    num_conv_layers = len(quantization)
    have_fc_layer = False
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    error_exit = False
    seq = 0

    for _, k in enumerate(checkpoint_state.keys()):
        # Skip over non-weight layers
        while seq < len(operator) and operator[seq] == opn.NONE:
            seq += 1

        operation, parameter = k.rsplit(sep='.', maxsplit=1)
        if parameter in ['weight']:
            module, op = k.split(sep='.', maxsplit=1)
            op = op.rsplit(sep='.', maxsplit=1)[0]
            if module != 'fc' or module == 'fc' and not fc_layer:
                if layers >= num_conv_layers or seq >= num_conv_layers:
                    continue
                w = checkpoint_state[k].numpy().astype(np.int64)
                w_min, w_max = w.min(), w.max()

                # Determine quantization or make sure that what was given fits
                if quantization[seq] is not None:
                    assert w_min >= -(2**(quantization[seq]-1))
                    assert w_max < 2**(quantization[seq]-1)
                else:
                    if w_max > 0:
                        w_max_m = int(w_max)
                    else:
                        w_max_m = int(abs(w_max)) - 1
                    if w_min > 0:
                        w_min_m = int(w_min)
                    else:
                        w_min_m = int(abs(w_min)) - 1
                    quantization[seq] = 1 << (fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                    assert quantization[seq] <= 8
                quant.append(quantization[seq])

                weight_min.append(w_min)
                weight_max.append(w_max)

                if op == 'conv2d' and operator[seq] == opn.CONVTRANSPOSE2D:
                    # For ConvTranspose2d, flip the weights as follows:
                    w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

                mult = conv_groups[seq] if operator[seq] != opn.CONVTRANSPOSE2D else 1
                input_channels.append(w.shape[1] * mult)  # Input channels
                mult = conv_groups[seq] if operator[seq] == opn.CONVTRANSPOSE2D else 1
                output_channels.append(w.shape[0] * mult)  # Output channels

                if len(w.shape) == 2:  # MLP
                    if kernel_size[seq][0] != 1 or kernel_size[seq][1] != 1:
                        eprint(f'The `kernel_size` for the MLP layer {seq} should '
                               f'be set to 1x1 instead of '
                               f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True
                elif len(w.shape) == 3:  # 1D
                    if kernel_size[seq][0] != w.shape[2] or kernel_size[seq][1] != 1:
                        eprint(f'The `kernel_size` for the 1D layer {seq} should '
                               f'be set to {w.shape[2]}x1 instead of '
                               f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True
                elif len(w.shape) == 4:  # 2D
                    if kernel_size[seq][0] != w.shape[2] \
                       or kernel_size[seq][1] != w.shape[3]:
                        eprint(f'The `kernel_size` for the 2D layer {seq} should '
                               f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                               f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True

                w_count = np.prod(w.shape)
                param_count += w_count
                w_size = (w_count * quantization[seq] + 7) // 8
                weight_size.append(w_size)
                param_size += w_size

                if len(w.shape) == 2:  # linear - add dummy 'channel'
                    w = np.expand_dims(w, axis=0)
                else:  # conv1d, conv2d, ... - combine input and output channels
                    w = np.reshape(w, (-1, ) + w.shape[2:])

                weights.append(w)
                weight_keys.append(k)

                # Is there a bias for this layer?
                bias_name = operation + '.bias'
                if bias_name in checkpoint_state and seq not in no_bias:
                    w = checkpoint_state[bias_name].numpy(). \
                        astype(np.int64) // tornadocnn.dev.BIAS_DIV
                    w_min, w_max = w.min(), w.max()
                    assert w_min >= -(2**(bias_quantization[seq]-1))
                    assert w_max < 2**(bias_quantization[seq]-1)

                    bias_min.append(w_min)
                    bias_max.append(w_max)

                    bias.append(w)
                    bias_keys.append(bias_name)
                    bias_quant.append(bias_quantization[seq])
                    w_count = np.prod(w.shape)
                    param_count += w_count
                    w_size = (w_count * 8 + (bias_quantization[seq]-1)) \
                        // bias_quantization[seq]
                    bias_size.append(w_size)
                    param_size += w_size
                else:
                    bias.append(None)
                    bias_min.append(0)
                    bias_max.append(0)
                    bias_keys.append('N/A')
                    bias_quant.append(0)
                    bias_size.append(0)

                # Not overriding output_shift?
                if output_shift[seq] is None:
                    output_shift_name = operation.rsplit(sep='.', maxsplit=1)[0] \
                        + '.output_shift'
                    # Is there an output_shift for this layer?
                    if output_shift_name in checkpoint_state:
                        w = checkpoint_state[output_shift_name].numpy().astype(np.int64)

                        assert len(w) == 1
                        output_shift[seq] = w[0]
                    else:
                        output_shift[seq] = 0

                # Add implicit shift based on quantization
                output_shift[seq] += 8 - quantization[seq]

                layers += 1
                seq += 1
            elif have_fc_layer:
                eprint('The network cannot have more than one fully connected software layer, '
                       'and it must be the output layer.')
                sys.exit(1)
            elif fc_layer:
                w = checkpoint_state[k].numpy().astype(np.int64)
                assert w.min() >= -128 and w.max() <= 127
                fc_weights.append(w)

                # Is there a bias for this layer?
                bias_name = operation + '.bias'
                if bias_name in checkpoint_state:
                    # Do not divide bias for FC
                    w = checkpoint_state[bias_name].numpy().astype(np.int64)
                    assert w.min() >= -128 and w.max() <= 127
                    fc_bias.append(w)
                else:
                    fc_bias.append(None)
                have_fc_layer = True

    if verbose:
        print(f'Checkpoint for epoch {checkpoint["epoch"]}, model {checkpoint["arch"]} - '
              'weight and bias data:')
        print('Layer  InCh OutCh  Weights         Quant Shift  Min  Max   Size '
              'Key                                 Bias       Quant  Min  Max Size Key')
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                if output_shift[ll] is not None:
                    output_shift_shape = output_shift[ll]
                else:
                    output_shift_shape = 'N/A'
                print(f'{ll:4}: '
                      f'{input_channels[ll]:5} {output_channels[ll]:5}  '
                      f'{weight_shape:15} '
                      f'{quant[ll]:5} {output_shift_shape:5} '
                      f'{weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
                      f'{weight_keys[ll]:35} '
                      f'{bias_shape:10} '
                      f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
                      f'{bias_keys[ll]:25}')
        print(f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes')

    if error_exit:
        sys.exit(1)

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
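

# Illustrative sketch (not part of the tool): a stand-alone version of the automatic
# bit-width detection used in load() above. Assumption: the module's `fls()` helper
# returns the index of the highest set bit, i.e. `int.bit_length() - 1`; the local copy
# below clamps inputs <= 0 only to keep this example self-contained. The result is the
# smallest power-of-two width up to 8 bits that holds the signed weight range.
def _example_detect_quantization(w_min, w_max):
    """Hypothetical demo of the weight bit-width detection in load() above."""
    def fls(x):  # assumed behavior of the repo helper; clamped for this example
        return max(int(x), 1).bit_length() - 1

    w_max_m = int(w_max) if w_max > 0 else int(abs(w_max)) - 1
    w_min_m = int(w_min) if w_min > 0 else int(abs(w_min)) - 1
    return 1 << (fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)


# For example, weights spanning -8..7 need a sign bit plus 3 magnitude bits:
#   _example_detect_quantization(-8, 7) == 4
#   _example_detect_quantization(-128, 127) == 8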


def load(
        dataset,
        quantization,
        bias_quantization,  # pylint: disable=unused-argument
        output_shift,
        cfg_layers,
        cfg_weights=None,
        cfg_bias=None,
        no_bias=None,
):
    """
    Return sample weights.
    """
    no_bias = no_bias or []
    weights = []
    fc_weights = []
    fc_bias = []
    output_channels = []
    input_channels = []

    dataset = dataset.lower()

    # Load weights saved using:
    #   w = np.random.randint(-128, 127, (2, 64, 64, 3, 3), dtype=np.int8)
    #   np.save(f'tests/{dataset}', w)

    w = []
    layers = 0
    if cfg_weights is None:
        fname = os.path.join('tests', f'weights_{dataset}.npy')
    else:
        fname = os.path.join('tests', f'{cfg_weights}.npy')
    with open(fname, mode='rb') as file:
        print(f'Reading weights from {fname}...')
        try:
            while True:
                w.append(np.load(file))
                layers += 1
        except ValueError:
            pass

    if layers == 1:  # If the weights file wasn't a list
        w = w[0]
        layers = w.shape[0]

    layers = min(layers, cfg_layers)

    bias = [None] * layers
    if cfg_bias is not None:
        ll = 0
        fname = os.path.join('tests', f'bias_{cfg_bias}.npy')
        with open(fname, mode='rb') as file:
            print(f'Reading bias from {fname}...')
            try:
                while ll < layers:
                    if ll not in no_bias:
                        bias[ll] = np.load(file)
                    ll += 1
            except ValueError:
                pass

    for ll in range(layers):
        # Set to default?
        if quantization[ll] is None:
            quantization[ll] = 8

        # Re-quantize if needed (these are random sample weights, so there is no need
        # to round, etc.)
        max_w = int(w[ll].max())
        if max_w < 0:
            max_w += 1
        min_w = int(w[ll].min())
        if min_w < 0:
            min_w += 1
        current_quant = max(fls(abs(min_w)), fls(abs(max_w))) + 2
        if current_quant > 8:  # Either way, more than 8 bits is an error
            raise ValueError('ERROR: Weight file includes values larger than 8 bit!')
        if current_quant > quantization[ll]:
            w[ll] >>= current_quant - quantization[ll]

        # Specified output_shift?
        if output_shift[ll] is None:
            output_shift[ll] = 0
        # Add based on quantization
        output_shift[ll] += 8 - quantization[ll]

        output_channels.append(w[ll].shape[0])  # Output channels
        input_channels.append(w[ll].shape[1])  # Input channels

        if len(w[ll].shape) == 4:
            weights.append(w[ll].reshape(-1, w[ll].shape[-2], w[ll].shape[-1]))
        else:
            weights.append(w[ll].reshape(-1, w[ll].shape[-1]))

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
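

# A minimal sketch (file name and shapes are hypothetical examples) of the weight-file
# convention read above: several per-layer arrays are appended to a single .npy stream
# with repeated np.save() calls and read back with repeated np.load() until the stream
# is exhausted. The loader above catches ValueError; newer NumPy versions raise
# EOFError at end-of-stream instead, so this sketch catches both.
def _example_weight_stream_roundtrip(fname='weights_example.npy', n_layers=3):
    """Hypothetical round-trip demo for the multi-array weight stream format."""
    with open(fname, mode='wb') as file:
        for _ in range(n_layers):
            np.save(file, np.random.randint(-128, 127, (64, 64, 3, 3), dtype=np.int8))

    w = []
    with open(fname, mode='rb') as file:
        try:
            while True:  # read until the stream runs out
                w.append(np.load(file))
        except (ValueError, EOFError):
            pass
    assert len(w) == n_layers
    return w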


def load(  # pylint: disable=too-many-branches,too-many-statements
        verbose,
        embedded_code,
        device,
        apb,
        start_layer,
        layers,
        operator,
        kernel,
        kernel_size,
        quantization,
        processor_map,
        output_processor_map,
        input_chan,
        output_chan,
        out_expand,
        out_expand_thresh,
        in_expand,
        in_expand_thresh,
        flatten=False,
        mexpress=False,
        verify=False,
        riscv_flash=False,
        quad=False,
        debug=False,
        blocklevel=False,
        legacy_kernels=False,
        calcx4=False,
):
    """
    Stack `kernel` values and write them to C code (for `embedded_code` if `True` or
    RTL simulation). The output is written to the `apb` object.
    Input is configured with `kernel_size`, `quantization`, `layers`, `processor_map`,
    `output_processor_map`, `input_chan`, `output_chan`, `out_expand` and
    `out_expand_thresh`.
    When `mexpress` is `True`, the function uses the memcpy()-friendly hardware
    functionality to reduce the number of transfers. When `verify` is also true
    (mexpress mode only), kernels are read back and compared.
    This function returns the kernel offsets and the kernel lengths for all layers.
    """
    # Kernels: Stack kernels; write only the kernels needed
    proc_kern_max = [0] * tc.dev.MAX_PROC
    kern_offs = [0] * layers
    kern_len = [0] * layers
    kernel_map = np.full((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE),
                         _INVALID_VALUE, dtype=np.int64)
    kernels_used = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE), dtype=np.int64)
    kernel_data = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE, 9), dtype=np.int8)
    # There are four 32-bit words per 9-byte kernel.
    # The value map is initialized with zeros so we can later ignore unused entries and use
    # memcpy() on initialized and uninitialized data.
    kernel_values = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE * _WORDS_PER_KERNEL),
                             dtype=np.int64)
    if debug:
        print('\nLoading Kernels...')

    if calcx4 and not tc.dev.SUPPORT_CALCX4:
        eprint('--calcx4 is not supported on this device.')
        sys.exit(1)
    assert not ((embedded_code or mexpress) and calcx4)  # FIXME Add support later

    for ll in range(start_layer, layers):
        if operator[ll] not in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D]:
            kern_len[ll] = 0
            kern_offs[ll] = 0
            continue

        if flatten[ll]:
            kernel_reshaped = kernel[ll].reshape(
                output_chan[ll] * input_chan[ll],
                -1,
                kernel_size[ll][0],
                kernel_size[ll][1],
            )
        else:
            kernel_reshaped = kernel[ll]

        first_proc = ffs(processor_map[ll])
        last_proc = fls(processor_map[ll])
        ch = 0
        m = 0
        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused processor
                continue
            # Get highest offset for all used processors
            kern_offs[ll] = max(proc_kern_max[p], kern_offs[ll])

        ksize = kernel_size[ll][0] * kernel_size[ll][1]
        qfactor = 8 // quantization[ll]
        # Determine the number of kernels that need to be programmed. Since each instance
        # spans 4 processors, kernels for all instances that have a single processor enabled
        # need to be written, i.e. round down the first. The last does not need to be rounded
        # up because hardware takes care of it.
        next_layer_map = output_processor_map[ll]

        # When using kernels smaller than 8 bit, round up to the next 8-bit boundary
        # Gaps are accounted for like any other kernel.
        kern_len[ll] = 1 + quantization[ll] * \
            (fls(next_layer_map) - ffs(next_layer_map)) // 8
        # This extends the kernels to the right on AI85 for input and output expansion
        if output_chan[ll] > tc.dev.MAX_PROC:
            kern_len[ll] = (kern_len[ll] + tc.dev.P_SHARED-1) & ~(tc.dev.P_SHARED-1)
        kern_len[ll] *= out_expand[ll] * in_expand[ll]
        if not legacy_kernels and flatten[ll]:
            kern_len[ll] *= kernel_reshaped.shape[1]
            kern_len[ll] -= (out_expand[ll] * popcount(next_layer_map) - output_chan[ll]) \
                * kernel_reshaped.shape[1] * 8 // (ksize * quantization[ll])
        if device != 84:
            # Pack kernels when using 1D convolutions, or 1x1 kernels
            kern_len[ll] = (kern_len[ll] * ksize + 8) // 9
        if ll == 0 and quad:
            kern_len[0] = (kern_len[0] + 3) // 4

        # We don't have to use dummy columns if there's space available on the left
        kern_offs[ll] = \
            max(0, kern_offs[ll] - (((ffs(next_layer_map) % tc.dev.P_SHARED)
                                     + qfactor - 1) // qfactor))
        # The kernel offset needs to start at a multiple of 4.
        kern_offs[ll] = (kern_offs[ll] + tc.dev.P_SHARED-1) & ~(tc.dev.P_SHARED-1)
        if kern_offs[ll] + kern_len[ll] > tc.dev.mask_width(p):
            eprint(f'\nKernel memory exceeded at layer {ll}; offset: {kern_offs[ll]}, '
                   f'needed: {kern_len[ll]}.'
                   '\n\nKernel map so far:')
            print_map(layers, kernel_map, print_fn=eprint_noprefix)
            sys.exit(1)

        proc_mask = 2**qfactor - 1
        # Start at the first used instance
        this_map_init = next_layer_map >> ffs(next_layer_map)
        start_col = ffs(next_layer_map) % tc.dev.P_SHARED  # First target column

        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused source processor
                continue
            col_target = start_col
            for expand in range(out_expand[ll]):
                this_map = this_map_init
                if ll == 0 and quad:
                    col = expand * (out_expand_thresh[ll] + 3) // 4
                    stop_col = col + (out_expand_thresh[ll] + 3) // 4
                else:
                    col = expand * out_expand_thresh[ll]
                    stop_col = col + out_expand_thresh[ll]
                while col < stop_col:
                    # Skip over unused bits in the target processor map
                    # (unused means 1 bit for 8-bit weights, 2 for 4-bit weights, etc.)
                    if this_map != 0:
                        while this_map & proc_mask == 0:
                            assert this_map != 0
                            col_target += 1  # Completely skip
                            this_map >>= qfactor  # and slide forward
                    this_mask = this_map & proc_mask
                    this_map >>= qfactor

                    if ll == 0 and quad:
                        src_offs = ch + (m - p // 16) * input_chan[ll]
                    else:
                        src_offs = ch + m * input_chan[ll]

                    if ll > 0 or not quad or (m % 4 == p // 16):
                        for ie in range(in_expand[ll]):
                            mask = this_mask

                            def add_kernel_data(ll, p, col_target, b):
                                col = kern_offs[ll] + col_target
                                if col >= tc.dev.mask_width(p):
                                    eprint(f'\nKernel memory exceeded in layer {ll}.'
                                           '\n\nKernel map so far:')
                                    print_map(layers, kernel_map, print_fn=eprint_noprefix)
                                    sys.exit(1)

                                if kernels_used[p][col] == 0:  # Update kernel map
                                    assert kernel_map[p][col] == _INVALID_VALUE
                                    kernel_map[p][col] = ll

                                assert kernels_used[p][col] <= 8
                                kernel_data[p][col][8 - kernels_used[p][col]] = b & 0xff
                                kernels_used[p][col] += 1

                                if kernels_used[p][col] == 9:  # Flush
                                    col_target += 1  # Write 1

                                return col_target

                            n = 0
                            if src_offs < len(kernel_reshaped):
                                if not flatten[ll]:
                                    k = np.zeros_like(kernel_reshaped[src_offs].flatten())
                                    for i in range(qfactor):
                                        if m < output_chan[ll]:
                                            # Cycle through phases
                                            idx = n + ie * qfactor
                                            koffs = src_offs + (idx % in_expand[ll]) \
                                                * in_expand_thresh[ll] \
                                                + (idx // in_expand[ll]) \
                                                * input_chan[ll]
                                            if koffs < len(kernel_reshaped):
                                                this_kern = kernel_reshaped[koffs].flatten() \
                                                    & (2**quantization[ll]-1)
                                                k |= this_kern << (i * quantization[ll])
                                            n += 1
                                        mask >>= 1
                                else:
                                    kl = (len(kernel_reshaped[src_offs]) + qfactor-1) \
                                        // qfactor
                                    k = np.zeros(kl, dtype=np.int64)
                                    if m < output_chan[ll]:
                                        # Cycle through phases
                                        idx = n + ie * qfactor
                                        koffs = src_offs + (idx % in_expand[ll]) \
                                            * in_expand_thresh[ll] \
                                            + (idx // in_expand[ll]) \
                                            * input_chan[ll]
                                        if koffs < len(kernel_reshaped):
                                            this_kern = kernel_reshaped[koffs].flatten()
                                            if len(this_kern) % qfactor != 0:
                                                this_kern = np.append(
                                                    this_kern,
                                                    np.zeros(qfactor - len(this_kern)
                                                             % qfactor, dtype=np.int64))
                                            for i in range(qfactor):
                                                k |= (this_kern[i::qfactor]
                                                      & (2**quantization[ll]-1)) \
                                                    << (i * quantization[ll])
                                        n += 1
                                    mask >>= 1
                                if debug:
                                    with np.printoptions(
                                            formatter={'int': '{0:02x}'.format}):
                                        print(f'Layer {ll} processor {p} channel '
                                              f'{ch + ie * in_expand_thresh[ll]} '
                                              f'm[{m}..{m+n-1}] '
                                              f'of {output_chan[ll]}: {k}')
                                if flatten[ll]:
                                    for _, e in enumerate(k):
                                        col_target = add_kernel_data(ll, p, col_target, e)
                                else:
                                    for i in range(ksize):
                                        col_target = add_kernel_data(ll, p, col_target,
                                                                     k[ksize - i - 1])
                            else:  # When expanding, need to pad with zero kernels if needed
                                for _ in range(ksize // qfactor):
                                    col_target = add_kernel_data(ll, p, col_target, 0)

                        # Consume kernels
                        if not flatten[ll]:
                            col += qfactor
                            m += qfactor
                        else:
                            col += 1
                            m += 1
                    else:
                        m += qfactor

            if kern_offs[ll] + col_target < tc.dev.mask_width(p) \
               and kernels_used[p][kern_offs[ll] + col_target] > 0:  # Partials
                col_target += 1
            while col_target - start_col < kern_len[ll]:
                col_target = add_kernel_data(ll, p, col_target, 0)
            if flatten[ll]:
                kern_len[ll] = col_target
            else:
                assert kern_len[ll] == col_target - start_col
            proc_kern_max[p] = kern_offs[ll] + kern_len[ll]
            ch += 1
            m = 0

    if verbose:
        print('\nKernel map:')
        print_map(layers, kernel_map)

    if verify or not (embedded_code or mexpress):
        if verify:
            apb.output('int verify_kernels(void)\n{\n')
        # Write in-line
        for p in range(tc.dev.MAX_PROC):
            for col in range(0, tc.dev.mask_width(p)):
                ll = kernel_map[p][col]
                if ll != _INVALID_VALUE:
                    k = kernel_data[p][col]
                    apb.write_kern(ll, p, col, k, verify_only=verify, calcx4=calcx4)
        if verify:
            apb.output('  return 1;\n}\n\n')

    if embedded_code or mexpress:
        # Write kernels, combining layers and processors where possible to reduce the number
        # of constants and calls to memcpy.
        apb.output('// Kernels:\n')

        if not mexpress:
            for p in range(tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        k = kernel_data[p][col]
                        offs = _WORDS_PER_KERNEL * col
                        kernel_values[p][offs] = k[0] & 0xff
                        kernel_values[p][offs + 1] = (k[1] & 0xff) << 24 \
                            | (k[2] & 0xff) << 16 | (k[3] & 0xff) << 8 | k[4] & 0xff
                        kernel_values[p][offs + 2] = (k[5] & 0xff) << 24 \
                            | (k[6] & 0xff) << 16 | (k[7] & 0xff) << 8 | k[8] & 0xff

            # First, define the weights (will move to header file)
            # Combining memcopy() requires stacked memories
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0] \
                * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    start = p
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC
                           and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                    # Combine multiple channels into one define
                    k = None
                    for i in range(start, p + 1):
                        if k is None:
                            k = kernel_values[i][min_col[i] * _WORDS_PER_KERNEL:
                                                 (max_col[i] + 1) * _WORDS_PER_KERNEL]
                        else:
                            k = np.concatenate(
                                (k, kernel_values[i][min_col[i] * _WORDS_PER_KERNEL:
                                                     (max_col[i] + 1) * _WORDS_PER_KERNEL]))

                    apb.output_define(k, f'KERNELS_{start}', '0x%08x', 8)
                p += 1

            # Second, initialize static const variables as source for memcpy
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC
                           and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(f'static const uint32_t kernels_{start}[] = '
                               f'KERNELS_{start};\n')
                p += 1
            apb.output('\n')

            # Generate code to load the weights using memcpy
            apb.output('void memcpy_96to128(uint32_t *dst, const uint32_t *src, '
                       'int n)\n{\n')
            apb.output('  while (n-- > 0) {\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = 0; // Execute write\n'
                       '  }\n}\n\n')
        else:
            # When using the express loader, gather all consecutive kernels for each
            # processor and pack them.
            zero_kernel = np.array([0] * 9, dtype=np.uint8)
            k = None

            for p in range(tc.dev.MAX_PROC):
                # Find min/max from kernel_map
                max_col = -1
                min_col = tc.dev.mask_width(p) if not legacy_kernels else 0
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col = col
                        min_col = min(min_col, col)
                if max_col >= 0:
                    for col in range(min_col, max_col + 1):
                        ll = kernel_map[p][col]
                        if ll != _INVALID_VALUE:
                            new_k = (kernel_data[p][col] & 0xff).astype(np.uint8)
                        else:
                            new_k = zero_kernel
                        if k is None:
                            k = new_k
                        else:
                            k = np.concatenate((k, new_k))

                    # Round up to multiple of 4
                    if len(k) % 4 != 0:
                        k = np.concatenate((k, zero_kernel[:4 - len(k) % 4]))
                    # '>u4' swaps endianness to what the hardware needs, `view` packs
                    # into 32-bit
                    if not blocklevel:
                        apb.output_define(k.view(dtype='>u4'), f'KERNELS_{p}',
                                          '0x%08x', 8)
                    else:
                        addr = tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                            + tc.dev.C_MRAM_BASE \
                            + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                        apb.write(addr + min_col * 4 | 0x01, 0x01)
                        kb = k.view(dtype='>u4')
                        for _, e in enumerate(kb):
                            apb.write(addr, e)
                            addr += 4
                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(f'static const uint32_t kernels_{p}[] = KERNELS_{p};\n')
                    k = None
            apb.output('\n')

        if not blocklevel:
            apb.output('void load_kernels(void)\n{\n')
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0] \
                * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    addr = apb.apb_base + tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                        + tc.dev.C_MRAM_BASE \
                        + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC
                           and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    assert addr % 16 == 0
                    if not mexpress:
                        apb.output('  memcpy_96to128((uint32_t *)'
                                   f' 0x{addr + min_col[start] * 16:08x},'
                                   f' kernels_{start}, {span});\n')
                    else:
                        apb.output('  *((volatile uint8_t *)'
                                   f' 0x{addr + min_col[start] * 4 | 0x01:08x}) = 0x01; '
                                   '// Set address\n')
                        apb.output(f'  memcpy32((uint32_t *) 0x{addr:08x}, '
                                   f'kernels_{start}, {(span * 9 + 3) // 4});\n')
                p += 1
            apb.output('}\n\n')

    return kern_offs, kern_len
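

# An illustrative sketch (not part of the generator) of the 9-byte kernel packing used
# in the non-mexpress path above: each kernel occupies _WORDS_PER_KERNEL = 4 words.
# Word 0 holds the first byte, words 1 and 2 hold four bytes each (big-endian within
# the word), and word 3 stays 0 -- the generated memcpy_96to128() writes that fourth
# word to trigger the hardware's "execute write".
def _example_pack_kernel(k):
    """Hypothetical packing of one 9-byte kernel into the 4-word MRAM format."""
    assert len(k) == 9
    word0 = k[0] & 0xff
    word1 = (k[1] & 0xff) << 24 | (k[2] & 0xff) << 16 | (k[3] & 0xff) << 8 | k[4] & 0xff
    word2 = (k[5] & 0xff) << 24 | (k[6] & 0xff) << 16 | (k[7] & 0xff) << 8 | k[8] & 0xff
    return [word0, word1, word2, 0]  # trailing 0 fills the "execute write" slot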


def load(
        checkpoint_file,
        unused_arch,
        fc_layer,
        quantization,
        bias_quantization,
        output_shift,
        kernel_size,  # this information is available in the ONNX model
        operator,
        verbose=False,
        no_bias=None,
):
    """
    Load weights and biases from `checkpoint_file`. If `arch` is not None and does not match
    the architecture in the checkpoint file, abort with an error message.
    If `fc_layer` is `True`, configure a single fully connected classification layer for
    software rather than hardware.
    `quantization` is a list of expected bit widths for the layer weights (always 8 for AI84).
    This value is checked against the weight inputs.
    `bias_quantization` is a list of the expected bit widths for the layer biases
    (always 8 for AI84/AI85).
    In addition to returning weights and biases, this function configures the network output
    channels and the number of layers.
    When `verbose` is set, display the shapes of the weights.
    """
    model = onnx.load(checkpoint_file)
    print(f'Reading {checkpoint_file} to configure network weights...')

    layers = 0
    num_conv_layers = len(quantization)
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    error_exit = False
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []
    seq = 0

    kernel_size_onnx = []

    initializers = {t.name for t in model.graph.initializer}
    for _, node in enumerate(model.graph.node):
        if node.op_type == 'Conv' or node.op_type == 'Gemm':
            _inputs, _outputs = get_inouts(node)
            for _input in _inputs:
                w = process_channels(model, _input, initializers)
                if w is not None:
                    if node.op_type == 'Gemm':  # general matrix multiplication (FC layer)
                        kernel_shape = [1, 1]
                        kernel_size_onnx.append(kernel_shape)
                        if layers >= num_conv_layers:
                            continue

                        if fc_layer:
                            if _input == _inputs[1]:  # weight
                                assert w.min() >= -128 and w.max() <= 127
                                fc_weights.append(w)

                            if len(_inputs) == 3:  # have optional bias input
                                if _input == _inputs[2]:  # bias
                                    assert w.min() >= -128 and w.max() <= 127
                                    fc_bias.append(w)
                            elif _input == _inputs[1]:  # add bias 'None'
                                fc_bias.append(None)  # during weight input processing

                    if node.op_type == 'Conv':  # (Conv layer)
                        for a in node.attribute:
                            if a.name == 'kernel_shape':
                                kernel_size_onnx.append(a.ints)

                    if len(w.shape) > 1:  # not a bias
                        quant.append(quantization[seq])

                        w_min, w_max = w.min(), w.max()
                        # Determine quantization or make sure that what was given fits
                        if quantization[seq] is not None:
                            assert w_min >= -(2**(quantization[seq]-1)), w_min
                            assert w_max < 2**(quantization[seq]-1), w_max
                        else:
                            if w_max > 0:
                                w_max_m = int(w_max)
                            else:
                                w_max_m = int(abs(w_max)) - 1
                            if w_min > 0:
                                w_min_m = int(w_min)
                            else:
                                w_min_m = int(abs(w_min)) - 1
                            quantization[seq] = 1 << (
                                fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                            assert quantization[seq] <= 8

                        weight_min.append(w_min)
                        weight_max.append(w_max)

                        # Not overriding output_shift?
                        if output_shift[seq] is None:
                            output_shift[seq] = 0
                        # Add based on quantization
                        output_shift[seq] += 8 - quantization[seq]

                        # TODO: Double check if we need to check conv2d if opn is known
                        # to be opn.CONVTRANSPOSE2D. We should be able to get this
                        # from the op_type Conv plus shape?
                        if operator[seq] == opn.CONVTRANSPOSE2D:
                            # For ConvTranspose2d, flip the weights as follows:
                            w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

                        input_channels.append(w.shape[1])  # Input channels
                        output_channels.append(w.shape[0])  # Output channels

                        if len(w.shape) == 2:  # MLP
                            if kernel_size_onnx[seq][0] != 1 \
                               or kernel_size_onnx[seq][1] != 1:
                                eprint(f'The `kernel_size` for the MLP layer {seq} should '
                                       f'be set to 1x1 instead of '
                                       f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                                error_exit = True
                        elif len(w.shape) == 3:  # 1D
                            if kernel_size_onnx[seq][0] != w.shape[2] \
                               or kernel_size_onnx[seq][1] != 1:
                                eprint(f'The `kernel_size` for the 1D layer {seq} should '
                                       f'be set to {w.shape[2]}x1 instead of '
                                       f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                                error_exit = True
                        elif len(w.shape) == 4:  # 2D
                            if kernel_size_onnx[seq][0] != w.shape[2] \
                               or kernel_size_onnx[seq][1] != w.shape[3]:
                                eprint(f'The `kernel_size` for the 2D layer {seq} should '
                                       f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                                       f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                                error_exit = True

                        w_count = np.prod(w.shape)
                        param_count += w_count
                        w_size = (w_count * quantization[seq] + 7) // 8
                        weight_size.append(w_size)
                        param_size += w_size

                        if len(w.shape) == 2:  # linear - add dummy 'channel'
                            w = np.expand_dims(w, axis=0)
                        else:  # conv1d, conv2d, ... - combine input and output channels
                            w = np.reshape(w, (-1, ) + w.shape[2:])

                        weights.append(w)
                        weight_keys.append(_input)

                    if len(_inputs) < 3 or \
                       (_input == _inputs[2] and seq in no_bias):  # no bias input
                        bias.append(None)
                        bias_min.append(0)
                        bias_max.append(0)
                        bias_keys.append('N/A')
                        bias_quant.append(0)
                        bias_size.append(0)
                    elif _input == _inputs[2]:  # bias input
                        w = w // tornadocnn.dev.BIAS_DIV
                        w_min, w_max = w.min(), w.max()
                        assert w_min >= -(2**(bias_quantization[seq]-1))
                        assert w_max < 2**(bias_quantization[seq]-1)
                        bias_min.append(w_min)
                        bias_max.append(w_max)

                        bias.append(w)
                        bias_keys.append(_input)
                        bias_quant.append(bias_quantization[seq])
                        w_count = np.prod(w.shape)
                        param_count += w_count
                        w_size = (w_count * 8 + (bias_quantization[seq]-1)) // \
                            bias_quantization[seq]
                        bias_size.append(w_size)
                        param_size += w_size

            seq += 1
            layers += 1

        # TODO: Things to add
        # if attribute.name == 'pads':
        # if attribute.name == 'strides':

    if verbose:
        print('Layer  InCh OutCh  Weights         Quant  Min  Max   Size '
              'Key                                 Bias       Quant  Min  Max Size Key')
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                print(f'{ll:4}: '
                      f'{input_channels[ll]:5} {output_channels[ll]:5}  '
                      f'{weight_shape:15} '
                      f'{quant[ll]:5} {weight_min[ll]:4} {weight_max[ll]:3} '
                      f'{weight_size[ll]:6} '
                      f'{weight_keys[ll]:35} '
                      f'{bias_shape:10} '
                      f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} '
                      f'{bias_size[ll]:4} '
                      f'{bias_keys[ll]:25}')
        print(f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes')

    if error_exit:
        sys.exit(1)

    if verbose:
        with np.printoptions(threshold=np.inf, linewidth=80):
            print("\nSUMMARY\n=======")
            print(layers, "layers\n")
            print("weights:")
            print(weights)
            print("bias:")
            print(bias)
            print("fc_weights:")
            print(fc_weights)
            print("fc_bias:")
            print(fc_bias)
            print("input_channels:")
            print(input_channels)
            print("output_channels:")
            print(output_channels)
            print("")

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
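

# A minimal sketch (the model path is a placeholder, and the repo's get_inouts() /
# process_channels() helpers do the real work above) of pulling Conv/Gemm weights and
# biases out of an ONNX graph. It relies only on the standard onnx API and the ONNX
# convention that input 0 is the activation, input 1 the weight initializer, and the
# optional input 2 the bias.
def _example_onnx_weights(path='model.onnx'):
    """Hypothetical extraction of per-layer weight/bias arrays from an ONNX file."""
    from onnx import numpy_helper

    model = onnx.load(path)
    # Map initializer names to numpy arrays
    tensors = {t.name: numpy_helper.to_array(t) for t in model.graph.initializer}
    per_layer = []
    for node in model.graph.node:
        if node.op_type in ('Conv', 'Gemm'):
            weight = tensors.get(node.input[1])
            bias = tensors.get(node.input[2]) if len(node.input) > 2 else None
            per_layer.append((node.name, weight, bias))
    return per_layer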