示例#1
0
def load(
    checkpoint_file,
    arch,
    fc_layer,
    quantization,
    bias_quantization,
    output_shift,
    kernel_size,
    operator,
    verbose=False,
    no_bias=None,
    conv_groups=None,
):
    """
    Load weights and biases from `checkpoint_file`.

    The checkpoint is read with `torch.load()` and must contain the entries `state_dict`
    and `arch`. If `arch` is not None and does not match the architecture in the checkpoint
    file (case-insensitive), abort with an error message. If `fc_layer` is `True`, the
    weights of the module named `fc` are returned separately as a single fully connected
    classification layer for software rather than hardware.

    `quantization` is a list of expected bit widths for the layer weights (always 8 for AI84).
    Its length limits the number of layers that are read. Values are checked against the
    weight inputs; entries that are `None` are derived from the weight range and written back
    into the list.
    `bias_quantization` is a list of the expected bit widths for the layer biases (always
    8 for AI84/AI85).
    `output_shift` is a list of per-layer shift overrides. Entries that are `None` are taken
    from the checkpoint's `<layer>.output_shift` entry (or 0 when absent). The implicit shift
    `8 - quantization` is then added. The list is modified in place.
    `kernel_size` is a list of per-layer `(height, width)` pairs that is validated against
    the weight shapes; mismatches are reported and cause an exit after all layers were read.
    `operator` is a list of per-layer operators; entries equal to `opn.NONE` are skipped,
    and `opn.CONVTRANSPOSE2D` weights are flipped and have their first two axes swapped.
    `no_bias` is an optional collection of layer indices whose bias is ignored.
    `conv_groups` is an optional list of per-layer channel multipliers; `None` is treated as
    a multiplier of 1 for every layer.

    In addition to returning weights and biases, this function determines the network input
    and output channels and the number of layers.
    When `verbose` is set, display the shapes of the weights.

    Returns the tuple `(layers, weights, bias, output_shift, fc_weights, fc_bias,
    input_channels, output_channels)`.

    Raises `RuntimeError` when `state_dict` or `arch` is missing from the checkpoint and
    `AssertionError` when values do not fit the expected bit widths. Calls `sys.exit(1)` on
    architecture mismatch, on more than one software fully connected layer, and on
    `kernel_size` mismatches.
    """
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []

    checkpoint = torch.load(checkpoint_file, map_location='cpu')
    print(f'Reading {checkpoint_file} to configure network weights...')

    if 'state_dict' not in checkpoint or 'arch' not in checkpoint:
        raise RuntimeError("\nNo `state_dict` or `arch` in checkpoint file.")

    if arch and checkpoint['arch'].lower() != arch.lower():
        eprint(
            f"Network architecture of configuration file ({arch}) does not match "
            f"network architecture of checkpoint file ({checkpoint['arch']}).")
        sys.exit(1)

    checkpoint_state = checkpoint['state_dict']
    layers = 0
    num_conv_layers = len(quantization)
    have_fc_layer = False
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    error_exit = False
    seq = 0

    for k in checkpoint_state:
        # Skip over non-weight layers
        while seq < len(operator) and operator[seq] == opn.NONE:
            seq += 1

        # Only '<module path>.weight' entries start a layer. Keys without a '.' (for example
        # parameters registered directly on the top-level model) cannot be split into a
        # module and a parameter name, so they are ignored instead of raising ValueError.
        operation, dot, parameter = k.rpartition('.')
        if not dot or parameter != 'weight':
            continue

        module, op = k.split(sep='.', maxsplit=1)
        op = op.rsplit(sep='.', maxsplit=1)[0]
        if module != 'fc' or not fc_layer:
            if layers >= num_conv_layers or seq >= num_conv_layers:
                continue

            w = checkpoint_state[k].numpy().astype(np.int64)
            w_min, w_max = w.min(), w.max()

            # Determine quantization or make sure that what was given fits
            if quantization[seq] is not None:
                assert w_min >= -(2**(quantization[seq] - 1))
                assert w_max < 2**(quantization[seq] - 1)
            else:
                if w_max > 0:
                    w_max_m = int(w_max)
                else:
                    w_max_m = int(abs(w_max)) - 1
                if w_min > 0:
                    w_min_m = int(w_min)
                else:
                    w_min_m = int(abs(w_min)) - 1
                quantization[seq] = 1 << (
                    fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                assert quantization[seq] <= 8
            quant.append(quantization[seq])

            weight_min.append(w_min)
            weight_max.append(w_max)

            if op == 'conv2d' and operator[seq] == opn.CONVTRANSPOSE2D:
                # For ConvTranspose2d, flip the weights as follows:
                w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

            # The group multiplier applies to the input channels, except for
            # ConvTranspose2d where it applies to the output channels. Without
            # `conv_groups`, no multiplication takes place.
            groups = conv_groups[seq] if conv_groups is not None else 1
            if operator[seq] == opn.CONVTRANSPOSE2D:
                in_mult, out_mult = 1, groups
            else:
                in_mult, out_mult = groups, 1
            input_channels.append(w.shape[1] * in_mult)  # Input channels
            output_channels.append(w.shape[0] * out_mult)  # Output channels

            if len(w.shape) == 2:  # MLP
                if kernel_size[seq][0] != 1 or kernel_size[seq][1] != 1:
                    eprint(
                        f'The `kernel_size` for the MLP layer {seq} should '
                        f'be set to 1x1 instead of '
                        f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                    error_exit = True
            elif len(w.shape) == 3:  # 1D
                if kernel_size[seq][0] != w.shape[2] or kernel_size[seq][
                        1] != 1:
                    eprint(
                        f'The `kernel_size` for the 1D layer {seq} should '
                        f'be set to {w.shape[2]}x1 instead of '
                        f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                    error_exit = True
            elif len(w.shape) == 4:  # 2D
                if kernel_size[seq][0] != w.shape[2] \
                   or kernel_size[seq][1] != w.shape[3]:
                    eprint(
                        f'The `kernel_size` for the 2D layer {seq} should '
                        f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                        f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                    error_exit = True

            # Size in bytes, rounded up to a whole byte
            w_count = np.prod(w.shape)
            param_count += w_count
            w_size = (w_count * quantization[seq] + 7) // 8
            weight_size.append(w_size)
            param_size += w_size

            if len(w.shape) == 2:  # linear - add dummy 'channel'
                w = np.expand_dims(w, axis=0)
            else:  # conv1d, conv2d, ... - combine input and output channels
                w = np.reshape(w, (-1, ) + w.shape[2:])

            weights.append(w)
            weight_keys.append(k)

            # Is there a bias for this layer?
            bias_name = operation + '.bias'

            if bias_name in checkpoint_state and seq not in no_bias:
                w = checkpoint_state[bias_name].numpy(). \
                    astype(np.int64) // tornadocnn.dev.BIAS_DIV

                w_min, w_max = w.min(), w.max()
                assert w_min >= -(2**(bias_quantization[seq] - 1))
                assert w_max < 2**(bias_quantization[seq] - 1)

                bias_min.append(w_min)
                bias_max.append(w_max)

                bias.append(w)
                bias_keys.append(bias_name)
                bias_quant.append(bias_quantization[seq])
                w_count = np.prod(w.shape)
                param_count += w_count
                # NOTE(review): this equals the byte count only when the bias
                # quantization is 8 - confirm the intent for other bit widths.
                w_size = (
                    w_count * 8 +
                    (bias_quantization[seq] - 1)) // bias_quantization[seq]
                bias_size.append(w_size)
                param_size += w_size
            else:
                bias.append(None)
                bias_min.append(0)
                bias_max.append(0)
                bias_keys.append('N/A')
                bias_quant.append(0)
                bias_size.append(0)

            # Not overriding output_shift?
            if output_shift[seq] is None:
                output_shift_name = operation.rsplit(
                    sep='.', maxsplit=1)[0] + '.output_shift'
                # Is there an output_shift for this layer?
                if output_shift_name in checkpoint_state:
                    w = checkpoint_state[output_shift_name].numpy().astype(
                        np.int64)

                    assert len(w) == 1
                    output_shift[seq] = w[0]
                else:
                    output_shift[seq] = 0

            # Add implicit shift based on quantization
            output_shift[seq] += 8 - quantization[seq]

            layers += 1
            seq += 1
        elif have_fc_layer:
            eprint(
                'The network cannot have more than one fully connected software layer, '
                'and it must be the output layer.')
            sys.exit(1)
        else:
            # Module `fc` with `fc_layer` requested: software fully connected layer
            w = checkpoint_state[k].numpy().astype(np.int64)
            assert w.min() >= -128 and w.max() <= 127
            fc_weights.append(w)
            # Is there a bias for this layer?
            bias_name = operation + '.bias'
            if bias_name in checkpoint_state:
                # Do not divide bias for FC
                w = checkpoint_state[bias_name].numpy().astype(np.int64)
                assert w.min() >= -128 and w.max() <= 127
                fc_bias.append(w)
            else:
                fc_bias.append(None)
            have_fc_layer = True

    if verbose:
        # The epoch is informational only; do not fail when the checkpoint lacks it
        epoch = checkpoint.get('epoch', 'N/A')
        print(
            f'Checkpoint for epoch {epoch}, model {checkpoint["arch"]} - '
            'weight and bias data:')
        print(
            'Layer  InCh OutCh  Weights         Quant Shift  Min Max   Size '
            'Key                                 Bias       Quant  Min Max Size Key'
        )
        # NOTE(review): the per-layer lists are indexed by layer count, while
        # `output_shift` is filled by operator sequence index - these differ when
        # `operator` contains skipped entries; confirm the displayed shift.
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                if output_shift[ll] is not None:
                    output_shift_shape = output_shift[ll]
                else:
                    output_shift_shape = 'N/A'
                print(
                    f'{ll:4}: '
                    f'{input_channels[ll]:5} {output_channels[ll]:5}  '
                    f'{weight_shape:15} '
                    f'{quant[ll]:5} {output_shift_shape:5} '
                    f'{weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
                    f'{weight_keys[ll]:35} '
                    f'{bias_shape:10} '
                    f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
                    f'{bias_keys[ll]:25}')
        print(
            f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes'
        )

    # Exit only now so that every `kernel_size` mismatch has been reported
    if error_exit:
        sys.exit(1)

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
示例#2
0
def load(
    dataset,
    quantization,
    bias_quantization,  # pylint: disable=unused-argument
    output_shift,
    cfg_layers,
    cfg_weights=None,
    cfg_bias=None,
    no_bias=None,
):
    """
    Return sample weights read from `.npy` files in the `tests` directory.

    Weights are read from `tests/weights_<dataset>.npy` (with `dataset` converted to lower
    case), or from `tests/<cfg_weights>.npy` when `cfg_weights` is given. The file may
    contain several arrays saved back to back (one per layer), or a single array whose
    first axis enumerates the layers. At most `cfg_layers` layers are used.
    When `cfg_bias` is given, biases are read from `tests/bias_<cfg_bias>.npy`, one array
    per layer, skipping the layer indices listed in `no_bias`.

    `quantization` entries that are `None` default to 8; weights that need more bits than
    requested are shifted right in place. `output_shift` entries that are `None` default
    to 0, and `8 - quantization` is added to each entry. Both lists are modified in place.
    `bias_quantization` is unused.

    Returns the tuple `(layers, weights, bias, output_shift, fc_weights, fc_bias,
    input_channels, output_channels)`; `fc_weights` and `fc_bias` are always empty.

    Raises `ValueError` when the weights need more than 8 bits.
    """
    no_bias = no_bias or []
    weights = []
    fc_weights = []
    fc_bias = []
    output_channels = []
    input_channels = []

    dataset = dataset.lower()

    # Load weights saved using:
    #    w = np.random.randint(-128, 127, (2, 64, 64, 3, 3), dtype=np.int8)
    #    np.save(f'tests/{dataset}', w)

    w = []
    layers = 0
    if cfg_weights is None:
        fname = os.path.join('tests', f'weights_{dataset}.npy')
    else:
        fname = os.path.join('tests', f'{cfg_weights}.npy')
    with open(fname, mode='rb') as file:
        print(f'Reading weights from {fname}...')
        try:
            while True:
                w.append(np.load(file))
                layers += 1
        except (ValueError, EOFError):
            # Reading past the last array ends the loop. Depending on the NumPy version,
            # the end of the file is reported as ValueError or as EOFError.
            pass

    if layers == 1:  # If the weights file wasn't a list
        w = w[0]
        layers = w.shape[0]

    layers = min(layers, cfg_layers)

    bias = [None] * layers

    if cfg_bias is not None:
        ll = 0
        fname = os.path.join('tests', f'bias_{cfg_bias}.npy')
        with open(fname, mode='rb') as file:
            print(f'Reading bias from {fname}...')
            try:
                while ll < layers:
                    if ll not in no_bias:
                        bias[ll] = np.load(file)
                    ll += 1
            except (ValueError, EOFError):
                # Fewer bias arrays than layers: the remaining layers keep `None`
                pass

    for ll in range(layers):
        # Set to default?
        if quantization[ll] is None:
            quantization[ll] = 8

        # Re-quantize if needed (these are random sample weights, so no need to round etc.)
        max_w = int(w[ll].max())
        if max_w < 0:
            max_w += 1
        min_w = int(w[ll].min())
        if min_w < 0:
            min_w += 1
        current_quant = max(fls(abs(min_w)), fls(abs(max_w))) + 2
        if current_quant > 8:  # Either way, more than 8 bits is an error
            raise ValueError(
                'ERROR: Weight file includes values larger than 8 bit!')
        if current_quant > quantization[ll]:
            w[ll] >>= current_quant - quantization[ll]

        # Specified output_shift?
        if output_shift[ll] is None:
            output_shift[ll] = 0
        # Add based on quantization
        output_shift[ll] += 8 - quantization[ll]

        output_channels.append(w[ll].shape[0])  # Output channels
        input_channels.append(w[ll].shape[1])  # Input channels
        # Combine input and output channels, keeping the kernel dimensions
        if len(w[ll].shape) == 4:
            weights.append(w[ll].reshape(-1, w[ll].shape[-2], w[ll].shape[-1]))
        else:
            weights.append(w[ll].reshape(-1, w[ll].shape[-1]))

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
示例#3
0
def load(  # pylint: disable=too-many-branches,too-many-statements
    verbose,
    embedded_code,
    device,
    apb,
    start_layer,
    layers,
    operator,
    kernel,
    kernel_size,
    quantization,
    processor_map,
    output_processor_map,
    input_chan,
    output_chan,
    out_expand,
    out_expand_thresh,
    in_expand,
    in_expand_thresh,
    flatten=False,
    mexpress=False,
    verify=False,
    riscv_flash=False,
    quad=False,
    debug=False,
    blocklevel=False,
    legacy_kernels=False,
    calcx4=False,
):
    """
    Stack `kernel` values and write them to C code (for `embedded_code` if `True` or
    RTL simulation). The output is written to the `apb` object.
    Input is configured with `kernel_size`, `quantization`, `layers`, `processor_map`,
    `output_processor_map`, `input_chan`, `output_chan`, `out_expand` and `out_expand_thresh`.
    When `mexpress` is `True`, the function uses the memcpy()-friendly hardware functionality to
    reduce the number of transfers. When `verify` is also true (mexpress mode only), kernels are
    read back and compared.
    This function returns the kernel offsets and the kernel lengths for all layers.
    """
    # Kernels: Stack kernels; write only the kernels needed
    proc_kern_max = [0] * tc.dev.MAX_PROC
    kern_offs = [0] * layers
    kern_len = [0] * layers
    kernel_map = np.full((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE),
                         _INVALID_VALUE,
                         dtype=np.int64)
    kernels_used = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE),
                            dtype=np.int64)
    kernel_data = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE, 9),
                           dtype=np.int8)
    # There are four 32-bit words per 9-byte kernel.
    # The value map is initialized with zeros so we can later ignore unused entries and use
    # memcpy() on initialized and uninitialized data.
    kernel_values = np.zeros(
        (tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE * _WORDS_PER_KERNEL),
        dtype=np.int64)
    if debug:
        print('\nLoading Kernels...')

    if calcx4 and not tc.dev.SUPPORT_CALCX4:
        eprint('--calcx4 is not supported on this device.')
        sys.exit(1)
    assert not (
        (embedded_code or mexpress) and calcx4)  # FIXME Add support later

    for ll in range(start_layer, layers):
        if operator[ll] not in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D]:
            kern_len[ll] = 0
            kern_offs[ll] = 0
            continue

        if flatten[ll]:
            kernel_reshaped = kernel[ll].reshape(
                output_chan[ll] * input_chan[ll],
                -1,
                kernel_size[ll][0],
                kernel_size[ll][1],
            )
        else:
            kernel_reshaped = kernel[ll]

        first_proc = ffs(processor_map[ll])
        last_proc = fls(processor_map[ll])
        ch = 0
        m = 0
        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused processor
                continue
            # Get highest offset for all used processors
            kern_offs[ll] = max(proc_kern_max[p], kern_offs[ll])

        ksize = kernel_size[ll][0] * kernel_size[ll][1]
        qfactor = 8 // quantization[ll]
        # Determine the number of kernels that need to be programmed. Since each instance
        # spans 4 processors, kernels for all instances that have a single processor enabled
        # need to be written, i.e. round down the first. The last does not need to be rounded
        # up because hardware takes care of it.
        next_layer_map = output_processor_map[ll]
        # When using kernels smaller than 8 bit, round up to the next 8-bit boundary
        # Gaps are accounted for like any other kernel.
        kern_len[ll] = 1 + quantization[ll] * \
            (fls(next_layer_map) - ffs(next_layer_map)) // 8
        # This extends the kernels to the right on AI85 for input and output expansion
        if output_chan[ll] > tc.dev.MAX_PROC:
            kern_len[ll] = (kern_len[ll] + tc.dev.P_SHARED -
                            1) & ~(tc.dev.P_SHARED - 1)
        kern_len[ll] *= out_expand[ll] * in_expand[ll]
        if not legacy_kernels and flatten[ll]:
            kern_len[ll] *= kernel_reshaped.shape[1]
            kern_len[ll] -= (out_expand[ll] * popcount(next_layer_map) - output_chan[ll]) \
                * kernel_reshaped.shape[1] * 8 // (ksize * quantization[ll])
        if device != 84:
            # Pack kernels when using 1D convolutions, or 1x1 kernels
            kern_len[ll] = (kern_len[ll] * ksize + 8) // 9
        if ll == 0 and quad:
            kern_len[0] = (kern_len[0] + 3) // 4

        # We don't have to use dummy columns if there's space available on the left
        kern_offs[ll] = \
            max(0, kern_offs[ll] - (((ffs(next_layer_map) % tc.dev.P_SHARED)
                                     + qfactor - 1) // qfactor))
        # The kernel offset needs to start at a multiple of 4.
        kern_offs[ll] = (kern_offs[ll] + tc.dev.P_SHARED -
                         1) & ~(tc.dev.P_SHARED - 1)
        if kern_offs[ll] + kern_len[ll] > tc.dev.mask_width(p):
            eprint(
                f'\nKernel memory exceeded at layer {ll}; offset: {kern_offs[ll]}, '
                f'needed: {kern_len[ll]}.'
                '\n\nKernel map so far:')
            print_map(layers, kernel_map, print_fn=eprint_noprefix)
            sys.exit(1)

        proc_mask = 2**qfactor - 1
        # Start at the first used instance
        this_map_init = next_layer_map >> ffs(next_layer_map)
        start_col = ffs(
            next_layer_map) % tc.dev.P_SHARED  # First target column

        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused source processor
                continue
            col_target = start_col
            for expand in range(out_expand[ll]):
                this_map = this_map_init
                if ll == 0 and quad:
                    col = expand * (out_expand_thresh[ll] + 3) // 4
                    stop_col = col + (out_expand_thresh[ll] + 3) // 4
                else:
                    col = expand * out_expand_thresh[ll]
                    stop_col = col + out_expand_thresh[ll]
                while col < stop_col:
                    # Skip over unused bits in the target processor map
                    # (unused means 1 bit for 8-bit weights, 2 for 4-bit weights, etc.)
                    if this_map != 0:
                        while this_map & proc_mask == 0:
                            assert this_map != 0
                            col_target += 1  # Completely skip
                            this_map >>= qfactor  # and slide forward
                    this_mask = this_map & proc_mask
                    this_map >>= qfactor

                    if ll == 0 and quad:
                        src_offs = ch + (m - p // 16) * input_chan[ll]
                    else:
                        src_offs = ch + m * input_chan[ll]
                    if ll > 0 or not quad or (m % 4 == p // 16):
                        for ie in range(in_expand[ll]):
                            mask = this_mask

                            def add_kernel_data(ll, p, col_target, b):
                                col = kern_offs[ll] + col_target
                                if col >= tc.dev.mask_width(p):
                                    eprint(
                                        f'\nKernel memory exceeded in layer {ll}.'
                                        '\n\nKernel map so far:')
                                    print_map(layers,
                                              kernel_map,
                                              print_fn=eprint_noprefix)
                                    sys.exit(1)

                                if kernels_used[p][
                                        col] == 0:  # Update kernel map
                                    assert kernel_map[p][col] == _INVALID_VALUE
                                    kernel_map[p][col] = ll

                                assert kernels_used[p][col] <= 8
                                kernel_data[p][col][
                                    8 - kernels_used[p][col]] = b & 0xff
                                kernels_used[p][col] += 1

                                if kernels_used[p][col] == 9:  # Flush
                                    col_target += 1  # Write 1

                                return col_target

                            n = 0
                            if src_offs < len(kernel_reshaped):
                                if not flatten[ll]:
                                    k = np.zeros_like(
                                        kernel_reshaped[src_offs].flatten())
                                    for i in range(qfactor):
                                        if m < output_chan[ll]:
                                            # Cycle through phases
                                            idx = n + ie * qfactor
                                            koffs = src_offs + (idx % in_expand[ll]) \
                                                * in_expand_thresh[ll] \
                                                + (idx // in_expand[ll]) \
                                                * input_chan[ll]
                                            if koffs < len(kernel_reshaped):
                                                this_kern = kernel_reshaped[koffs].flatten() \
                                                    & (2**quantization[ll]-1)
                                                k |= this_kern << (
                                                    i * quantization[ll])
                                            n += 1
                                        mask >>= 1
                                else:
                                    kl = (len(kernel_reshaped[src_offs]) +
                                          qfactor - 1) // qfactor
                                    k = np.zeros(kl, dtype=np.int64)
                                    if m < output_chan[ll]:
                                        # Cycle through phases
                                        idx = n + ie * qfactor
                                        koffs = src_offs + (idx % in_expand[ll]) \
                                            * in_expand_thresh[ll] \
                                            + (idx // in_expand[ll]) \
                                            * input_chan[ll]
                                        if koffs < len(kernel_reshaped):
                                            this_kern = kernel_reshaped[
                                                koffs].flatten()
                                            if len(this_kern) % qfactor != 0:
                                                this_kern = np.append(
                                                    this_kern,
                                                    np.zeros(qfactor -
                                                             len(this_kern) %
                                                             qfactor,
                                                             dtype=np.int64))
                                            for i in range(qfactor):
                                                k |= ((this_kern[i::qfactor]
                                                       & (2**quantization[ll]-1))) \
                                                    << (i * quantization[ll])
                                        n += 1
                                        mask >>= 1
                                if debug:
                                    with np.printoptions(
                                            formatter={
                                                'int': '{0:02x}'.format
                                            }):
                                        print(
                                            f'Layer {ll} processor {p} channel '
                                            f'{ch + ie * in_expand_thresh[ll]} m[{m}..{m+n-1}] '
                                            f'of {output_chan[ll]}: {k}')

                                if flatten[ll]:
                                    for _, e in enumerate(k):
                                        col_target = add_kernel_data(
                                            ll, p, col_target, e)
                                else:
                                    for i in range(ksize):
                                        col_target = add_kernel_data(
                                            ll, p, col_target,
                                            k[ksize - i - 1])

                            else:  # When expanding, need to pad with zero kernels if needed
                                for _ in range(ksize // qfactor):
                                    col_target = add_kernel_data(
                                        ll, p, col_target, 0)

                        # Consume kernels
                        if not flatten[ll]:
                            col += qfactor
                            m += qfactor
                        else:
                            col += 1
                            m += 1
                    else:
                        m += qfactor

            if kern_offs[ll] + col_target < tc.dev.mask_width(p) \
               and kernels_used[p][kern_offs[ll] + col_target] > 0:  # Partials
                col_target += 1
            while col_target - start_col < kern_len[ll]:
                col_target = add_kernel_data(ll, p, col_target, 0)
            if flatten[ll]:
                kern_len[ll] = col_target
            else:
                assert kern_len[ll] == col_target - start_col
            proc_kern_max[p] = kern_offs[ll] + kern_len[ll]
            ch += 1
            m = 0

    if verbose:
        print('\nKernel map:')
        print_map(layers, kernel_map)

    if verify or not (embedded_code or mexpress):
        if verify:
            apb.output('int verify_kernels(void)\n{\n')
        # Write in-line
        for p in range(tc.dev.MAX_PROC):
            for col in range(0, tc.dev.mask_width(p)):
                ll = kernel_map[p][col]
                if ll != _INVALID_VALUE:
                    k = kernel_data[p][col]
                    apb.write_kern(ll,
                                   p,
                                   col,
                                   k,
                                   verify_only=verify,
                                   calcx4=calcx4)
        if verify:
            apb.output('  return 1;\n}\n\n')
    if embedded_code or mexpress:
        # Write kernels, combining layers and processors where possible to reduce the number
        # of constants and calls to memcpy.
        apb.output('// Kernels:\n')

        if not mexpress:
            for p in range(tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        k = kernel_data[p][col]
                        offs = _WORDS_PER_KERNEL * col
                        kernel_values[p][offs] = k[0] & 0xff
                        kernel_values[p][offs + 1] = (k[1] & 0xff) << 24 \
                            | (k[2] & 0xff) << 16 | (k[3] & 0xff) << 8 | k[4] & 0xff
                        kernel_values[p][offs + 2] = (k[5] & 0xff) << 24 \
                            | (k[6] & 0xff) << 16 | (k[7] & 0xff) << 8 | k[8] & 0xff

            # First, define the weights (will move to header file)
            # Combining memcopy() requires stacked memories
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0
                       ] * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    start = p
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                    # Combine multiple channels into one define
                    k = None
                    for i in range(start, p + 1):
                        if k is None:
                            k = kernel_values[i][min_col[i] *
                                                 _WORDS_PER_KERNEL:
                                                 (max_col[i] + 1) *
                                                 _WORDS_PER_KERNEL]
                        else:
                            k = np.concatenate(
                                (k, kernel_values[i]
                                 [min_col[i] *
                                  _WORDS_PER_KERNEL:(max_col[i] + 1) *
                                  _WORDS_PER_KERNEL]))

                    apb.output_define(k, f'KERNELS_{start}', '0x%08x', 8)
                p += 1

            # Second, initialize static const variables as source for memcpy
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(
                        f'static const uint32_t kernels_{start}[] = KERNELS_{start};\n'
                    )
                p += 1
            apb.output('\n')

            # Generate code to load the weights using memcpy
            apb.output(
                'void memcpy_96to128(uint32_t *dst, const uint32_t *src, int n)\n{\n'
            )
            apb.output('  while (n-- > 0) {\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = 0;  // Execute write\n'
                       '  }\n}\n\n')
        else:
            # When using the express loader, gather all consecutive kernels for each processor
            # and pack them.
            zero_kernel = np.array([0] * 9, dtype=np.uint8)
            k = None

            for p in range(tc.dev.MAX_PROC):
                # Find min/max from kernel_map
                max_col = -1
                min_col = tc.dev.mask_width(p) if not legacy_kernels else 0
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col = col
                        min_col = min(min_col, col)
                if max_col >= 0:
                    for col in range(min_col, max_col + 1):
                        ll = kernel_map[p][col]
                        if ll != _INVALID_VALUE:
                            new_k = (kernel_data[p][col] & 0xff).astype(
                                np.uint8)
                        else:
                            new_k = zero_kernel
                        if k is None:
                            k = new_k
                        else:
                            k = np.concatenate((k, new_k))

                    # Round up to multiple of 4
                    if len(k) % 4 != 0:
                        k = np.concatenate((k, zero_kernel[:4 - len(k) % 4]))
                    # '>u4' swaps endianness to what the hardware needs, `view` packs into 32-bit
                    if not blocklevel:
                        apb.output_define(k.view(dtype='>u4'), f'KERNELS_{p}',
                                          '0x%08x', 8)
                    else:
                        addr = tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                            + tc.dev.C_MRAM_BASE + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                        apb.write(addr + min_col * 4 | 0x01, 0x01)
                        kb = k.view(dtype=">u4")
                        for _, e in enumerate(kb):
                            apb.write(addr, e)
                            addr += 4

                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(
                        f'static const uint32_t kernels_{p}[] = KERNELS_{p};\n'
                    )
                    k = None
            apb.output('\n')

        if not blocklevel:
            apb.output('void load_kernels(void)\n{\n')
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0
                       ] * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    addr = apb.apb_base + tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                        + tc.dev.C_MRAM_BASE + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    assert addr % 16 == 0
                    if not mexpress:
                        apb.output('  memcpy_96to128((uint32_t *)'
                                   f' 0x{addr + min_col[start] * 16:08x},'
                                   f' kernels_{start}, {span});\n')
                    else:
                        apb.output(
                            '  *((volatile uint8_t *)'
                            f' 0x{addr + min_col[start] * 4 | 0x01:08x}) = 0x01; '
                            '// Set address\n')
                        apb.output(
                            f'  memcpy32((uint32_t *) 0x{addr:08x}, '
                            f'kernels_{start}, {(span * 9 + 3) // 4});\n')
                p += 1

            apb.output('}\n\n')

    return kern_offs, kern_len
示例#4
0
def load(
    checkpoint_file,
    unused_arch,
    fc_layer,
    quantization,
    bias_quantization,
    output_shift,
    kernel_size,  # this information available in onnx model
    operator,
    verbose=False,
    no_bias=None,
):
    """
    Load weights and biases from the ONNX model `checkpoint_file`.

    Every `Conv` and `Gemm` node of the graph counts as one layer. Inputs of these nodes for
    which `process_channels()` returns data are treated as weights (more than one dimension)
    or as bias (third node input).

    `unused_arch` is accepted for interface compatibility and is not consulted.
    If `fc_layer` is `True`, weights and biases of `Gemm` nodes are additionally collected
    in the separate `fc_weights`/`fc_bias` lists.
    `quantization` is a list of expected bit widths for the layer weights, indexed by layer.
    Each weight tensor is checked against its entry; an entry of `None` is replaced by a
    width derived from the weight range. The list is modified in place.
    `bias_quantization` is a list of the expected bit widths for the layer biases; the bias
    values are checked against it.
    `output_shift` is a list indexed by layer; `None` entries are replaced by 0 and every
    entry is increased by `8 - quantization[layer]`. The list is modified in place and
    returned.
    `kernel_size` is not consulted; the kernel shape is taken from the ONNX node (see the
    comment on the parameter).
    `operator` is a list indexed by layer; weights of layers marked as
    `opn.CONVTRANSPOSE2D` are flipped and have their first two axes swapped.
    `no_bias` is an optional collection of layer indices whose bias input is ignored.
    When `verbose` is set, display the shapes of the weights and a summary of all results.

    Returns the tuple `(layers, weights, bias, output_shift, fc_weights, fc_bias,
    input_channels, output_channels)`. Exits with status 1 when the kernel shape of a node
    does not match its weight tensor.
    """
    model = onnx.load(checkpoint_file)
    print(f'Reading {checkpoint_file} to configure network weights...')

    layers = 0
    num_conv_layers = len(quantization)
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    error_exit = False
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []
    seq = 0

    initializers = {t.name for t in model.graph.initializer}
    for node in model.graph.node:
        if node.op_type not in ('Conv', 'Gemm'):
            continue

        _inputs, _ = get_inouts(node)

        # Determine the kernel shape exactly once per node. Recording it once per input
        # (weight and bias) would make a per-layer lookup drift as soon as a layer has
        # a bias.
        if node.op_type == 'Gemm':  # general matrix multiplication (FC layer)
            kernel_shape = [1, 1]
        else:  # Conv layer; the `kernel_shape` attribute is optional
            kernel_shape = None
            for a in node.attribute:
                if a.name == 'kernel_shape':
                    kernel_shape = list(a.ints)

        for _input in _inputs:
            w = process_channels(model, _input, initializers)
            if w is None:
                continue

            if node.op_type == 'Gemm':
                if layers >= num_conv_layers:
                    continue
                if fc_layer:
                    if _input == _inputs[1]:  # weight
                        assert w.min() >= -128 and w.max() <= 127
                        fc_weights.append(w)

                    if len(_inputs) == 3:  # have optional bias input
                        if _input == _inputs[2]:  # bias
                            assert w.min() >= -128 and w.max() <= 127
                            fc_bias.append(w)
                    elif _input == _inputs[1]:  # add bias 'None'
                        fc_bias.append(None)  # during weight input processing

            if len(w.shape) > 1:  # not a bias
                quant.append(quantization[seq])

                w_min, w_max = w.min(), w.max()

                # Determine quantization or make sure that what was given fits
                if quantization[seq] is not None:
                    assert w_min >= -(2**(quantization[seq] - 1)), \
                        f'Layer {seq}: minimum weight {w_min} does not fit ' \
                        f'into {quantization[seq]} bits'
                    assert w_max < 2**(quantization[seq] - 1), \
                        f'Layer {seq}: maximum weight {w_max} does not fit ' \
                        f'into {quantization[seq]} bits'
                else:
                    if w_max > 0:
                        w_max_m = int(w_max)
                    else:
                        w_max_m = int(abs(w_max)) - 1
                    if w_min > 0:
                        w_min_m = int(w_min)
                    else:
                        w_min_m = int(abs(w_min)) - 1
                    quantization[seq] = 1 << (
                        fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                    assert quantization[seq] <= 8

                weight_min.append(w_min)
                weight_max.append(w_max)

                # Not overriding output_shift?
                if output_shift[seq] is None:
                    output_shift[seq] = 0
                # Add based on quantization
                output_shift[seq] += 8 - quantization[seq]

                # TODO: Double check if we need to check conv2d if opn is known
                # to be opn.CONVTRANSPOSE2D. We should be able to get this
                # from the op_type Conv plus shape?
                if operator[seq] == opn.CONVTRANSPOSE2D:
                    # For ConvTranspose2d, flip the weights as follows:
                    w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

                input_channels.append(w.shape[1])  # Input channels
                output_channels.append(w.shape[0])  # Output channels

                # Kernel shape to validate against the weight tensor. Without a
                # `kernel_shape` attribute, fall back to the spatial dimensions of the
                # weights. A single-entry shape (1D convolution) is padded to two entries
                # so that the checks below can always index both.
                layer_kernel = list(kernel_shape) if kernel_shape else list(w.shape[2:])
                layer_kernel += [1] * (2 - len(layer_kernel))

                if len(w.shape) == 2:  # MLP
                    if layer_kernel[0] != 1 or layer_kernel[1] != 1:
                        eprint(
                            f'The `kernel_size` for the MLP layer {seq} should '
                            f'be set to 1x1 instead of '
                            f'{layer_kernel[0]}x{layer_kernel[1]}.'
                        )
                        error_exit = True
                elif len(w.shape) == 3:  # 1D
                    if layer_kernel[0] != w.shape[2] or layer_kernel[1] != 1:
                        eprint(
                            f'The `kernel_size` for the 1D layer {seq} should '
                            f'be set to {w.shape[2]}x1 instead of '
                            f'{layer_kernel[0]}x{layer_kernel[1]}.'
                        )
                        error_exit = True
                elif len(w.shape) == 4:  # 2D
                    if layer_kernel[0] != w.shape[2] \
                       or layer_kernel[1] != w.shape[3]:
                        eprint(
                            f'The `kernel_size` for the 2D layer {seq} should '
                            f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                            f'{layer_kernel[0]}x{layer_kernel[1]}.'
                        )
                        error_exit = True

                w_count = np.prod(w.shape)
                param_count += w_count
                w_size = (w_count * quantization[seq] + 7) // 8
                weight_size.append(w_size)
                param_size += w_size

                if len(w.shape) == 2:  # linear - add dummy 'channel'
                    w = np.expand_dims(w, axis=0)
                else:  # conv1d, conv2d, ... - combine input and output channels
                    w = np.reshape(w, (-1, ) + w.shape[2:])

                weights.append(w)
                weight_keys.append(_input)

            if len(_inputs) < 3 or \
               (_input == _inputs[2] and seq in no_bias):  # no bias input
                bias.append(None)
                bias_min.append(0)
                bias_max.append(0)
                bias_keys.append('N/A')
                bias_quant.append(0)
                bias_size.append(0)
            elif _input == _inputs[2]:  # bias input
                w = w // tornadocnn.dev.BIAS_DIV
                w_min, w_max = w.min(), w.max()
                assert w_min >= -(2**(bias_quantization[seq] - 1))
                assert w_max < 2**(bias_quantization[seq] - 1)
                bias_min.append(w_min)
                bias_max.append(w_max)

                bias.append(w)
                bias_keys.append(_input)
                bias_quant.append(bias_quantization[seq])
                w_count = np.prod(w.shape)
                param_count += w_count
                # NOTE(review): this equals the element count for a bias width of 8;
                # confirm that the formula is intended for other widths.
                w_size = (w_count * 8 + (bias_quantization[seq] -
                                         1)) // bias_quantization[seq]
                bias_size.append(w_size)
                param_size += w_size

        # Every Conv/Gemm node counts as one layer, whether or not it yielded weights
        seq += 1
        layers += 1
        # TODO: Things to add
        # if attribute.name == 'pads':
        # if attribute.name == 'strides':

    if verbose:
        print(
            'Layer  InCh OutCh  Weights         Quant  Min Max   Size '
            'Key                                 Bias       Quant  Min Max Size Key'
        )
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                print(
                    f'{ll:4}: '
                    f'{input_channels[ll]:5} {output_channels[ll]:5}  '
                    f'{weight_shape:15} '
                    f'{quant[ll]:5} {weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
                    f'{weight_keys[ll]:35} '
                    f'{bias_shape:10} '
                    f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
                    f'{bias_keys[ll]:25}')
        print(
            f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes'
        )

    # Report all kernel shape mismatches (and the table above) before giving up
    if error_exit:
        sys.exit(1)

    if verbose:
        with np.printoptions(threshold=np.inf, linewidth=80):
            print("\nSUMMARY\n=======")
            print(layers, "layers\n")
            print("weights:")
            print(weights)
            print("bias:")
            print(bias)
            print("fc_weights:")
            print(fc_weights)
            print("fc_bias:")
            print(fc_bias)
            print("input_channels:")
            print(input_channels)
            print("output_channels:")
            print(output_channels)
            print("")

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels