# NOTE: these quantizer excerpts span several modules; the imports below are an
# assumed common set (numpy/math/copy/functools plus the project-local QType,
# QRec, MultMulBiasScaleQType and NodeId helpers used throughout).
import math
from copy import deepcopy
from functools import reduce

import numpy as np


def set_add_in_scale(qrec):
    scaled_idx = qrec.cache.get('scaled_idx')
    if scaled_idx is None:
        scaled_idx = (1 if qrec.in_qs[1].scale > qrec.in_qs[0].scale else 0)
        qrec.cache['scaled_idx'] = scaled_idx
    compute_in_out_scale(qrec, in_idx=0 if scaled_idx else 1)
    scale_in_mul_biases_q = qrec.cache.get('scale_in_mul_biases_q')
    if scale_in_mul_biases_q is None:
        scale_in_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_in_mul_biases_q'] = scale_in_mul_biases_q
    not_scaled_idx = 0 if scaled_idx else 1
    scale = qrec.in_qs[scaled_idx].scale / qrec.in_qs[not_scaled_idx].scale
    scale_in_mul_biases_q.scale = scale
    if qrec.in_qs[0].asymmetric:
        # (C - Zc)*Sc = (A - Za)*Sa + (B - Zb)*Sb
        # C = Sa/Sc*(A + B*Sb/Sa - Za - Zb*Sb/Sa) + Zc
        #   = Sa/Sc*(A + B*Sb/Sa) + (Zc - Sa/Sc*(Za + Zb*Sb/Sa))
        #                           |---------- bias ----------|
        add_bias = qrec.out_qs[0].zero_point - qrec.cache['scale_mul_biases_q'].scale * (
            qrec.in_qs[not_scaled_idx].zero_point +
            scale_in_mul_biases_q.scale * qrec.in_qs[scaled_idx].zero_point)
    else:
        add_bias = 0
    qrec.cache['add_bias_offset'] = np.round(add_bias).astype(np.int16)
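# Illustrative sketch (assumption, not part of the original module): the add
# scaling above in plain floats. The input with the larger scale is pre-scaled
# onto the other input's scale and the sum is then rescaled to the output.
def _add_scaling_example():
    s_a, s_b, s_c = 0.05, 0.02, 0.04    # hypothetical scales; a is 'scaled_idx'
    a, b = 10, 40                       # hypothetical quantized input values
    scale_in = s_a / s_b                # scale_in_mul_biases_q.scale
    scale_out = s_b / s_c               # scale_mul_biases_q.scale
    c = scale_out * (b + scale_in * a)  # symmetric case, so add_bias == 0
    assert abs(c * s_c - (a * s_a + b * s_b)) < 1e-9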
def _quantize(cls, params, in_qs, stats, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    # only attempt channel scaling if the second input is constant
    # if len(in_qs) > 2:
    in2_node, in_qs = cls.move_constant(G, fusion if fusion else params, in_qs)
    if in2_node:
        kwargs['graph_update']['requires_adjust'] = True
        in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                    quantized_dimension=0,
                                    dtype=np.int8,
                                    narrow_range=True,
                                    bits=8)
    else:
        in_q2 = in_qs[1].make_symmetric_signed()
    in_q1 = in_qs[0].make_symmetric_signed()
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        # can't be forced to something not np.int8
        if o_q.dtype != np.int8 or o_q.asymmetric:
            return None
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.asymmetric else "symmetric")
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=out_dtype)
    if len(in_qs) == 3:
        biases_q = QType(dtype=np.int32, scale=in_q1.scale * in_q2.scale)
        out_in_qs = [in_q1, in_q2, biases_q]
    else:
        out_in_qs = [in_q1, in_q2]
    mul_biases_q = MultMulBiasScaleQType()
    mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
    return QRec.scaled(in_qs=out_in_qs,
                       out_qs=[o_q],
                       mul_biases_q=mul_biases_q)
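# Sketch (assumption): why mul_biases_q.scale is Sin1*Sin2/Sout. The int32
# accumulator of the matmul holds values at scale Sin1*Sin2; multiplying by
# Sin1*Sin2/Sout re-expresses the accumulator in the output's scale, e.g. with
# Sin1=0.1, Sin2=0.05 and Sout=0.2 the accumulator is multiplied by 0.025.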
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    G = kwargs['G']
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    min_val, max_val = None, None
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.int8,
        narrow_range=opts['narrow_weights'])
    if fusion and fusion.fusion_type in ['conv_active_pool', 'conv_active']:
        stats = kwargs['all_stats'][NodeId(fusion, fusion.contained_nodes()[0])]
        if isinstance(fusion.contained_nodes()[1],
                      (SigmoidActivationParameters, TanHActivationParameters,
                       HSwishActivationParameters)):
            stats = kwargs['all_stats'][NodeId(fusion, fusion.contained_nodes()[0])]
        elif fusion and isinstance(fusion.contained_nodes()[1],
                                   HSigmoidActivationParameters):
            # Hard sigmoid implements a RELU, be sure 6 can be representable
            min_val, max_val = 0, 6
        else:
            # Take stats from activation after the convolution
            stats = kwargs['all_stats'][NodeId(fusion, fusion.contained_nodes()[1])]
    if min_val is None or max_val is None:
        min_val, max_val = stats['range_out'][0]['min'], stats['range_out'][0]['max']
    if force_out_q:
        o_q = force_out_q
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=out_dtype)
    biases_q = QType(dtype=np.int32, scale=weights_q.scale * in_qs[0].scale)
    mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, params)
    # returning the new weights and biases qs will force backprop
    # TODO - ACC_Q LOOKS WRONG AFTER THIS
    return MultScalableFilterQuantizationRecord(
        in_qs=[in_qs[0], weights_q, biases_q],
        out_qs=[o_q],
        acc_q=biases_q,
        calc_q=biases_q,
        mul_biases_q=mul_biases_q)
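# Sketch (assumption): biases are quantized at Sw*Sin so they can be added
# directly into the int32 accumulator (acc_q above), which also sits at Sw*Sin,
# e.g. Sin=0.02 with per-channel Sw=[0.004, 0.008] gives bias scales
# [8e-05, 1.6e-04], one per output channel.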
def compute_in_out_scale(qrec, in_idx=0, out_idx=0, extra_scale=1):
    if isinstance(in_idx, int):
        in_scale = qrec.in_qs[in_idx].scale
    else:
        in_scale = reduce(lambda x, y: x * y,
                          [qrec.in_qs[idx].scale for idx in in_idx])
    if isinstance(out_idx, int):
        out_scale = qrec.out_qs[out_idx].scale
    else:
        out_scale = reduce(lambda x, y: x * y,
                           [qrec.out_qs[idx].scale for idx in out_idx])
    scale_mul_biases_q = qrec.cache.get('scale_mul_biases_q')
    if scale_mul_biases_q is None:
        scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q
    scale = in_scale * extra_scale / out_scale
    scale_mul_biases_q.scale = scale
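# Usage sketch (assumption): in_idx/out_idx may be an index or a sequence of
# indices whose scales are multiplied together. For an element-wise multiply,
# compute_in_out_scale(qrec, in_idx=(0, 1)) sets the rescale to Sin0*Sin1/Sout;
# for a plain copy-through op the default in_idx=0 gives Sin/Sout.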
def set_ssd_scales(qrec, params):
    offset_q = qrec.in_qs[0]
    anchors_q = qrec.in_qs[2]
    out_boxes_q = qrec.out_qs[0]
    for k in ['scale_x_q', 'scale_x_anc_q', 'scale_y_q', 'scale_y_anc_q',
              'scale_h_q', 'scale_w_q', 'scale_ao_q']:
        if k not in qrec.cache:
            qrec.cache[k] = MultMulBiasScaleQType(dtype=np.uint8)
    qrec.cache['scale_x_q'].scale = (offset_q.scale * anchors_q.scale) / \
        (out_boxes_q.scale * params.x_scale)
    qrec.cache['scale_x_anc_q'].scale = params.x_scale / offset_q.scale
    qrec.cache['scale_y_q'].scale = (offset_q.scale * anchors_q.scale) / \
        (out_boxes_q.scale * params.y_scale)
    qrec.cache['scale_y_anc_q'].scale = params.y_scale / offset_q.scale
    qrec.cache['scale_h_q'].scale = offset_q.scale / params.h_scale
    qrec.cache['scale_w_q'].scale = offset_q.scale / params.w_scale
    qrec.cache['scale_ao_q'].scale = anchors_q.scale * \
        2**(-15) / out_boxes_q.scale
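# Context sketch (assumption, standard SSD box decode): these scales serve
#   ycenter = (ty / y_scale) * anchor_h + anchor_ycenter
#   h       = exp(th / h_scale) * anchor_h
# so scale_y_q folds Soffset*Sanchor/(Sout*y_scale) into one fixed-point
# multiply, and scale_ao_q rescales the Q15 exp/anchor product to the output.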
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    in_qs = deepcopy(in_qs)
    # qrecs = kwargs['qrecs']
    G = kwargs['G']
    o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                max_val=stats['range_out'][0]['max'],
                                dtype=out_dtype)
    # input_nodes = {GRUParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
    #                for edge in G.in_edges(params.name)
    #                if isinstance(edge.from_node, ConstantInputParameters)}
    names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}
    # quantization_mode: extended, autotiler
    # state_width: 16bit or 8bit
    # if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
    #     LOG.info(
    #         "node %s has similar input and i_state scales --> "
    #         "will be generated the same_scale kernel with better performance",
    #         params.name)
    #     params.rnn_same_inout_scale = True
    #     G.node_options[NodeId(params)] = params.at_options
    if params.rnn_same_inout_scale:
        wWz_scale = rWz_scale = np.maximum(in_qs[names['w_2_z_w']].scale,
                                           in_qs[names['r_2_z_w']].scale)
        wWr_scale = rWr_scale = np.maximum(in_qs[names['w_2_r_w']].scale,
                                           in_qs[names['r_2_r_w']].scale)
        wWh_scale = rWh_scale = np.maximum(in_qs[names['w_2_h_w']].scale,
                                           in_qs[names['r_2_h_w']].scale)
        i_2_z_WR_q = i_2_r_WR_q = i_2_h_WR_q = None
        in_q = state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
        in_scale = state_scale = in_q.scale
    else:
        wWz_scale = in_qs[names['w_2_z_w']].scale
        wWr_scale = in_qs[names['w_2_r_w']].scale
        wWh_scale = in_qs[names['w_2_h_w']].scale
        rWz_scale = in_qs[names['r_2_z_w']].scale
        rWr_scale = in_qs[names['r_2_r_w']].scale
        rWh_scale = in_qs[names['r_2_h_w']].scale
        in_scale = in_qs[0].scale
        in_q = in_qs[0]
        state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
        state_scale = state_q.scale
        i_2_z_WR_q = MultMulBiasScaleQType(scale=(wWz_scale * in_scale) /
                                           (rWz_scale * state_scale))
        i_2_r_WR_q = MultMulBiasScaleQType(scale=(wWr_scale * in_scale) /
                                           (rWr_scale * state_scale))
        i_2_h_WR_q = MultMulBiasScaleQType(scale=(wWh_scale * in_scale) /
                                           (rWh_scale * state_scale))
    i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
    h_WR_2_int_q = MultMulBiasScaleQType(scale=(rWh_scale * state_scale) / i_qtype.scale)
    r_WR_2_int_q = MultMulBiasScaleQType(scale=(rWr_scale * state_scale) / i_qtype.scale)
    z_WR_2_int_q = MultMulBiasScaleQType(scale=(rWz_scale * state_scale) / i_qtype.scale)
    if not params.rnn_states_as_inputs:
        in_qs[names['h_state']].scale = state_q.scale
        # cls.rescale_constant(input_nodes['h_state'], state_q.scale, qrecs)
    in_qs[0].scale = in_scale
    o_q.scale = state_scale
    in_qs[names['z_b']].scale = in_scale * rWz_scale
    in_qs[names['z_b']].dtype = BIAS_DTYPE
    # cls.rescale_constant(input_nodes['z_b'], in_scale * rWz_scale, qrecs, dtype=BIAS_DTYPE)
    in_qs[names['r_b']].scale = in_scale * rWr_scale
    in_qs[names['r_b']].dtype = BIAS_DTYPE
    # cls.rescale_constant(input_nodes['r_b'], in_scale * rWr_scale, qrecs, dtype=BIAS_DTYPE)
    in_qs[names['w_h_b']].scale = in_scale * wWh_scale
    in_qs[names['w_h_b']].dtype = BIAS_DTYPE
    # cls.rescale_constant(input_nodes['w_h_b'], in_scale * wWh_scale, qrecs, dtype=BIAS_DTYPE)
    in_qs[names['r_h_b']].scale = in_scale * rWh_scale
    in_qs[names['r_h_b']].dtype = BIAS_DTYPE
    # cls.rescale_constant(input_nodes['r_h_b'], state_scale * rWh_scale, qrecs, dtype=BIAS_DTYPE)
    in_qs[names['w_2_z_w']].scale = wWz_scale
    in_qs[names['w_2_z_w']].dtype = WEIGHTS_DTYPE
    # cls.rescale_constant(input_nodes['w_2_z_w'], wWz_scale, qrecs, dtype=WEIGHTS_DTYPE)
    in_qs[names['w_2_r_w']].scale = wWr_scale
    in_qs[names['w_2_r_w']].dtype = WEIGHTS_DTYPE
    # cls.rescale_constant(input_nodes['w_2_r_w'], wWr_scale, qrecs, dtype=WEIGHTS_DTYPE)
    in_qs[names['w_2_h_w']].scale = wWh_scale
    in_qs[names['w_2_h_w']].dtype = WEIGHTS_DTYPE
    # cls.rescale_constant(input_nodes['w_2_h_w'], wWh_scale, qrecs, dtype=WEIGHTS_DTYPE)
    in_qs[names['r_2_z_w']].scale = rWz_scale
    in_qs[names['r_2_z_w']].dtype = WEIGHTS_DTYPE
    # cls.rescale_constant(input_nodes['r_2_z_w'], rWz_scale, qrecs, dtype=WEIGHTS_DTYPE)
    in_qs[names['r_2_r_w']].scale = rWr_scale
    in_qs[names['r_2_r_w']].dtype = WEIGHTS_DTYPE
    # cls.rescale_constant(input_nodes['r_2_r_w'], rWr_scale, qrecs, dtype=WEIGHTS_DTYPE)
    in_qs[names['r_2_h_w']].scale = rWh_scale
    in_qs[names['r_2_h_w']].dtype = WEIGHTS_DTYPE
    # cls.rescale_constant(input_nodes['r_2_h_w'], rWh_scale, qrecs, dtype=WEIGHTS_DTYPE)
    return QRec.scaled(in_qs=in_qs,
                       out_qs=[o_q],
                       i_2_z_WR_q=i_2_z_WR_q,
                       i_2_r_WR_q=i_2_r_WR_q,
                       i_2_h_WR_q=i_2_h_WR_q,
                       h_WR_2_int_q=h_WR_2_int_q,
                       r_WR_2_int_q=r_WR_2_int_q,
                       z_WR_2_int_q=z_WR_2_int_q,
                       i_qtype=i_qtype,
                       scales={
                           'w_2_z_w': wWz_scale,
                           'w_2_r_w': wWr_scale,
                           'w_2_h_w': wWh_scale,
                           'r_2_z_w': rWz_scale,
                           'r_2_r_w': rWr_scale,
                           'r_2_h_w': rWh_scale,
                           'in': [in_scale],
                           'state': state_scale,
                           'out': [state_scale]
                       })
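# Sketch (assumption): i_2_z_WR_q and friends rescale the input-side gate
# accumulator (scale wWz*Sin) onto the state-side accumulator (scale rWz*Sstate)
# so the two matmul results can be summed; h_WR_2_int_q etc. then take that sum
# to the internal Q12 scale the gate activations expect.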
def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    # only attempt channel scaling if we have a bias
    in2_node, in_qs = cls.move_constant(G, fusion if fusion else params, in_qs)
    if not in2_node:
        raise ValueError(
            f"matmul {params.name} with a non-constant second input is not supported on NE16")
    w1, h1 = params.in_dims[0].shape[0], params.in_dims[0].shape[1]
    h2, w2 = params.in_dims[1].shape[0], params.in_dims[1].shape[1]
    h2_padded = roundup(h2, input_bits == 16)
    kwargs['graph_update']['requires_adjust'] = True
    in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                quantized_dimension=0,
                                dtype=np.uint8,
                                narrow_range=True,
                                bit_pack=opts['weight_bits'],
                                no_compression=True,
                                bits=opts['weight_bits'],
                                resize=((h2, w2), (h2_padded, w2)))
    in_q1 = QType.from_min_max_sq(in_qs[0].min_val,
                                  in_qs[0].max_val,
                                  dtype=input_dtype,
                                  asymmetric=True)
    in_q1 = limit_input_precision(params, input_bits, in_q1, w1, False,
                                  opts['weight_bits'])
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.asymmetric else "symmetric")
    else:
        force_output_size = opts.get('force_output_size', 8)
        out_dtype = np.uint8 if force_output_size == 8 else np.uint16
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dont_copy_attr=['ne16'],
                                    asymmetric=True,
                                    dtype=out_dtype)
    if len(in_qs) == 3:
        biases_q = QType(dtype=np.int32,
                         scale=in_q1.scale * in_q2.scale,
                         ne16_biases=(input_bits != 16))
        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        bias_offset = np.zeros((in2_node.dqvalue.shape[0], ), dtype=np.int32)
        if in_q1.zero_point != 0:
            # input zero correction is sum(W * Zin) by out_c if weights are channel scaled
            bias_offset -= np.sum(
                np.multiply(in_q1.zero_point,
                            in2_node.value_as(in_q2).astype(np.int32) -
                            in_q2.zero_point,
                            dtype=np.int32),
                dtype=np.int32,
                axis=1)
        if o_q.zero_point != 0:
            # output zero correction is So/(Si * Sw) * ZPo by out_c if weights are channel scaled
            scale = o_q.scale / (in_q1.scale * in_q2.scale)
            bias_offset += np.floor((o_q.zero_point * scale) + 0.5).astype(np.int32)
        if not np.all(bias_offset == 0):
            biases_q.offset = bias_offset
        out_in_qs = [in_q1, in_q2, biases_q]
    else:
        out_in_qs = [in_q1, in_q2]
    mul_biases_q = MultMulBiasScaleQType()
    mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
    o_q.attr.ne16 = True
    if input_bits == 16:
        prenorm = min(np.min(np.min(mul_biases_q.qnorms)), 8)
    else:
        prenorm = 0
    mul_biases_q.pre_normalization = prenorm
    return QRec.scaled(in_qs=out_in_qs,
                       out_qs=[o_q],
                       mul_biases_q=mul_biases_q,
                       ne16=True)
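# Worked sketch (assumption): the zero-point corrections above in scalar form.
# With Zin=5 and one output channel of quantized weights w=[2, -1, 3] (Zw=0),
# the input correction is -sum(w)*Zin = -20; with So/(Si*Sw)=100 and Zo=3 the
# output correction is round(3*100) = 300, so bias_offset = 280 for that channel.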
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0)
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    G = kwargs['G']
    opts = kwargs['opts']
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                max_val=stats['range_out'][0]['max'],
                                dtype=out_dtype)
    if force_out_qs and force_out_qs[0]:
        LOG.warning('on node %s output is being forced from scale %s -> %s',
                    params.name, o_q.scale, force_out_qs[0].scale)
        o_q = force_out_qs[0]
    names = {val: idx for idx, val in enumerate(LSTMParameters.INPUT_NAMES)}
    cell_range = stats.get('range_cell')
    if cell_range is None:
        raise ValueError(f'cell range not present in stats for {params.name}')
    cell_stat = max(abs(cell_range[var]) for var in ['min', 'max'])
    if params.cell_clip and not params.quant_c_state_with_stat:
        cell_max = params.cell_clip
        ratio_c = cell_max / cell_stat
        if not (ratio_c > 0.9 and ratio_c < 1.1):
            LOG.warning(
                f"C state is forced to a range [-{cell_max}:{cell_max}] different to the one calculated "
                f"from the inference statistic [-{cell_stat}:{cell_stat}], consider using nodeoption {params.name} "
                "QUANT_C_STATE_WITH_STAT 1 to force it to be the one calculated")
    else:
        cell_max = cell_stat
    cell_int_bits = calc_bits(cell_max)
    in_qs[names['c_state']].recalculate_scale(-cell_max, cell_max)
    LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
              in_qs[names['c_state']].range)
    int2_scale = int3_scale = out_tanh_sig_scale = None
    if params.hard_act:
        # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
        # but also (internal_q * 2) + cell_bits = 32
        int_q = min((16 - cell_int_bits), 10)
        int2_scale = math.pow(2, -(int_q * 2))
        int3_scale = math.pow(2, -(int_q * 3))
    else:
        int_q = 12
        # output of LUT activations are always Q15
        out_tanh_sig_scale = math.pow(2, -15)
    int_scale = math.pow(2, -int_q)
    scale_pairs = {chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
                   for chan in ['i', 'o', 'c', 'f']}
    for weight_name in [weight_name for scale_pair in scale_pairs.values()
                        for weight_name in scale_pair]:
        in_qs[names[weight_name]] = deepcopy(in_qs[names[weight_name]])
        in_qs[names[weight_name]].dtype = np.int8
        in_qs[names[weight_name]].bits = opts['weight_bits']
    w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                for k, (namei, namer) in scale_pairs.items()]
    if (abs(1 - in_qs[0].scale / o_q.scale) < 0.1) and \
            all([(abs(1 - w_scale[0] / w_scale[1]) < 0.2) for w_scale in w_scales]):
        LOG.info(
            "node %s has similar input and i_state scales --> "
            "will be generated the same_scale kernel with better performance",
            params.name)
        params.rnn_same_inout_scale = True
        G.node_options[NodeId(params)] = params.at_options
    if params.rnn_same_inout_scale:
        if not (abs(1 - in_qs[0].scale / o_q.scale) < 0.1) and \
                not all([(abs(1 - w_scale[0] / w_scale[1]) < 0.1) for w_scale in w_scales]):
            LOG.warning(
                "node %s has different input and i_state scales, consider using the "
                "LSTM kernel with rnn_same_inout_scale=False (better accuracy)",
                params.name)
        # in and out and state are all in the same scale
        in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
        # i_state scale may be 1 since the value is 0
        # np.maximum(in_and_out_scale, in_qs[names['i_state']].scale)
        i_state_scale = in_scale = in_and_out_scale
        in_qs[0].scale = in_scale
        o_q.scale = in_scale
        scales = {k: np.maximum(in_qs[names[namei]].scale,
                                in_qs[names[namer]].scale)
                  for k, (namei, namer) in scale_pairs.items()}
        for k, (namei, namer) in scale_pairs.items():
            in_qs[names[namei]].scale = scales[k]
            in_qs[names[namer]].scale = scales[k]
    else:
        in_scale = in_qs[0].scale
        i_state_scale = o_q.scale
        o_q.scale = i_state_scale
    if not params.rnn_states_as_inputs:
        in_qs[names['i_state']].scale = i_state_scale
    # compute scales for perceptrons
    r_pscales = {k: in_qs[names["r_2_%s_w" % k]].scale * i_state_scale
                 for k in ['i', 'o', 'c', 'f']}
    scale_qtypes = {"r_2_%s_q" % k: MultMulBiasScaleQType(scale=r_pscale / int_scale)
                    for k, r_pscale in r_pscales.items()}
    i_pscales = {k: in_qs[names["i_2_%s_w" % k]].scale * in_scale
                 for k in ['i', 'o', 'c', 'f']}
    # if input and i_state have different scales -> scale the inputs before sum
    # otherwise do nothing and these scales will be ignored
    scale_qtypes.update(
        {"i_2_%s_q" % k: MultMulBiasScaleQType(scale=i_pscale / r_pscale)
         for (k, i_pscale), r_pscale in zip(i_pscales.items(), r_pscales.values())})
    if params.hard_act:
        cell_in_scale = in_qs[names['c_state']].scale / int_scale
        cell_out_scale = int2_scale / in_qs[names['c_state']].scale
        state_out_scale = int3_scale / i_state_scale
    else:
        cell_in_scale = in_qs[names['c_state']].scale * out_tanh_sig_scale / int_scale
        cell_out_scale = int_scale / in_qs[names['c_state']].scale
        state_out_scale = out_tanh_sig_scale / i_state_scale
    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # TODO - Check cell clip here
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(scale=state_out_scale)
    # set internal scale
    scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
    # set biases to output of perceptron
    for gate in ['i', 'o', 'c', 'f']:
        in_qs[names[f"{gate}_b"]].scale = r_pscales[gate]
        in_qs[names[f"{gate}_b"]].dtype = np.int32
    if params.lstm_output_c_state:
        out_qs = [o_q, in_qs[names['c_state']]]
    else:
        out_qs = [o_q]
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=out_qs,
        **scale_qtypes,
    )
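# Worked sketch (assumption): the hard-act internal Q above. If calc_bits
# returns 7 integer bits for the cell range, int_q = min(16 - 7, 10) = 9 and
# the internal products use scales 2**-18 (int2) and 2**-27 (int3); the
# min(..., 10) cap keeps (internal_q * 3) + 2 within the 32-bit accumulator.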
def _quantize_rnn(cls, params, in_qs, stats, input_bits, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    in_out_dtype = np.uint16 if input_bits == 16 else np.uint8
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    in_qs = deepcopy(in_qs)
    if in_qs is None:
        return None
    in_q = in_qs[0]
    opts = kwargs['opts']
    # qrecs = kwargs['qrecs']
    G = kwargs['G']
    in_edges = G.indexed_in_edges(params.name)
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                max_val=stats['range_out'][0]['max'],
                                dtype=in_out_dtype,
                                narrow_range=opts['narrow_state'])
    names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
    in_qs[names['i_state']] = o_q
    woff = {}
    int_num_inp = roundup(params.n_inputs, input_bits == 16)
    in_q = limit_input_precision(params, input_bits, in_q, int_num_inp,
                                 opts['narrow_weights'], opts['weight_bits'])
    woff['i_2_i_w'] = calculatate_weight_q(
        in_qs, in_edges, names['i_2_i_w'], in_q.zero_point[0],
        (params.n_states, params.n_inputs), (params.n_states, int_num_inp),
        opts['weight_bits'], opts['narrow_weights'])
    int_num_states = roundup(params.n_states, input_bits == 16)
    o_q = limit_input_precision(
        params, input_bits, o_q, int_num_states, opts['narrow_weights'],
        opts['weight_bits'],
        extra_correction=-1 if opts.get('narrow_state') else 0)
    woff['r_2_i_w'] = calculatate_weight_q(
        in_qs, in_edges, names['r_2_i_w'], o_q.zero_point[0],
        (params.n_states, params.n_states), (params.n_states, int_num_states),
        opts['weight_bits'], opts['narrow_weights'])
    i_state_scale = in_qs[names['i_state']].scale
    # rescale input * weight result to state * weight result so that they can be accumulated
    inp_before_scale = in_q.scale * in_qs[names['i_2_i_w']].scale
    state_w_scale = i_state_scale * in_qs[names['r_2_i_w']].scale
    # In 8 bit kernel input is rescaled to state scale
    # In 16 bit kernel input is scaled to LUT act input scale to avoid overflow
    # Bias zero correction is rescaled to state * state_w in both cases
    rescale = inp_before_scale / state_w_scale
    if input_bits == 8:
        i_2_s_q = MultMulBiasScaleQType(scale=rescale)
        # in 8 bit mode the zero-point corrections are applied by NE16 as biases,
        # so they must be multiplied by the scale's qbiases with norm rounding added
        i_zp_b = woff['i_2_i_w'] * i_2_s_q.qbiases.astype(np.int32) + \
            (1 << (i_2_s_q.qnorms.astype(np.int32) - 1))
        woff = woff['r_2_i_w']
    else:
        i_2_s_q = MultMulBiasScaleQType(
            scale=((in_q.scale * in_qs[names['i_2_i_w']].scale) / math.pow(2, -12)))
        i_2_s_q.pre_normalization = min(opts['weight_bits'],
                                        np.min(i_2_s_q.qnorms))
        # in 16 bit mode biases are streamed in so zp corr already in right scale
        # and does not need norm rounding
        i_zp_b = woff['i_2_i_w']
        woff = woff['r_2_i_w']
    # hard activations are only implemented for 8 bit mode at present
    if input_bits == 8 and params.hard_act:
        act_input_scale = i_state_scale
        s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / i_state_scale)
        s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
        act_output_scale = i_state_scale
        act_qtype = QType(dtype=np.int8, scale=act_input_scale,
                          narrow_range=opts.get('narrow_state'))
    else:
        act_input_scale = math.pow(2, -12)
        act_output_scale = math.pow(2, -15)
        act_qtype = None
        s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / act_input_scale)
        if input_bits == 16:
            s_2_s_q.pre_normalization = min(opts['weight_bits'],
                                            np.min(s_2_s_q.qnorms))
        s_2_o_q = MultMulBiasScaleQType(scale=act_output_scale / o_q.scale)
    if input_bits == 8:
        in_qs[names['i_b']].scale = state_w_scale / s_2_s_q.qbiases
        in_qs[names['i_b']].dtype = np.int32
        in_qs[names['i_b']].offset = woff * s_2_s_q.qbiases.astype(np.int32) + \
            (1 << (s_2_s_q.qnorms.astype(np.int32) - 1))
        if i_zp_b is not None:
            in_qs[names['i_b']].attr.interleaved_values = [i_zp_b]
    else:
        in_qs[names['i_b']].scale = state_w_scale
        in_qs[names['i_b']].dtype = np.int32
        in_qs[names['i_b']].offset = woff
        # Interleave input zero offset bias with state bias at generation time
        in_qs[names['i_b']].attr.interleaved_values = [i_zp_b]
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=[o_q],
        s_2_s_q=s_2_s_q,
        i_2_s_q=i_2_s_q,
        s_2_o_q=s_2_o_q,
        act_qtype=act_qtype,
        scales={
            'int_scale': act_output_scale,
            'out_scale': o_q.scale,
            'act_input_scale': act_input_scale,
            'inp_after_scale': i_state_scale * in_qs[names['r_2_i_w']].scale,
            'inp_before_scale': inp_before_scale
        },
        ne16=True
    )
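# Worked sketch (assumption): the '1 << (qnorms - 1)' term above is the usual
# round-to-nearest offset for an arithmetic right shift: with qnorm = 8 the
# kernel computes (acc * qbias + 128) >> 8, so folding 128 into the stored bias
# bakes the rounding into the NE16 bias add.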
def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
    if not cls.cache_values(node, qrec):
        return False
    in_q = qrec.in_qs[0]
    out_q = qrec.out_qs[0]
    comment = f'in q: {in_q} out_q: {out_q}'
    if qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FP_ZEROPOINT':
        bits = 8 if in_q.dtype in [np.int8, np.uint8] else 16
        if in_q.signed:
            offset = ((int(math.pow(2, bits)) + in_q.zero_point[0] -
                       out_q.zero_point[0]) % int(math.pow(2, bits))).astype(out_q.dtype)
        else:
            offset = (int(math.pow(2, bits)) - in_q.zero_point[0] +
                      out_q.zero_point[0]).astype(out_q.dtype)
        contents = np.array(list(offset.tobytes()) + ([0] * 7), dtype=np.uint8)
    elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FP':
        # no infos needed
        return True
    elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FP_SCALE':
        scale = in_q.scale / out_q.scale
        in_abs_zp = in_q.zero_point.astype(np.int32)
        out_abs_zp = out_q.zero_point.astype(np.int32)
        if out_q.bits > in_q.bits:
            zero_adjust = (np.round(-in_abs_zp * scale) + out_abs_zp).astype(np.int32)
        else:
            zero_adjust = (-in_abs_zp + np.round(out_abs_zp * 1 / scale)).astype(np.int32)
        zero_adjust = list(zero_adjust.tobytes())
        if len(scale) > 1:
            raise NotImplementedError('multiscale conversion not supported')
        scale = scale[0]
        if in_q.dtype_bits == 8 and out_q.dtype_bits == 16:
            # scale Q16 * Q8 OK
            scale_adjust = MultMulBiasScaleQType(scale=scale, dtype=np.int16,
                                                 available_bits=16)
        else:
            scale_adjust = MultMulBiasScaleQType(scale=scale, dtype=np.int8,
                                                 available_bits=8)
        qbias = list(scale_adjust.qbiases.tobytes())
        qbias = qbias + [0] * (2 - len(qbias))
        qnorm = list(scale_adjust.qnorms.tobytes())
        contents = np.array(zero_adjust + qbias + qnorm + [0], dtype=np.int8)
    elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FL_FP':
        qbias = list((1 / out_q.scale).astype(np.float32).tobytes())
        zero_adjust = list((out_q.zero_point.astype(np.int32) *
                            out_q.scale).astype(np.float32).tobytes())
        contents = np.array(zero_adjust + qbias, dtype=np.int8)
    elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FL':
        qbias = list((in_q.scale).astype(np.float32).tobytes())
        zero_adjust = list((-in_q.zero_point.astype(np.int32)).astype(np.float32).tobytes())
        contents = np.array(zero_adjust + qbias, dtype=np.int8)
    else:
        raise ValueError(f"strange dtype change in {pnode.name}")
    cname, file_name = gen_constant(gen, pnode, pnode, INFOS)
    const_info = ConstantInfo(file_name,
                              QType.Pow2(bits=8, q=0, signed=True),
                              contents=contents)
    gen.globals.append(
        GlobalArgInfo("int8", cname,
                      gen.opts['default_global_home_location'],
                      gen.opts['default_global_exec_location'],
                      const_info=const_info,
                      comment=comment))
    return True  # infos global emitted successfully
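# Worked sketch (assumption): the KOP_CONVERT_FP_FP_ZEROPOINT offset for a
# signed int8 -> uint8 conversion with Zin = 0 and Zout = 128 is
# (256 + 0 - 128) % 256 = 128, i.e. the kernel just adds 128 to every sample.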
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    in_qs = deepcopy(in_qs)
    in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0, dtype=np.int8)
    if in_qs is None:
        return None
    opts = kwargs['opts']
    # qrecs = kwargs['qrecs']
    G = kwargs['G']
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                max_val=stats['range_out'][0]['max'],
                                dtype=np.int8)
    names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
    # quantization_mode: extended, autotiler
    # state_width: 16bit or 8bit
    if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
        LOG.info(
            "node %s has similar input and i_state scales --> "
            "will be generated the same_scale kernel with better performance",
            params.name)
        params.rnn_same_inout_scale = True
        G.node_options[NodeId(params)] = params.at_options
    edges = G.indexed_in_edges(params.name)
    w_q = in_qs[names['i_2_i_w']]
    in_qs[names['i_2_i_w']] = QType.from_min_max_sq(
        w_q.min_val, w_q.max_val,
        dtype=np.int8,
        bits=opts['weight_bits'],
        narrow_range=opts.get('narrow_weights', True),
        dont_generate_value=True)
    w_q = in_qs[names['r_2_i_w']]
    in_qs[names['r_2_i_w']] = QType.from_min_max_sq(
        w_q.min_val, w_q.max_val,
        dtype=np.int8,
        bits=opts['weight_bits'],
        narrow_range=opts.get('narrow_weights', True),
        concatenated_nodes=[edges[names['i_2_i_w']].from_node.name])
    w_scales = np.maximum(in_qs[names['i_2_i_w']].scale,
                          in_qs[names['r_2_i_w']].scale)
    if params.rnn_same_inout_scale:
        in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
        in_qs[0].scale = in_and_state_scale
        o_q.scale = in_and_state_scale
        if not params.rnn_states_as_inputs:
            in_qs[names['i_state']].scale = in_and_state_scale
        i_state_scale = in_and_state_scale
        i_2_a_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
    else:
        i_state_scale = in_qs[names['i_state']].scale
        i_2_a_q = MultMulBiasScaleQType(scale=in_qs[0].scale / i_state_scale)
    in_qs[names['i_2_i_w']].scale = w_scales
    in_qs[names['r_2_i_w']].scale = w_scales
    state_w_scale = i_state_scale * w_scales
    in_qs[names['i_b']].scale = state_w_scale
    in_qs[names['i_b']].dtype = np.int32
    if params.hard_act:
        s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / i_state_scale)
        s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
        act_output_scale = math.pow(2, -7)
    else:
        act_input_scale = math.pow(2, -12)
        act_output_scale = math.pow(2, -15)
        s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / act_input_scale)
        s_2_o_q = MultMulBiasScaleQType(scale=act_output_scale / o_q.scale)
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=[o_q],
        s_2_s_q=s_2_s_q,
        i_2_a_q=i_2_a_q,
        s_2_o_q=s_2_o_q,
        scales={
            'int_scale': act_output_scale,
            'out_scale': o_q.scale
        }
    )
def _quantize(cls, params, in_qs, stats, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    if cls.can_ne16(params, opts, fusion):
        LOG.info('selecting USQ8 NE16 kernel filter quantizer')
        return cls.quantize_ne16(params, in_qs, stats, **kwargs)
    LOG.info('selecting SQ8 software kernel filter quantizer')
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    in_q = in_qs[0]
    # check input quantization and int8 type
    # if not padded we can scale asymmetric
    if in_q.dtype == np.uint8:
        # handle NE16
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        # allow asymmetric if not padded
        if isinstance(params, Conv2DParameters) and params.padding.has_padding:
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8,
                                         forced=True)
        else:
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8,
                                         zero_point=in_q.zero_point - 128)
    elif (isinstance(params, Conv2DParameters) and not in_q.is_symmetric and
          params.padding.has_padding):
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=np.int8)
    # if not forced we can try asymmetric
    elif (opts['allow_asymmetric'] and isinstance(params, Conv2DParameters) and
          not in_q.forced and in_q.is_symmetric and
          not params.padding.has_padding):
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=np.int8,
                                     asymmetric=True)
    if opts['weight_bits'] != 8:
        LOG.warning('sub byte weights quantization requested but NE16 kernel not selected')
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.int8,
        narrow_range=opts['narrow_weights'],
        bits=opts['weight_bits'])
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        # can't be forced to something not np.int8
        if o_q.dtype != np.int8:
            return None
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.is_asymmetric else "symmetric")
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=out_dtype,
                                    asymmetric=opts['allow_asymmetric'])
    biases_q = QType(dtype=np.int32, scale=weights_q.scale * in_q.scale)
    mul_biases_q = MultMulBiasScaleQType.from_filter(in_q, weights_q, o_q, params)
    # returning the new weights and biases qs will force backprop
    # calculate bias offset - this will be added to the bias in the kernel
    # it is already in quantized form
    biases_q.offset = FilterMult.calculate_bias_offset(
        params, in_q, weights_node, weights_q, o_q)
    if not (opts['allow_asymmetric'] or force_out_q or biases_q.offset is None):
        raise ValueError(
            f'bias offset is set but asymmetric is disallowed in {params.name}')
    # o_q.set_forced(flags=['dtype'])
    # in_q.set_forced(flags=['dtype'])
    if isinstance(params, Conv2DParameters) and params.padding.has_padding:
        in_q.set_forced(flags=['zero_point'])
    cls.check_order(params, AT_SW_KER_IN_ORDER, AT_SW_KER_OUT_ORDER)
    return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                       out_qs=[o_q],
                       acc_q=biases_q,
                       calc_q=biases_q,
                       mul_biases_q=mul_biases_q)
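# Note (assumption): a padded conv needs the padding value 0.0 to be exactly
# representable, which is why the padded branches above force a symmetric int8
# input (and later pin its zero_point with set_forced), while the unpadded
# branches are allowed to keep or adopt an asymmetric range.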
def new_load_filter_parameters(cls, G, params, filter_shape, filter_scale_axis,
                               input_tensor, weights_node, bias_node,
                               output_tensor, opts, dw_to_pw=False):
    weights_node.meta['filter_params'] = True
    bias_node.meta['filter_params'] = True
    # if quantization is not loaded then the constants will already be dequantized
    if dw_to_pw:
        # Conv has been converted from depthwise to pointwise so reorder the weights tensor
        weights_node.value = np.transpose(weights_node.value,
                                          cls.TF_LITE_DW_FILTER_TRANSPOSE)
        weights_node.dims = Dim.unnamed(weights_node.value.shape)
    if not opts.get('load_quantization'):
        return
    wqtype = weights_node.qtype
    if wqtype is None:
        LOG.warning('quantization is missing on node %s', params.name)
        return
    # scale weights as requested. change asymmetric and/or unsigned weights to signed symmetric
    if wqtype.asymmetric or not wqtype.signed:
        if opts.get('rescale_perchannel'):
            wqtype = cls.get_weights_qtype_by_channel(filter_shape,
                                                      filter_scale_axis,
                                                      weights_node)
        else:
            wqtype = cls.get_weights_qtype_by_tensor(weights_node)
    else:
        if opts.get('rescale_perchannel'):
            if len(wqtype.scale) != filter_shape[filter_scale_axis]:
                wqtype = cls.get_weights_qtype_by_channel(
                    filter_shape, filter_scale_axis, weights_node)
        else:
            if len(wqtype.scale) > 1:
                wqtype = cls.get_weights_qtype_by_tensor(weights_node)
    iqtype = input_tensor.qtype
    # correct input qtype to symmetric tensor scaled
    if iqtype.asymmetric or not iqtype.signed or len(iqtype.scale) > 1:
        iqtype = QType.from_min_max_sq(min_val=iqtype.min_val,
                                       max_val=iqtype.max_val)
    else:
        iqtype = deepcopy(iqtype)
    oqtype = output_tensor.qtype
    # correct output qtype to symmetric tensor scaled
    if oqtype.asymmetric or not oqtype.signed or len(oqtype.scale) > 1:
        oqtype = QType.from_min_max_sq(min_val=oqtype.min_val,
                                       max_val=oqtype.max_val)
    else:
        oqtype = deepcopy(oqtype)
    # dqbias = bias_node.dqvalue
    bias_scale = (iqtype.scale * wqtype.scale).astype(np.float32)
    bqtype = QType(dtype=np.int32, scale=bias_scale)
    # NOTE: In some tensorflow graphs the biases are hugely negative or hugely
    # positive. I've never seen this without a relu after, and the weights on
    # these channels were 0. Actually they should be pruned.
    # don't overwrite the quantized values since we may move around quantization later
    # bias_node.value = bqtype.quantize(dqbias)
    # bias_node.qtype = bqtype
    if dw_to_pw and wqtype.quantized_dimension:
        wqtype.quantized_dimension = 0
    mulbiases_q = MultMulBiasScaleQType.from_filter(iqtype, wqtype, oqtype, params)
    qrec = QRec.scaled(in_qs=[iqtype, wqtype, bqtype],
                       out_qs=[oqtype],
                       calc_q=bqtype,
                       acc_q=bqtype,
                       mul_biases_q=mulbiases_q)
    # now set the quantization records on the node and its constants
    G.quantization[NodeId(params)] = qrec
    G.quantization[NodeId(weights_node)] = QRec.scaled(out_qs=[deepcopy(wqtype)])
    G.quantization[NodeId(bias_node)] = QRec.scaled(out_qs=[deepcopy(bqtype)])
def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    LOG.info('selecting USQ8 NE16 kernel filter quantizer')
    force_out_qs, _ = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    min_val, max_val = None, None
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.uint8,
        narrow_range=True,
        bit_pack=opts['weight_bits'],
        no_compression=True,
        bits=opts['weight_bits'])
    in_q = in_qs[0]
    in_q = limit_input_precision(params, input_bits, in_q, params.filter.sz,
                                 opts['narrow_weights'], opts['weight_bits'])
    # input dtype is either uint8 or int8
    if in_q.dtype != input_dtype:
        if in_q.forced_dtype:
            return None
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=input_dtype,
                                     asymmetric=False)
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = deepcopy(force_out_q)
        o_q.dont_copy_attr = ['ne16']
        LOG.warning('node %s output forced to range %s/%s - actual range %s/%s',
                    params.name, o_q.min, o_q.max, min_val, max_val)
    else:
        force_output_size = opts.get('force_output_size', 8)
        output_dtype = np.uint8 if force_output_size == 8 else np.uint16
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=output_dtype,
                                    dont_copy_attr=['ne16'],
                                    asymmetric=True)
    o_q.attr.ne16 = True
    biases_q = QType(dtype=np.int32,
                     scale=weights_q.scale * in_q.scale,
                     ne16_biases=(input_bits != 16))
    mul_biases_q = MultMulBiasScaleQType.from_filter(in_q, weights_q, o_q, params)
    # calculate bias offset - this will be added to the bias in the kernel
    # it is already in quantized form
    biases_q.offset = FilterMultNE16Base.calculate_bias_offset(
        params, in_q, weights_node, weights_q, o_q)
    # returning the new weights and biases qs will force backprop
    cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)
    if input_bits == 16:
        prenorm = min(np.min(np.min(mul_biases_q.qnorms)), 8)
    else:
        prenorm = 0
    mul_biases_q.pre_normalization = prenorm
    # o_q.set_forced(flags=['dtype'])
    # in_q.set_forced(flags=['dtype'])
    return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                       out_qs=[o_q],
                       acc_q=biases_q,
                       calc_q=biases_q,
                       mul_biases_q=mul_biases_q,
                       ne16=True)
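# Note (assumption): pre_normalization shifts the 16-bit-mode accumulator right
# by up to 8 bits before the qbias multiply so that step cannot overflow; the
# shift is capped by the smallest per-channel qnorm so the remaining
# post-multiply normalization never goes negative.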
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0)
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    opts = kwargs['opts']
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                max_val=stats['range_out'][0]['max'],
                                dtype=out_dtype)
    names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}
    edges = kwargs['G'].indexed_in_edges(params.name)
    for gate in ['r', 'z', 'h']:
        w_q = in_qs[names[f'w_2_{gate}_w']]
        in_qs[names[f'w_2_{gate}_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8,
            bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            dont_generate_value=True)
        w_q = in_qs[names[f'r_2_{gate}_w']]
        in_qs[names[f'r_2_{gate}_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8,
            bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            concatenated_nodes=[edges[names[f'w_2_{gate}_w']].from_node.name])
    if params.rnn_same_inout_scale:
        wWz_scale = rWz_scale = np.maximum(in_qs[names['w_2_z_w']].scale,
                                           in_qs[names['r_2_z_w']].scale)
        wWr_scale = rWr_scale = np.maximum(in_qs[names['w_2_r_w']].scale,
                                           in_qs[names['r_2_r_w']].scale)
        wWh_scale = rWh_scale = np.maximum(in_qs[names['w_2_h_w']].scale,
                                           in_qs[names['r_2_h_w']].scale)
        i_2_z_WR_q = i_2_r_WR_q = i_2_h_WR_q = None
        in_q = state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
        in_scale = state_scale = in_q.scale
    else:
        wWz_scale = in_qs[names['w_2_z_w']].scale
        wWr_scale = in_qs[names['w_2_r_w']].scale
        wWh_scale = in_qs[names['w_2_h_w']].scale
        rWz_scale = in_qs[names['r_2_z_w']].scale
        rWr_scale = in_qs[names['r_2_r_w']].scale
        rWh_scale = in_qs[names['r_2_h_w']].scale
        in_scale = in_qs[0].scale
        in_q = in_qs[0]
        state_q = QType(bits=8, q=7, signed=True, dtype=np.int8)
        state_scale = state_q.scale
        i_2_z_WR_q = MultMulBiasScaleQType(scale=(wWz_scale * in_scale) /
                                           (rWz_scale * state_scale))
        i_2_r_WR_q = MultMulBiasScaleQType(scale=(wWr_scale * in_scale) /
                                           (rWr_scale * state_scale))
        i_2_h_WR_q = MultMulBiasScaleQType(scale=(wWh_scale * in_scale) /
                                           (rWh_scale * state_scale))
    i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
    h_WR_2_int_q = MultMulBiasScaleQType(scale=(rWh_scale * state_scale) / i_qtype.scale)
    r_WR_2_int_q = MultMulBiasScaleQType(scale=(rWr_scale * state_scale) / i_qtype.scale)
    z_WR_2_int_q = MultMulBiasScaleQType(scale=(rWz_scale * state_scale) / i_qtype.scale)
    if not params.rnn_states_as_inputs:
        in_qs[names['h_state']].scale = state_q.scale
    in_qs[0].scale = in_scale
    o_q.scale = state_scale
    in_qs[names['z_b']].scale = in_scale * rWz_scale
    in_qs[names['z_b']].dtype = BIAS_DTYPE
    in_qs[names['r_b']].scale = in_scale * rWr_scale
    in_qs[names['r_b']].dtype = BIAS_DTYPE
    in_qs[names['w_h_b']].scale = in_scale * wWh_scale
    in_qs[names['w_h_b']].dtype = BIAS_DTYPE
    in_qs[names['r_h_b']].scale = in_scale * rWh_scale
    in_qs[names['r_h_b']].dtype = BIAS_DTYPE
    in_qs[names['w_2_z_w']].scale = wWz_scale
    in_qs[names['w_2_r_w']].scale = wWr_scale
    in_qs[names['w_2_h_w']].scale = wWh_scale
    in_qs[names['r_2_z_w']].scale = rWz_scale
    in_qs[names['r_2_r_w']].scale = rWr_scale
    in_qs[names['r_2_h_w']].scale = rWh_scale
    return QRec.scaled(in_qs=in_qs,
                       out_qs=[o_q],
                       i_2_z_WR_q=i_2_z_WR_q,
                       i_2_r_WR_q=i_2_r_WR_q,
                       i_2_h_WR_q=i_2_h_WR_q,
                       h_WR_2_int_q=h_WR_2_int_q,
                       r_WR_2_int_q=r_WR_2_int_q,
                       z_WR_2_int_q=z_WR_2_int_q,
                       i_qtype=i_qtype,
                       scales={
                           'w_2_z_w': wWz_scale,
                           'w_2_r_w': wWr_scale,
                           'w_2_h_w': wWh_scale,
                           'r_2_z_w': rWz_scale,
                           'r_2_r_w': rWr_scale,
                           'r_2_h_w': rWh_scale,
                           'in': [in_scale],
                           'state': state_scale,
                           'out': [state_scale]
                       })
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    in_qs = deepcopy(in_qs)
    in_qs = cls.force_symmetric_and_dtype(in_qs, dtype=np.int16, idx=0)
    if in_qs is None:
        return None
    opts = kwargs['opts']
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}
    edges = kwargs['G'].indexed_in_edges(params.name)
    for gate in ['r', 'z', 'h']:
        w_q = in_qs[names[f'w_2_{gate}_w']]
        in_qs[names[f'w_2_{gate}_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8,
            bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            dont_generate_value=True)
        w_q = in_qs[names[f'r_2_{gate}_w']]
        in_qs[names[f'r_2_{gate}_w']] = QType.from_min_max_sq(
            w_q.min_val, w_q.max_val,
            dtype=np.int8,
            bits=opts['weight_bits'],
            narrow_range=opts.get('narrow_weights', True),
            concatenated_nodes=[edges[names[f'w_2_{gate}_w']].from_node.name])
    wWz_scale = in_qs[names['w_2_z_w']].scale
    wWr_scale = in_qs[names['w_2_r_w']].scale
    wWh_scale = in_qs[names['w_2_h_w']].scale
    rWz_scale = in_qs[names['r_2_z_w']].scale
    rWr_scale = in_qs[names['r_2_r_w']].scale
    rWh_scale = in_qs[names['r_2_h_w']].scale
    in_scale = in_qs[0].scale
    state_q_bits = 14 if opts.get('narrow_state', False) else 15
    state_q = QType(bits=16, q=state_q_bits, signed=True, dtype=np.int16)
    state_scale = state_q.scale
    i_qtype = QType(bits=32, q=12, signed=True, dtype=np.int32)
    int_scale = i_qtype.scale
    act_qtype = QType(bits=32, q=15, signed=True, dtype=np.int32)
    input_z_w_internal = MultMulBiasScaleQType(scale=(wWz_scale * in_scale) / int_scale)
    input_r_w_internal = MultMulBiasScaleQType(scale=(wWr_scale * in_scale) / int_scale)
    input_h_w_internal = MultMulBiasScaleQType(scale=(wWh_scale * in_scale) / int_scale)
    state_h_w_internal = MultMulBiasScaleQType(scale=(rWh_scale * state_scale) / int_scale)
    state_r_w_internal = MultMulBiasScaleQType(scale=(rWr_scale * state_scale) / int_scale)
    state_z_w_internal = MultMulBiasScaleQType(scale=(rWz_scale * state_scale) / int_scale)
    in_qs[names['h_state']] = state_q
    o_q = state_q
    in_qs[names['z_b']].scale = int_scale
    in_qs[names['z_b']].dtype = BIAS_DTYPE
    in_qs[names['r_b']].scale = int_scale
    in_qs[names['r_b']].dtype = BIAS_DTYPE
    in_qs[names['w_h_b']].scale = in_scale * wWh_scale
    in_qs[names['w_h_b']].dtype = BIAS_DTYPE
    in_qs[names['r_h_b']].scale = state_scale * rWh_scale
    in_qs[names['r_h_b']].dtype = BIAS_DTYPE
    return QRec.scaled(in_qs=in_qs,
                       out_qs=[o_q],
                       input_z_w_internal=input_z_w_internal,
                       input_r_w_internal=input_r_w_internal,
                       input_h_w_internal=input_h_w_internal,
                       state_h_w_internal=state_h_w_internal,
                       state_r_w_internal=state_r_w_internal,
                       state_z_w_internal=state_z_w_internal,
                       i_qtype=i_qtype,
                       act_qtype=act_qtype,
                       scales={
                           'w_2_z_w': wWz_scale,
                           'w_2_r_w': wWr_scale,
                           'w_2_h_w': wWh_scale,
                           'r_2_z_w': rWz_scale,
                           'r_2_r_w': rWr_scale,
                           'r_2_h_w': rWh_scale,
                           'in': [in_scale],
                           'state': state_scale,
                           'out': [state_scale],
                           'act': math.pow(2, -15)
                       })
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0, dtype=np.int16)
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    G = kwargs['G']
    opts = kwargs.get('opts', {})
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    names = {val: idx for idx, val in enumerate(LSTMParameters.INPUT_NAMES)}
    o_q = in_qs[names['i_state']] = QType.from_min_max_sq(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=np.int16)
    if force_out_q:
        if force_out_q.zero_point != 0:
            return None
        LOG.warning('on node %s output is being forced from scale %s -> %s',
                    params.name, o_q.scale, force_out_qs[0].scale)
        o_q = force_out_qs[0]
    cell_range = stats.get('range_cell')
    if cell_range is None:
        raise ValueError(f'cell range not present in stats for {params.name}')
    # cell range is minimum 1.0
    cell_stat = max(1.0, *[abs(cell_range[var]) for var in ['min', 'max']])
    if params.cell_clip and not params.quant_c_state_with_stat:
        cell_max = params.cell_clip
        ratio_c = cell_max / cell_stat
        if not (ratio_c > 0.9 and ratio_c < 1.1):
            msg = (f"C state is forced to a range [-{cell_max}:{cell_max}] different to the one calculated "
                   f"from the inference statistic [-{cell_stat}:{cell_stat}], consider using nodeoption {params.name} "
                   "QUANT_C_STATE_WITH_STAT 1 to force it to be the one calculated")
            LOG.warning('%s', msg)
    else:
        cell_max = cell_stat
    # this limit is driven by the c_in * f + c * i calculation
    # c * i will be in Q24 and we want c_in * f to be scaled to the same
    # abs(f) will be <=1 so the cell int bits cannot exceed 31 - 1 (overflow) - 24 = 6
    cell_limit = pow(2, 6)
    if cell_max > cell_limit:
        LOG.warning('Cell state exceeds %s and will be clipped', cell_limit)
        cell_max = cell_limit
    cell_int_bits = calc_bits(cell_max)
    in_qs[names['c_state']] = QType.from_min_max_sq(-cell_max, cell_max,
                                                    dtype=np.int16)
    LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
              in_qs[names['c_state']].range)
    # set weight qtypes
    edges = kwargs['G'].indexed_in_edges(params.name)
    scale_pairs = {chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
                   for chan in ['i', 'o', 'c', 'f']}
    for scale_pair in scale_pairs.values():
        in_q = in_qs[names[scale_pair[0]]]
        in_qs[names[scale_pair[0]]] = QType.from_min_max_sq(
            in_q.min_val, in_q.max_val,
            dtype=np.int8,
            narrow_range=opts.get('narrow_weights'),
            dont_generate_value=True)
        in_qs[names[scale_pair[0]]].bits = opts['weight_bits']
        in_q = in_qs[names[scale_pair[1]]]
        in_qs[names[scale_pair[1]]] = QType.from_min_max_sq(
            in_q.min_val, in_q.max_val,
            dtype=np.int8,
            narrow_range=opts.get('narrow_weights'),
            concatenated_nodes=[edges[names[scale_pair[0]]].from_node.name])
        in_qs[names[scale_pair[1]]].bits = opts['weight_bits']
    # get weight scales
    w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                for k, (namei, namer) in scale_pairs.items()]
    gate_sum_max = [(get_max(stats[f'range_{gate}_gate_i']),
                     get_max(stats[f'range_{gate}_gate_r']))
                    for gate in ['i', 'o', 'c', 'f']]
    gate_sum_max_bits = [(np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
                          np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
                         for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)]
    for gate, (max_i, max_r) in zip(['i', 'o', 'c', 'f'], gate_sum_max_bits):
        if max_i > 30:
            LOG.warning(
                'max bits in accumulation input %s gate %s - there may be errors',
                max_i, gate)
        if max_r > 30:
            LOG.warning(
                'max bits in accumulation state %s gate %s - there may be errors',
                max_r, gate)
    # LUT activations Q12 -> Q15
    act_in_q = 12
    act_out_q = 15
    int_scale = math.pow(2, -act_in_q)
    out_tanh_sig_scale = math.pow(2, -act_out_q)
    scale_qtypes = {}
    r_pscales = {}
    i_pscales = {}
    scale_qtypes['r_pscales'] = r_pscales
    scale_qtypes['i_pscales'] = i_pscales
    for gate, w_scale, max_bits in zip(['i', 'o', 'c', 'f'], w_scales,
                                       gate_sum_max_bits):
        weight_scale_ratio = w_scale[0] / w_scale[1]
        # TODO - decide to scale weights equal
        in_qs[names[f"{gate}_b"]] = QType(scale=int_scale, dtype=np.int32)
        i_pscales[gate] = w_scale[0] * in_qs[0].scale
        scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
            scale=i_pscales[gate] / int_scale)
        qscale.pre_normalization = int(max(8 - (31 - max_bits[0]), 0))
        r_pscales[gate] = w_scale[1] * o_q.scale
        scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
            scale=r_pscales[gate] / int_scale)
        qscale.pre_normalization = int(max(8 - (31 - max_bits[1]), 0))
    r_pscales['state_out_scale'] = o_q.scale
    r_pscales['int_scale'] = int_scale
    # ct = c_in * f + c * i
    # c * i = Q15 * Q15 -> Q30 -> norm(18) -> Q12
    # scale(c_in * f) = Q15 * Q15 prenorm 8 and scale -> Q12
    # ((c_in * f) + (c * i)) in Q12
    # scale -> cell_out
    # tan(ct) -> Q15
    # o * tan(ct) -> Q30
    # prenorm and scale
    # cell in to Q12
    cell_in_scale = (in_qs[names['c_state']].scale * out_tanh_sig_scale / int_scale)
    # cell_out from Q12
    cell_out_scale = int_scale / in_qs[names['c_state']].scale
    # state out from Q30
    state_out_scale = math.pow(2, -(2 * act_out_q)) / o_q.scale
    r_pscales['act_out_scale'] = out_tanh_sig_scale
    r_pscales['c_before_scale'] = int_scale
    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # for 16 bit pre-normalize the scales to give us room
    scale_qtypes['cell_in_q'].pre_normalization = 8
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(scale=state_out_scale)
    scale_qtypes['state_out_q'].pre_normalization = 8
    scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)
    if params.lstm_output_c_state:
        out_qs = [o_q, in_qs[names['c_state']]]
    else:
        out_qs = [o_q]
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=out_qs,
        **scale_qtypes,
    )
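# Worked sketch (assumption): following the comment block above, c_in * f is a
# product of the cell state and a Q15 sigmoid, so it sits at scale Sc * 2**-15;
# cell_in_q (prenorm 8) rescales it to Q12, where it can be added to c * i,
# which arrives at Q12 via the fixed norm of 18 from its Q30 product.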
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0)
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    opts = kwargs['opts']
    # qrecs = kwargs['qrecs']
    G = kwargs['G']
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                max_val=stats['range_out'][0]['max'],
                                dtype=out_dtype)
    # input_nodes = {RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
    #                for edge in G.in_edges(params.name)
    #                if isinstance(edge.from_node, ConstantInputParameters)}
    names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
    # quantization_mode: extended, autotiler
    # state_width: 16bit or 8bit
    if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
        LOG.info(
            "node %s has similar input and i_state scales --> "
            "will be generated the same_scale kernel with better performance",
            params.name)
        params.rnn_same_inout_scale = True
        G.node_options[NodeId(params)] = params.at_options
    for weight_name in ['i_2_i_w', 'r_2_i_w']:
        in_qs[names[weight_name]] = deepcopy(in_qs[names[weight_name]])
        in_qs[names[weight_name]].dtype = np.int8
        in_qs[names[weight_name]].bits = opts['weight_bits']
    w_scales = np.maximum(in_qs[names['i_2_i_w']].scale,
                          in_qs[names['r_2_i_w']].scale)
    if params.rnn_same_inout_scale:
        in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
        in_qs[0].scale = in_and_state_scale
        o_q.scale = in_and_state_scale
        if not params.rnn_states_as_inputs:
            in_qs[names['i_state']].scale = in_and_state_scale
            # cls.rescale_constant(input_nodes['i_state'], in_and_state_scale, qrecs)
        i_state_scale = in_and_state_scale
        i_2_a_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
    else:
        i_state_scale = in_qs[names['i_state']].scale
        i_2_a_q = MultMulBiasScaleQType(scale=in_qs[0].scale / i_state_scale)
    in_qs[names['i_2_i_w']].scale = w_scales
    # cls.rescale_constant(input_nodes['i_2_i_w'], w_scales, qrecs)
    in_qs[names['r_2_i_w']].scale = w_scales
    # cls.rescale_constant(input_nodes['r_2_i_w'], w_scales, qrecs)
    state_w_scale = i_state_scale * w_scales
    in_qs[names['i_b']].scale = state_w_scale
    in_qs[names['i_b']].dtype = np.int32
    # cls.rescale_constant(input_nodes['i_b'], state_w_scale, qrecs, dtype=np.int32)
    if params.hard_act:
        s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / i_state_scale)
        s_2_o_q = MultMulBiasScaleQType(scale=1.0)  # will be ignored
    else:
        act_input_scale = math.pow(2, -12)
        act_output_scale = math.pow(2, -15)
        s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale / act_input_scale)
        s_2_o_q = MultMulBiasScaleQType(scale=act_output_scale / o_q.scale)
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=[o_q],
        s_2_s_q=s_2_s_q,
        i_2_a_q=i_2_a_q,
        s_2_o_q=s_2_o_q,
    )
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    in_qs = deepcopy(in_qs)
    in_qs = cls.force_symmetric_and_dtype(in_qs, dtype=np.int16, idx=0)
    if in_qs is None:
        return None
    opts = kwargs['opts']
    # qrecs = kwargs['qrecs']
    G = kwargs['G']
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    o_q = QType(q=15, dtype=np.int16)
    names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
    in_qs[names['i_state']] = o_q
    if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
        LOG.info(
            "node %s has similar input and i_state scales --> "
            "will be generated the same_scale kernel with better performance",
            params.name)
        params.rnn_same_inout_scale = True
        G.node_options[NodeId(params)] = params.at_options
    edges = G.indexed_in_edges(params.name)
    w_q = in_qs[names['i_2_i_w']]
    in_qs[names['i_2_i_w']] = QType.from_min_max_sq(
        w_q.min_val, w_q.max_val,
        dtype=np.int8,
        bits=opts['weight_bits'],
        narrow_range=opts.get('narrow_weights', True),
        dont_generate_value=True)
    w_q = in_qs[names['r_2_i_w']]
    in_qs[names['r_2_i_w']] = QType.from_min_max_sq(
        w_q.min_val, w_q.max_val,
        dtype=np.int8,
        bits=opts['weight_bits'],
        narrow_range=opts.get('narrow_weights', True),
        concatenated_nodes=[edges[names['i_2_i_w']].from_node.name])
    act_input_scale = math.pow(2, -12)
    i_2_a_q = MultMulBiasScaleQType(
        scale=in_qs[0].scale * in_qs[names['i_2_i_w']].scale / act_input_scale)
    in_qs[names['i_b']].scale = o_q.scale * in_qs[names['r_2_i_w']].scale
    in_qs[names['i_b']].dtype = np.int32
    # cls.rescale_constant(input_nodes['i_b'], state_w_scale, qrecs, dtype=np.int32)
    act_output_scale = math.pow(2, -15)
    s_2_s_q = MultMulBiasScaleQType(
        scale=o_q.scale * in_qs[names['r_2_i_w']].scale / act_input_scale)
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=[o_q],
        s_2_s_q=s_2_s_q,
        i_2_a_q=i_2_a_q,
        scales={
            'int_scale': act_output_scale,
            'out_scale': o_q.scale
        }
    )
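# Sketch (assumption): in this 16-bit variant the LUT activations consume Q12
# and produce Q15, so s_2_s_q rescales the state-weight accumulator
# (scale So*Sw) to 2**-12 before the LUT, and because the state/output o_q is
# itself Q15 the activation result needs no further output rescale.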
def _quantize_gru(cls, params, in_qs, stats, input_bits, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    opts = kwargs.get('opts', {})
    if input_bits == 16:
        in_out_dtype = np.uint16
    else:
        in_out_dtype = np.uint8
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    G = kwargs['G']
    in_q = in_qs[0]
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    in_edges = G.indexed_in_edges(params.name)
    names = {val: idx for idx, val in enumerate(GRUParameters.INPUT_NAMES)}
    # output/state is always Q15 or Q7 symmetric
    o_q = in_qs[names['h_state']] = QType.from_min_max_sq(
        min_val=-1,
        max_val=1,
        dtype=in_out_dtype,
        narrow_range=opts['narrow_state'])
    # set weight qtypes
    int_num_inp = roundup(params.n_inputs, input_bits == 16)
    int_num_states = roundup(params.n_states, input_bits == 16)
    woffs = {}
    in_q = limit_input_precision(params, input_bits, in_q, int_num_inp,
                                 opts['narrow_weights'], opts['weight_bits'])
    # o_q = limit_input_precision(
    #     params,
    #     input_bits,
    #     o_q,
    #     int_num_states,
    #     opts['narrow_weights'],
    #     opts['weight_bits'],
    #     extra_correction=-1 if opts.get('narrow_state') else 0)
    for gate in ['z', 'r', 'h']:
        i_idx = names[f'w_2_{gate}_w']
        r_idx = names[f'r_2_{gate}_w']
        woffs[gate] = woff_gate = [None, None]
        woff_gate[0] = calculatate_weight_q(
            in_qs, in_edges, i_idx, in_q.zero_point[0],
            (params.n_states, params.n_inputs),
            (params.n_states, int_num_inp),
            opts['weight_bits'], opts.get('narrow_weights'))
        woff_gate[1] = calculatate_weight_q(
            in_qs, in_edges, r_idx, o_q.zero_point[0],
            (params.n_states, params.n_states),
            (params.n_states, int_num_states),
            opts['weight_bits'], opts.get('narrow_weights'))
    # get weight scales
    scale_pairs = {chan: ('w_2_%s_w' % chan, 'r_2_%s_w' % chan)
                   for chan in ['z', 'r', 'h']}
    w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                for k, (namei, namer) in scale_pairs.items()]
    gate_sum_max = [(get_max_or_one(stats[f'range_{gate}_gate_inp']),
                     get_max_or_one(stats[f'range_{gate}_gate_state']))
                    for gate in ['z', 'r', 'h']]
    gate_sum_max_bits = [
        (np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
         np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
        for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)
    ]
    for gate, (max_i, max_r) in zip(['z', 'r', 'h'], gate_sum_max_bits):
        if np.max(max_i) > 30:
            LOG.warning(
                'max bits in accumulation input %s gate %s - there may be errors',
                max_i, gate)
        if np.max(max_r) > 30:
            LOG.warning(
                'max bits in accumulation state %s gate %s - there may be errors',
                max_r, gate)
    # LUT activations Q12 -> Q15
    act_in_q = 12
    act_out_q = 15
    int_scale = math.pow(2, -act_in_q)
    out_tanh_sig_scale = math.pow(2, -act_out_q)
    scale_qtypes = {}
    r_pscales = {}
    i_pscales = {}
    scale_qtypes['r_pscales'] = r_pscales
    scale_qtypes['i_pscales'] = i_pscales
    for gate, w_scale, max_bits in zip(['z', 'r', 'h'], w_scales, gate_sum_max_bits):
        # TODO - decide to scale weights equal
        weight_scale_ratio = w_scale[0] / w_scale[1]
        i_pscales[gate] = w_scale[0] * in_q.scale
        r_pscales[gate] = w_scale[1] * o_q.scale
        # h gate input is added manually to state in Q12
        if input_bits == 16 or gate == 'h':
            scale_qtypes[f"w_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=i_pscales[gate] / int_scale)
        else:
            scale_qtypes[f"w_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=i_pscales[gate] / r_pscales[gate])
        if input_bits == 16:
            i_zp_b = woffs[gate][0]
            if gate == "h":
                in_qs[names['w_h_b']] = QType(
                    dtype=np.int32,
                    scale=i_pscales[gate],
                    offset=i_zp_b)
        else:
            # fold the zero-point correction and the rounding constant into the bias
            i_zp_b = woffs[gate][0] * qscale.qbiases.astype(np.int32) + (
                1 << (qscale.qnorms.astype(np.int32) - 1))
            if gate == "h":
                in_qs[names['w_h_b']] = QType(
                    dtype=np.int32,
                    scale=i_pscales[gate] / qscale.qbiases,
                    offset=i_zp_b)
        scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
            scale=r_pscales[gate] / int_scale)
        if gate == 'h':
            bias_name = 'r_h_b'
            interleaved_values = None
        else:
            bias_name = f'{gate}_b'
            interleaved_values = [i_zp_b]
        if input_bits == 16:
            r_zp_b = woffs[gate][1]
            in_qs[names[bias_name]] = QType(
                dtype=np.int32,
                scale=r_pscales[gate],
                offset=r_zp_b,
                interleaved_values=interleaved_values)
        else:
            r_zp_b = woffs[gate][1] * qscale.qbiases.astype(np.int32) + (
                1 << (qscale.qnorms.astype(np.int32) - 1))
            in_qs[names[bias_name]] = QType(
                dtype=np.int32,
                scale=r_pscales[gate] / qscale.qbiases,
                offset=r_zp_b,
                interleaved_values=interleaved_values)
    # NOTE - for 16 bit pre-normalize the scales to give us room but make sure it isn't negative
    if input_bits == 16:
        gate_prenorm = min(
            np.min([
                np.min(scale_qtypes[f"{inp}_2_{gate}_q"].qnorms)
                for gate in ['z', 'r', 'h'] for inp in ['w', 'r']
            ]), 8)
        for gate in ['z', 'r', 'h']:
            for inp in ['w', 'r']:
                scale_qtypes[f"{inp}_2_{gate}_q"].pre_normalization = gate_prenorm
    else:
        gate_prenorm = 0
    scales = {
        'i': i_pscales,
        'r': r_pscales,
        'state': o_q.scale,
        'in': in_q.scale,
        'act_in': int_scale,
        'act_out': out_tanh_sig_scale,
        'act_in_q': act_in_q,
        'act_out_q': act_out_q
    }
    scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=[o_q],
        ne16=True,
        gate_prenorm=gate_prenorm,
        scales=scales,
        **scale_qtypes,
    )
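# --- Illustrative sketch (not part of the original source): why the 8-bit
# path above computes `woff * qbiases + (1 << (qnorms - 1))`. Folding the
# weight zero-point correction and the rounding constant into the int32 bias
# lets the kernel rescale with one multiply, one add and one shift. All values
# below are hypothetical toys.
import numpy as np

qbias, qnorm = np.int64(201), 9                # multiplier/shift of the gate rescale
woff = np.int64(-1536)                         # precomputed zero-point correction
zp_b = woff * qbias + (1 << (qnorm - 1))       # folded bias, as in in_qs[...] above

acc = np.int64(40000)                          # raw int32 gate accumulator
out = (acc * qbias + zp_b) >> qnorm            # what the kernel computes
ref = ((acc + woff) * qbias + (1 << (qnorm - 1))) >> qnorm  # correct-then-rescale
assert out == ref                              # algebraically identical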
def quantize_ne16(cls, params, in_qs, stats, **kwargs):
    opts = kwargs['opts']
    force_out_qs, _ = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    fusion = kwargs.get('fusion', None)
    G = kwargs['G']
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    min_val, max_val = None, None
    # note that weights are stored unsigned: the zero point of the weights is
    # handled by NE16 and needs to be removed during code gen
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.uint8,
        ne16_order=True,
        narrow_range=True,
        bits=opts['weight_bits'])
    in_q = in_qs[0]
    # check input quantization and scale asymmetric uint8
    if in_q.dtype != np.uint8:
        # I ignore a force here which is not very clean
        # if in_q.forced_dtype:
        #     return None
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=np.uint8,
                                     asymmetric=True)
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        # can't be forced to something not np.uint8
        if o_q.dtype != np.uint8:
            return None
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s',
            params.name, o_q.min, o_q.max, min_val, max_val)
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=np.uint8,
                                    asymmetric=True)
    biases_q = QType(dtype=np.int32,
                     scale=weights_q.scale * in_q.scale,
                     ne16_biases=True)
    mul_biases_q = MultMulBiasScaleQType.from_filter(in_q, weights_q, o_q, params)
    # calculate bias offset - this will be added to the bias in the kernel
    # it is already in quantized form
    biases_q.offset = FilterMult.calculate_bias_offset(
        params, in_q, weights_node, weights_q, o_q)
    cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)
    # o_q.set_forced(flags=['dtype'])
    # in_q.set_forced(flags=['dtype'])
    # returning the new weights and biases qs will force backprop
    return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                       out_qs=[o_q],
                       acc_q=biases_q,
                       calc_q=biases_q,
                       mul_biases_q=mul_biases_q,
                       ne16=True)
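# --- Illustrative sketch (not part of the original source): the identity
# behind folding an input zero point into the bias. Since
#   sum_j w[o][j] * (x[j] - zx) = sum_j w[o][j] * x[j] - zx * sum_j w[o][j],
# adding -zx * sum(w) per output channel lets the kernel skip the subtraction.
# Toy shapes only; this is not FilterMult.calculate_bias_offset itself.
import numpy as np

rng = np.random.default_rng(0)
w_q = rng.integers(0, 256, size=(4, 8)).astype(np.int64)  # uint8 weights, one row per out channel
x_q = rng.integers(0, 256, size=8).astype(np.int64)       # uint8 input
zx = 128                                                  # input zero point

bias_offset = -zx * w_q.sum(axis=1)        # one int32 correction per output channel
assert np.array_equal(w_q @ (x_q - zx), w_q @ x_q + bias_offset)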
def _quantize_lstm(cls, params, in_qs, stats, input_bits, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    if force_out_qs and any(force_out_q is not None for force_out_q in force_out_qs):
        return None
    opts = kwargs.get('opts', {})
    if input_bits == 16:
        in_out_dtype = np.uint16
    else:
        in_out_dtype = np.uint8
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    G = kwargs['G']
    in_q = in_qs[0]
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    in_edges = G.indexed_in_edges(params.name)
    names = {val: idx for idx, val in enumerate(LSTMParameters.INPUT_NAMES)}
    o_q = in_qs[names['i_state']] = QType.from_min_max_sq(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=in_out_dtype,
        narrow_range=opts['narrow_state'])
    cell_range = stats.get('range_cell')
    if cell_range is None:
        raise ValueError(f'cell range not present in stats for {params.name}')
    # cell range is at minimum 1.0
    cell_stat = max(1.0, *[abs(cell_range[var]) for var in ['min', 'max']])
    if params.cell_clip and not params.quant_c_state_with_stat:
        cell_max = params.cell_clip
        ratio_c = cell_max / cell_stat
        if not (ratio_c > 0.9 and ratio_c < 1.1):
            msg = (
                f"C state is forced to a range [-{cell_max}:{cell_max}] different from the one "
                f"calculated from the inference statistics [-{cell_stat}:{cell_stat}], consider "
                f"using nodeoption {params.name} QUANT_C_STATE_WITH_STAT 1 to force it to be "
                "the one calculated")
            LOG.warning('%s', msg)
    else:
        cell_max = cell_stat
    # this limit is driven by the c_in * f + c * i calculation
    # c * i will be in Q24 and we want c_in * f to be scaled to the same
    # abs(f) will be <=1 so the cell int bits cannot exceed 31 - 1 (overflow) - 24 = 6
    cell_limit = pow(2, 6)
    if cell_max > cell_limit:
        LOG.warning('Cell state exceeds %s and will be clipped', cell_limit)
        cell_max = cell_limit
    cell_int_bits = calc_bits(cell_max)
    # cell stays signed since it is used in a Hadamard product with the int32
    # streamout in NE16
    in_qs[names['c_state']] = QType.from_min_max_sq(
        -cell_max, cell_max,
        dtype=np.int16 if input_bits == 16 else np.int8)
    LOG.debug("cell bits %d max %d cell range %d",
              cell_int_bits, cell_max, in_qs[names['c_state']].range)
    # set weight qtypes
    int_num_inp = roundup(params.n_inputs, input_bits == 16)
    int_num_states = roundup(params.n_states, input_bits == 16)
    woffs = {}
    in_q = limit_input_precision(params, input_bits, in_q, int_num_inp,
                                 opts['narrow_weights'], opts['weight_bits'])
    o_q = limit_input_precision(
        params,
        input_bits,
        o_q,
        int_num_states,
        opts['narrow_weights'],
        opts['weight_bits'],
        extra_correction=-1 if opts.get('narrow_state') else 0)
    for gate in ['i', 'o', 'c', 'f']:
        i_idx = names[f'i_2_{gate}_w']
        r_idx = names[f'r_2_{gate}_w']
        woffs[gate] = woff_gate = [None, None]
        woff_gate[0] = calculatate_weight_q(
            in_qs, in_edges, i_idx, in_q.zero_point[0],
            (params.n_states, params.n_inputs),
            (params.n_states, int_num_inp),
            opts['weight_bits'], opts.get('narrow_weights'))
        woff_gate[1] = calculatate_weight_q(
            in_qs, in_edges, r_idx, o_q.zero_point[0],
            (params.n_states, params.n_states),
            (params.n_states, int_num_states),
            opts['weight_bits'], opts.get('narrow_weights'))
    # get weight scales
    scale_pairs = {chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
                   for chan in ['i', 'o', 'c', 'f']}
    w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                for k, (namei, namer) in scale_pairs.items()]
    gate_sum_max = [(get_max_or_one(stats[f'range_{gate}_gate_i']),
                     get_max_or_one(stats[f'range_{gate}_gate_r']))
                    for gate in ['i', 'o', 'c', 'f']]
    gate_sum_max_bits = [
        (np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
         np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
        for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)
    ]
    for gate, (max_i, max_r) in zip(['i', 'o', 'c', 'f'], gate_sum_max_bits):
        if np.max(max_i) > 30:
            LOG.warning(
                'max bits in accumulation input %s gate %s - there may be errors',
                max_i, gate)
        if np.max(max_r) > 30:
            LOG.warning(
                'max bits in accumulation state %s gate %s - there may be errors',
                max_r, gate)
    # LUT activations Q12 -> Q15
    act_in_q = 12
    act_out_q = 15
    int_scale = math.pow(2, -act_in_q)
    out_tanh_sig_scale = math.pow(2, -act_out_q)
    scale_qtypes = {}
    r_pscales = {}
    i_pscales = {}
    scale_qtypes['r_pscales'] = r_pscales
    scale_qtypes['i_pscales'] = i_pscales
    for gate, w_scale, max_bits in zip(['i', 'o', 'c', 'f'], w_scales, gate_sum_max_bits):
        # TODO - decide to scale weights equal
        weight_scale_ratio = w_scale[0] / w_scale[1]
        i_pscales[gate] = w_scale[0] * in_q.scale
        r_pscales[gate] = w_scale[1] * o_q.scale
        if input_bits == 16:
            scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=i_pscales[gate] / int_scale)
        else:
            scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=i_pscales[gate] / r_pscales[gate])
        if input_bits == 16:
            i_zp_b = woffs[gate][0]
        else:
            # fold the zero-point correction and the rounding constant into the bias
            i_zp_b = woffs[gate][0] * qscale.qbiases.astype(np.int32) + (
                1 << (qscale.qnorms.astype(np.int32) - 1))
        scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
            scale=r_pscales[gate] / int_scale)
        if input_bits == 16:
            r_zp_b = woffs[gate][1]
            in_qs[names[f'{gate}_b']] = QType(dtype=np.int32,
                                              scale=r_pscales[gate],
                                              offset=r_zp_b,
                                              interleaved_values=[i_zp_b])
        else:
            r_zp_b = woffs[gate][1] * qscale.qbiases.astype(np.int32) + (
                1 << (qscale.qnorms.astype(np.int32) - 1))
            in_qs[names[f'{gate}_b']] = QType(dtype=np.int32,
                                              scale=r_pscales[gate] / qscale.qbiases,
                                              offset=r_zp_b,
                                              interleaved_values=[i_zp_b])
    # NOTE - for 16 bit pre-normalize the scales to give us room but make sure it isn't negative
    if input_bits == 16:
        gate_prenorm = min(
            np.min([
                np.min(scale_qtypes[f"{inp}_2_{gate}_q"].qnorms)
                for gate in ['i', 'o', 'c', 'f'] for inp in ['i', 'r']
            ]), 8)
        for gate in ['i', 'o', 'c', 'f']:
            for inp in ['i', 'r']:
                scale_qtypes[f"{inp}_2_{gate}_q"].pre_normalization = gate_prenorm
    else:
        gate_prenorm = 0
    r_pscales['state_out_scale'] = o_q.scale
    r_pscales['int_scale'] = int_scale
    # ct = c_in * f + c * i
    # c * i = Q15 * Q15 -> Q30 -> norm(18) -> Q12
    # scale(c_in * f) = Qcell * Q15 (prenorm if 16bit) and scale -> Q12
    # ((c_in * f) + (c * i)) in Q12
    # scale -> cell_out
    # tanh(ct) -> Q15
    # o * tanh(ct) -> Q30
    # prenorm and scale
    # scale result of c_state_1 * f_gate -> Q15
    cell_in_scale = (in_qs[names['c_state']].scale * out_tanh_sig_scale /
                     out_tanh_sig_scale)
    # cell_out from Q15 -> Q7/Q15 scaled
    cell_out_scale = out_tanh_sig_scale / in_qs[names['c_state']].scale
    state_out_scale = out_tanh_sig_scale / o_q.scale
    r_pscales['act_out_scale'] = out_tanh_sig_scale
    r_pscales['c_before_scale'] = int_scale
    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # NOTE - for 16 bit pre-normalize the scales to give us room
    if input_bits == 16:
        scale_qtypes['cell_in_q'].pre_normalization = 8
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(scale=state_out_scale)
    scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)
    if params.lstm_output_c_state:
        out_qs = [o_q, in_qs[names['c_state']]]
    else:
        out_qs = [o_q]
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=out_qs,
        ne16=True,
        gate_prenorm=gate_prenorm,
        **scale_qtypes,
    )
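# --- Illustrative sketch (not part of the original source): the fixed-point
# cell update that the comments above describe, with hypothetical Q formats.
# Both products are brought to Q12 before the add, matching
# `ct = c_in * f + c * i` with Q15 sigmoid/tanh gate outputs.
def to_q(x, qbits):
    return int(round(x * (1 << qbits)))

c_in = to_q(1.5, 8)        # cell state in Q8 (leaves room for integer bits)
f = to_q(0.9, 15)          # forget gate, Q15 sigmoid output, |f| <= 1
i = to_q(0.6, 15)          # input gate, Q15
c = to_q(-0.3, 15)         # candidate gate, Q15 tanh output

ci_q12 = (i * c) >> 18     # Q15 * Q15 -> Q30, norm by 18 -> Q12
cf_q12 = (c_in * f) >> 11  # Q8 * Q15 -> Q23, norm by 11 -> Q12 (the cell_in_q role)
ct_q12 = cf_q12 + ci_q12
print(ct_q12 / (1 << 12), 1.5 * 0.9 + 0.6 * -0.3)   # fixed-point vs float result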