def set_add_in_scale(qrec):
    """Pre-computes the scaling constants for a quantized elementwise add.

    The input with the larger scale is rescaled onto the other input's scale
    (scale_in_mul_biases_q), the sum is then rescaled to the output scale
    (scale_mul_biases_q via compute_in_out_scale) and, for asymmetric inputs,
    the zero-point corrections are folded into a single add_bias_offset.
    """
    scaled_idx = qrec.cache.get('scaled_idx')
    if scaled_idx is None:
        scaled_idx = (1 if qrec.in_qs[1].scale > qrec.in_qs[0].scale else 0)
        qrec.cache['scaled_idx'] = scaled_idx
    compute_in_out_scale(qrec, in_idx=0 if scaled_idx else 1)
    scale_in_mul_biases_q = qrec.cache.get('scale_in_mul_biases_q')
    if scale_in_mul_biases_q is None:
        scale_in_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_in_mul_biases_q'] = scale_in_mul_biases_q
    not_scaled_idx = 0 if scaled_idx else 1
    scale = qrec.in_qs[scaled_idx].scale / qrec.in_qs[not_scaled_idx].scale
    scale_in_mul_biases_q.scale = scale

    if qrec.in_qs[0].asymmetric:
        # (C - Zc)*Sc = (A - Za)*Sa + (B - Zb)*Sb
        # C = Sa/Sc*(A + B*Sb/Sa - Za - Zb*Sb/Sa) + Zc
        #   = Sa/Sc*(A + B*Sb/Sa) + (Zc - Sa/Sc*(Za + Zb*Sb/Sa))
        #                            |---------- bias ----------|
        add_bias = qrec.out_qs[0].zero_point - qrec.cache['scale_mul_biases_q'].scale * (
            qrec.in_qs[not_scaled_idx].zero_point +
            scale_in_mul_biases_q.scale * qrec.in_qs[scaled_idx].zero_point)
    else:
        add_bias = 0

    qrec.cache['add_bias_offset'] = np.round(add_bias).astype(np.int16)
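
# A minimal numpy sketch (an assumption for illustration, not the nntool kernel) of how
# an integer add kernel would consume the values cached by set_add_in_scale. The float
# multiplies stand in for the fixed-point scaling that MultMulBiasScaleQType performs,
# clipping to the output dtype is omitted, and add_kernel_sketch/a_q/b_q are
# hypothetical names.
import numpy as np

def add_kernel_sketch(a_q, b_q, qrec):
    scaled_idx = qrec.cache['scaled_idx']
    inputs = [a_q, b_q]
    scaled, other = inputs[scaled_idx], inputs[1 - scaled_idx]
    # bring the larger-scale input onto the other input's scale
    acc = other.astype(np.int32) + np.round(
        scaled.astype(np.int32) * qrec.cache['scale_in_mul_biases_q'].scale).astype(np.int32)
    # rescale the sum to the output scale and add the folded zero-point correction
    out = np.round(acc * qrec.cache['scale_mul_biases_q'].scale).astype(np.int32)
    return out + qrec.cache['add_bias_offset']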
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Symmetric int8 quantizer for MatMul: per-channel int8 weights when the
    second input is constant, int32 biases at Sa*Sw and a channel scale of
    Sa*Sw/So to requantize the accumulator to the output."""
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']

    # only attempt channel scaling if the second input is constant
    # if len(in_qs) > 2:
    in2_node, in_qs = cls.move_constant(G, fusion if fusion else params, in_qs)
    if in2_node:
        kwargs['graph_update']['requires_adjust'] = True
        in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                    quantized_dimension=0,
                                    dtype=np.int8,
                                    narrow_range=True,
                                    bits=8)
    else:
        in_q2 = in_qs[1].make_symmetric_signed()

    in_q1 = in_qs[0].make_symmetric_signed()
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)

    if force_out_q:
        o_q = force_out_q
        # the output can only be forced to symmetric np.int8
        if o_q.dtype != np.int8 or o_q.asymmetric:
            return None
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.asymmetric else "symmetric")
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=out_dtype)

    if len(in_qs) == 3:
        biases_q = QType(dtype=np.int32, scale=in_q1.scale * in_q2.scale)
        out_in_qs = [in_q1, in_q2, biases_q]
    else:
        out_in_qs = [in_q1, in_q2]

    mul_biases_q = MultMulBiasScaleQType()
    mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale

    return QRec.scaled(in_qs=out_in_qs,
                       out_qs=[o_q],
                       mul_biases_q=mul_biases_q)
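
# A self-contained numpy illustration (an assumption for clarity, not nntool code) of the
# scale relationship _quantize sets up: activations at scale sa, per-out-channel weight
# scales sw, int32 biases quantized at sa*sw so they add directly into the accumulator,
# and mul_biases_q folding sa*sw/so to land on the output scale so. All names and values
# below are invented.
import numpy as np

sa, so = 0.02, 0.05                                          # activation / output scales
sw = np.array([0.01, 0.015])                                 # per-out-channel weight scales
a_q = np.array([[10, -3, 7]], dtype=np.int8)                 # 1x3 quantized activations
w_q = np.array([[5, -2, 8], [1, 4, -6]], dtype=np.int8)      # 2x3 weights, out channel first
b_q = np.round(np.array([0.1, -0.05]) / (sa * sw)).astype(np.int32)  # biases at scale sa*sw

acc = a_q.astype(np.int32) @ w_q.T.astype(np.int32) + b_q    # accumulator at scale sa*sw
out_q = np.round(acc * (sa * sw / so)).astype(np.int8)       # requantized to scale so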
def compute_in_out_scale(qrec, in_idx=0, out_idx=0, extra_scale=1):
    """Caches scale_mul_biases_q with the factor that rescales the accumulator
    from the (product of the) input scale(s) to the (product of the) output
    scale(s); in_idx and out_idx may be a single index or an iterable of indices."""
    if isinstance(in_idx, int):
        in_scale = qrec.in_qs[in_idx].scale
    else:
        in_scale = reduce(lambda x, y: x * y,
                          [qrec.in_qs[idx].scale for idx in in_idx])
    if isinstance(out_idx, int):
        out_scale = qrec.out_qs[out_idx].scale
    else:
        out_scale = reduce(lambda x, y: x * y,
                           [qrec.out_qs[idx].scale for idx in out_idx])
    scale_mul_biases_q = qrec.cache.get('scale_mul_biases_q')
    if scale_mul_biases_q is None:
        scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
        qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q
    scale = in_scale * extra_scale / out_scale
    scale_mul_biases_q.scale = scale
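
# Hypothetical worked example (values invented, not from the source): for an elementwise
# Mul node the real result is (qa*Sa)*(qb*Sb) = (qa*qb)*(Sa*Sb), so calling
# compute_in_out_scale(qrec, in_idx=(0, 1)) caches
#     scale_mul_biases_q.scale = Sa*Sb/So
# e.g. with Sa=0.02, Sb=0.01 and So=0.05 the cached factor is 0.02*0.01/0.05 = 0.004,
# the factor that takes the int32 product accumulator back onto the output scale.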
def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
    """NE16 quantizer for MatMul: unsigned asymmetric activations and outputs,
    packed unsigned per-channel weights at opts['weight_bits'], and zero-point
    corrections folded into per-channel bias offsets."""
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']

    # the second input must be constant since it becomes the NE16 channel-scaled weights
    in2_node, in_qs = cls.move_constant(G, fusion if fusion else params, in_qs)
    if not in2_node:
        raise ValueError(
            f"matmul {params.name} is not supported on NE16")

    w1, h1 = params.in_dims[0].shape[0], params.in_dims[0].shape[1]
    h2, w2 = params.in_dims[1].shape[0], params.in_dims[1].shape[1]
    h2_padded = roundup(h2, input_bits == 16)
    kwargs['graph_update']['requires_adjust'] = True
    in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                quantized_dimension=0,
                                dtype=np.uint8,
                                narrow_range=True,
                                bit_pack=opts['weight_bits'],
                                no_compression=True,
                                bits=opts['weight_bits'],
                                resize=((h2, w2), (h2_padded, w2)))

    in_q1 = QType.from_min_max_sq(in_qs[0].min_val, in_qs[0].max_val,
                                  dtype=input_dtype, asymmetric=True)
    in_q1 = limit_input_precision(params, input_bits, in_q1, w1, False,
                                  opts['weight_bits'])

    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)

    if force_out_q:
        o_q = force_out_q
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.asymmetric else "symmetric")
    else:
        force_output_size = opts.get('force_output_size', 8)
        out_dtype = np.uint8 if force_output_size == 8 else np.uint16
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dont_copy_attr=['ne16'],
                                    asymmetric=True,
                                    dtype=out_dtype)

    if len(in_qs) == 3:
        biases_q = QType(dtype=np.int32,
                         scale=in_q1.scale * in_q2.scale,
                         ne16_biases=(input_bits != 16))
        # calculate bias offset - this will be added to the bias in the kernel
        # it is already in quantized form
        bias_offset = np.zeros((in2_node.dqvalue.shape[0], ), dtype=np.int32)
        if in_q1.zero_point != 0:
            # input zero correction is sum(W * Zin) by out_c if weights are channel scaled
            bias_offset -= np.sum(
                np.multiply(in_q1.zero_point,
                            in2_node.value_as(in_q2).astype(np.int32) -
                            in_q2.zero_point,
                            dtype=np.int32),
                dtype=np.int32,
                axis=1)
        if o_q.zero_point != 0:
            # output zero correction is So/(Si * Sw) * ZPo by out_c if weights are channel scaled
            scale = o_q.scale / (in_q1.scale * in_q2.scale)
            bias_offset += np.floor((o_q.zero_point * scale) + 0.5).astype(np.int32)
        if not np.all(bias_offset == 0):
            biases_q.offset = bias_offset
        out_in_qs = [in_q1, in_q2, biases_q]
    else:
        out_in_qs = [in_q1, in_q2]

    mul_biases_q = MultMulBiasScaleQType()
    mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
    o_q.attr.ne16 = True
    if input_bits == 16:
        # with 16-bit inputs apply up to 8 bits of the channel norm as a pre-normalization
        prenorm = min(np.min(np.min(mul_biases_q.qnorms)), 8)
    else:
        prenorm = 0
    mul_biases_q.pre_normalization = prenorm

    return QRec.scaled(in_qs=out_in_qs,
                       out_qs=[o_q],
                       mul_biases_q=mul_biases_q,
                       ne16=True)
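
# A self-contained numpy sketch (an assumption for illustration, not NE16 kernel code) of
# why the per-channel bias offsets above fold the zero points into the bias. With inputs
# a_q at (sa, za), weights at per-channel scales sw and an output at (so, zo), the
# accumulator needs -sum(w*za) per channel plus round(zo*so/(sa*sw)) before requantizing,
# which is exactly what bias_offset accumulates. Weights are shown signed symmetric here
# for brevity, whereas the code above uses unsigned weights with their own zero point.
# All names and values are invented.
import numpy as np

sa, za = 0.02, 128                                        # asymmetric uint8 activations
sw = np.array([0.01, 0.02])                               # per-channel weight scales
so, zo = 0.05, 128                                        # asymmetric uint8 output
a_q = np.array([[130, 120, 140]], dtype=np.uint8)
w_q = np.array([[5, -2, 8], [1, 4, -6]], dtype=np.int8)   # out channel first

offset = -np.sum(w_q.astype(np.int32) * za, axis=1)       # input zero-point correction
offset += np.round(zo * so / (sa * sw)).astype(np.int32)  # output zero-point correction
acc = a_q.astype(np.int32) @ w_q.T.astype(np.int32) + offset
out_q = np.round(acc * (sa * sw / so)).astype(np.uint8)   # lands on (so, zo)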