def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    G = kwargs['G']
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    min_val, max_val = None, None
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.int8,
        narrow_range=opts['narrow_weights'])
    if fusion and fusion.fusion_type in ['conv_active_pool', 'conv_active']:
        stats = kwargs['all_stats'][NodeId(fusion, fusion.contained_nodes()[0])]
        if isinstance(fusion.contained_nodes()[1],
                      (SigmoidActivationParameters, TanHActivationParameters,
                       HSwishActivationParameters)):
            # take stats from the convolution itself
            stats = kwargs['all_stats'][NodeId(fusion, fusion.contained_nodes()[0])]
        elif isinstance(fusion.contained_nodes()[1], HSigmoidActivationParameters):
            # hard sigmoid implements a RELU - make sure that 6 is representable
            min_val, max_val = 0, 6
        else:
            # take stats from the activation after the convolution
            stats = kwargs['all_stats'][NodeId(fusion, fusion.contained_nodes()[1])]
    if min_val is None or max_val is None:
        min_val, max_val = stats['range_out'][0]['min'], stats['range_out'][0]['max']
    if force_out_q:
        o_q = force_out_q
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=out_dtype)
    biases_q = QType(dtype=np.int32, scale=weights_q.scale * in_qs[0].scale)
    mul_biases_q = MultMulBiasScaleQType.from_filter(
        in_qs[0], weights_q, o_q, params)
    # returning the new weights and biases qs will force backprop
    # TODO - ACC_Q LOOKS WRONG AFTER THIS
    return MultScalableFilterQuantizationRecord(
        in_qs=[in_qs[0], weights_q, biases_q],
        out_qs=[o_q],
        acc_q=biases_q,
        calc_q=biases_q,
        mul_biases_q=mul_biases_q)
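
# --- Illustrative sketch (not from this codebase): per-channel symmetric int8
# weight quantization as from_array_sq with a quantized_dimension is assumed
# to behave above, plus the accumulator scale Si * Sw[c] that biases_q/acc_q
# carry and the per-channel rescale M[c] = Si * Sw[c] / So that mul_biases_q
# applies. All names below are local to this sketch.
import numpy as np

def per_channel_int8(w, axis=0, narrow_range=True):
    # one scale per output channel; narrow_range keeps the grid symmetric
    # in [-127, 127] so -128 is never produced
    qmin = -127 if narrow_range else -128
    scales = np.max(np.abs(w), axis=tuple(a for a in range(w.ndim) if a != axis)) / 127.0
    wq = np.clip(np.round(w / scales[:, None]), qmin, 127).astype(np.int8)
    return wq, scales

rng = np.random.default_rng(1)
w = rng.normal(size=(4, 8))             # 4 output channels
x = rng.normal(size=8)
si = np.max(np.abs(x)) / 127.0
xq = np.round(x / si).astype(np.int32)
wq, sw = per_channel_int8(w)
acc = wq.astype(np.int32) @ xq          # int32 accumulator, scale si * sw[c]
so = np.max(np.abs(w @ x)) / 127.0
yq = np.round(acc * (si * sw / so))     # per-channel M rescales to output scale
print(np.round(yq * so, 3), np.round(w @ x, 3))  # close to the float result
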
def _quantize(cls, params, in_qs, stats, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    # only attempt channel scaling if the second input is constant
    in2_node, in_qs = cls.move_constant(G, fusion if fusion else params, in_qs)
    if in2_node:
        kwargs['graph_update']['requires_adjust'] = True
        in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                    quantized_dimension=0,
                                    dtype=np.int8,
                                    narrow_range=True,
                                    bits=8)
    else:
        in_q2 = in_qs[1].make_symmetric_signed()
    in_q1 = in_qs[0].make_symmetric_signed()
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        # can only be forced to symmetric np.int8
        if o_q.dtype != np.int8 or o_q.asymmetric:
            return None
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.asymmetric else "symmetric")
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=out_dtype)
    if len(in_qs) == 3:
        biases_q = QType(dtype=np.int32, scale=in_q1.scale * in_q2.scale)
        out_in_qs = [in_q1, in_q2, biases_q]
    else:
        out_in_qs = [in_q1, in_q2]
    mul_biases_q = MultMulBiasScaleQType()
    mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
    return QRec.scaled(in_qs=out_in_qs,
                       out_qs=[o_q],
                       mul_biases_q=mul_biases_q)
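
# --- Illustrative sketch (not from this codebase): why mul_biases_q.scale is
# in_q1.scale * in_q2.scale / o_q.scale above. For an elementwise multiply the
# raw integer product xq * yq has scale S1 * S2, so multiplying the product by
# (S1 * S2) / So re-expresses it in the output scale. Names are local to this
# sketch.
import numpy as np

rng = np.random.default_rng(2)
x, y = rng.normal(size=8), rng.normal(size=8)
s1 = np.max(np.abs(x)) / 127.0           # symmetric signed scales
s2 = np.max(np.abs(y)) / 127.0
so = np.max(np.abs(x * y)) / 127.0
xq = np.round(x / s1).astype(np.int32)
yq = np.round(y / s2).astype(np.int32)
zq = np.round(xq * yq * (s1 * s2 / so))  # rescale the int32 product
print(np.round(zq * so, 3), np.round(x * y, 3))  # close to the float result
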
def _quantize(cls, params, in_qs, stats, **kwargs):
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    if force_out_q:
        # if forced, use the quantization that we are forced to
        o_q = deepcopy(force_out_q)
    elif params.qtype:
        # if the value is already quantized then keep the same quantization
        o_q = deepcopy(params.qtype)
    else:
        # otherwise derive the quantization from the constant value itself
        o_q = QType.from_array_sq(params.value, dtype=out_dtype)
    o_q.is_constant = True
    return QRec.scaled(out_qs=[o_q])
def calculate_weight_q(in_qs, in_edges, w_idx, in_zero_point, real_dim,
                       padded_dim, qw, narrow):
    # calculates the weight qtype and the zero offset bias correction
    wnode = in_edges[w_idx].from_node
    extra_attrs = {'bit_pack': qw} if qw < 8 else {}
    in_qs[w_idx] = QType.from_array_sq(
        wnode.dqvalue,
        dtype=np.uint8,
        bits=qw,
        narrow_range=narrow,
        quantized_dimension=0,
        resize=(real_dim, padded_dim),
        ne16_decode={
            'type': 'RNN',
            'Ko': real_dim[0],
            'KiReal': real_dim[1],
            'Ki': padded_dim[1],
            'Qw': qw
        },
        no_compression=True,
        **extra_attrs)
    w_q = in_qs[w_idx]
    # since the weight zero offset is added by NE16 work on the signed value
    weight_val = wnode.value_as(w_q).astype(np.int32) - w_q.zero_point
    # return the zero offset correction summed over the input dimension
    return np.sum(-in_zero_point.astype(np.int32) * weight_val,
                  axis=1, dtype=np.int32)
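
# --- Illustrative sketch (not from this codebase): the zero offset correction
# returned above. With an asymmetric input (x = Si * (xq - Zin)) and signed
# weight values w = wq - Zw, each dot product expands as
#   sum((xq - Zin) * w) = sum(xq * w) + sum(-Zin * w)
# so the second term is a constant per output row and can be folded into the
# bias. Names are local to this sketch.
import numpy as np

rng = np.random.default_rng(3)
wq = rng.integers(0, 256, size=(4, 8))        # uint8 weight codes
zw = 128                                      # assumed weight zero point
zin = np.full(8, 7, dtype=np.int32)           # assumed input zero point
w_signed = wq.astype(np.int32) - zw
offset = np.sum(-zin * w_signed, axis=1, dtype=np.int32)

xq = rng.integers(0, 256, size=8).astype(np.int32)
direct = w_signed @ (xq - zin)                # subtract the zero point up front
folded = (w_signed @ xq) + offset             # fold the correction into the bias
print(np.array_equal(direct, folded))         # True
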
def quantize_ne16(cls, params, in_qs, stats, **kwargs):
    opts = kwargs['opts']
    force_out_qs, _ = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    fusion = kwargs.get('fusion', None)
    G = kwargs['G']
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    min_val, max_val = None, None
    # note that although stored as uint8 the weights are effectively signed
    # since NE16 applies the weight zero point itself; the zero point needs
    # to be removed during code gen
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.uint8,
        ne16_order=True,
        narrow_range=True,
        bits=opts['weight_bits'])
    in_q = in_qs[0]
    # check the input quantization and requantize to asymmetric uint8 if necessary
    if in_q.dtype != np.uint8:
        # NOTE: a forced input dtype is ignored here, which is not very clean
        # if in_q.forced_dtype:
        #     return None
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=np.uint8,
                                     asymmetric=True)
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        # can only be forced to np.uint8
        if o_q.dtype != np.uint8:
            return None
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s',
            params.name, o_q.min, o_q.max, min_val, max_val)
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=np.uint8,
                                    asymmetric=True)
    biases_q = QType(dtype=np.int32,
                     scale=weights_q.scale * in_q.scale,
                     ne16_biases=True)
    mul_biases_q = MultMulBiasScaleQType.from_filter(
        in_q, weights_q, o_q, params)
    # calculate the bias offset - this is added to the bias in the kernel
    # and is already in quantized form
    biases_q.offset = FilterMult.calculate_bias_offset(
        params, in_q, weights_node, weights_q, o_q)
    cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)
    # returning the new weights and biases qs will force backprop
    # o_q.set_forced(flags=['dtype'])
    # in_q.set_forced(flags=['dtype'])
    return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                       out_qs=[o_q],
                       acc_q=biases_q,
                       calc_q=biases_q,
                       mul_biases_q=mul_biases_q,
                       ne16=True)
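
# --- Illustrative sketch (not from this codebase): the asymmetric uint8 qtype
# that from_min_max_sq(..., dtype=np.uint8, asymmetric=True) is assumed to
# produce above: the scale covers the full observed range and the zero point
# is the code that represents real 0. Names are local to this sketch.
import numpy as np

def asymmetric_uq8(min_val, max_val):
    # keep real 0 inside the range so it is exactly representable
    min_val, max_val = min(min_val, 0.0), max(max_val, 0.0)
    scale = (max_val - min_val) / 255.0
    zero_point = int(np.round(-min_val / scale))
    return scale, zero_point

def quantize_uq8(v, scale, zero_point):
    return np.clip(np.round(v / scale) + zero_point, 0, 255).astype(np.uint8)

scale, zp = asymmetric_uq8(-0.5, 2.0)
print(zp, quantize_uq8(np.array([-0.5, 0.0, 2.0]), scale, zp))  # 0.0 maps to zp
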
def _quantize(cls, params, in_qs, stats, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    if cls.can_ne16(params, opts, fusion):
        LOG.info('selecting USQ8 NE16 kernel filter quantizer')
        return cls.quantize_ne16(params, in_qs, stats, **kwargs)
    LOG.info('selecting SQ8 software kernel filter quantizer')
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    in_q = in_qs[0]
    # check the input quantization and convert to int8;
    # if the convolution is not padded the input can stay asymmetric
    if in_q.dtype == np.uint8:
        # the input was quantized for NE16
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        if isinstance(params, Conv2DParameters) and params.padding.has_padding:
            # a padded convolution must have a symmetric input
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8,
                                         forced=True)
        else:
            # not padded so the uint8 zero point can be shifted into int8
            in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                         stats['range_in'][0]['max'],
                                         dtype=np.int8,
                                         zero_point=in_q.zero_point - 128)
    elif (isinstance(params, Conv2DParameters) and not in_q.is_symmetric and
          params.padding.has_padding):
        # a padded convolution cannot take an asymmetric input so requantize symmetric
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=np.int8)
    elif (opts['allow_asymmetric'] and isinstance(params, Conv2DParameters) and
          not in_q.forced and in_q.is_symmetric and
          not params.padding.has_padding):
        # if not forced we can try an asymmetric input
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=np.int8,
                                     asymmetric=True)
    if opts['weight_bits'] != 8:
        LOG.warning(
            'sub byte weight quantization was requested but the NE16 kernel was not selected')
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.int8,
        narrow_range=opts['narrow_weights'],
        bits=opts['weight_bits'])
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        # can only be forced to np.int8
        if o_q.dtype != np.int8:
            return None
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.is_asymmetric else "symmetric")
    else:
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=out_dtype,
                                    asymmetric=opts['allow_asymmetric'])
    biases_q = QType(dtype=np.int32, scale=weights_q.scale * in_q.scale)
    mul_biases_q = MultMulBiasScaleQType.from_filter(
        in_q, weights_q, o_q, params)
    # calculate the bias offset - this is added to the bias in the kernel
    # and is already in quantized form
    biases_q.offset = FilterMult.calculate_bias_offset(
        params, in_q, weights_node, weights_q, o_q)
    if not (opts['allow_asymmetric'] or force_out_q or biases_q.offset is None):
        raise ValueError(
            f'bias offset is set but asymmetric is disallowed in {params.name}')
    # returning the new weights and biases qs will force backprop
    # o_q.set_forced(flags=['dtype'])
    # in_q.set_forced(flags=['dtype'])
    if isinstance(params, Conv2DParameters) and params.padding.has_padding:
        in_q.set_forced(flags=['zero_point'])
    cls.check_order(params, AT_SW_KER_IN_ORDER, AT_SW_KER_OUT_ORDER)
    return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                       out_qs=[o_q],
                       acc_q=biases_q,
                       calc_q=biases_q,
                       mul_biases_q=mul_biases_q)
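
# --- Illustrative sketch (not from this codebase): the
# zero_point=in_q.zero_point - 128 conversion above. Subtracting 128 from both
# the codes and the zero point maps an asymmetric uint8 qtype onto int8
# without changing any represented real value, since
# x = S * (q - Z) = S * ((q - 128) - (Z - 128)). Names are local to this sketch.
import numpy as np

scale, zp_u8 = 0.02, 140                         # assumed uint8 input qtype
q_u8 = np.array([0, 100, 140, 255], dtype=np.int32)
x = scale * (q_u8 - zp_u8)                       # real values under uint8
q_i8, zp_i8 = q_u8 - 128, zp_u8 - 128            # shifted int8 codes and zero point
print(np.allclose(x, scale * (q_i8 - zp_i8)))    # True - same real values
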
def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    # channel scaling is only possible if the second input is constant
    in2_node, in_qs = cls.move_constant(G, fusion if fusion else params, in_qs)
    if not in2_node:
        raise ValueError(
            f'matmul {params.name} with a non-constant second input is not supported on NE16')
    w1, h1 = params.in_dims[0].shape[0], params.in_dims[0].shape[1]
    h2, w2 = params.in_dims[1].shape[0], params.in_dims[1].shape[1]
    h2_padded = roundup(h2, input_bits == 16)
    kwargs['graph_update']['requires_adjust'] = True
    in_q2 = QType.from_array_sq(arr=in2_node.dqvalue,
                                quantized_dimension=0,
                                dtype=np.uint8,
                                narrow_range=True,
                                bit_pack=opts['weight_bits'],
                                no_compression=True,
                                bits=opts['weight_bits'],
                                resize=((h2, w2), (h2_padded, w2)))
    in_q1 = QType.from_min_max_sq(in_qs[0].min_val,
                                  in_qs[0].max_val,
                                  dtype=input_dtype,
                                  asymmetric=True)
    in_q1 = limit_input_precision(params, input_bits, in_q1, w1, False,
                                  opts['weight_bits'])
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = force_out_q
        LOG.warning(
            'node %s output forced to range %s/%s - actual range %s/%s %s',
            params.name, o_q.min, o_q.max, min_val, max_val,
            "asymmetric" if o_q.asymmetric else "symmetric")
    else:
        force_output_size = opts.get('force_output_size', 8)
        out_dtype = np.uint8 if force_output_size == 8 else np.uint16
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dont_copy_attr=['ne16'],
                                    asymmetric=True,
                                    dtype=out_dtype)
    if len(in_qs) == 3:
        biases_q = QType(dtype=np.int32,
                         scale=in_q1.scale * in_q2.scale,
                         ne16_biases=(input_bits != 16))
        # calculate the bias offset - this is added to the bias in the kernel
        # and is already in quantized form
        bias_offset = np.zeros((in2_node.dqvalue.shape[0], ), dtype=np.int32)
        if in_q1.zero_point != 0:
            # the input zero point correction is sum(W * Zin) per output
            # channel if the weights are channel scaled
            bias_offset -= np.sum(
                np.multiply(in_q1.zero_point,
                            in2_node.value_as(in_q2).astype(np.int32) -
                            in_q2.zero_point,
                            dtype=np.int32),
                dtype=np.int32,
                axis=1)
        if o_q.zero_point != 0:
            # the output zero point correction is So/(Si * Sw) * ZPo per output
            # channel if the weights are channel scaled
            scale = o_q.scale / (in_q1.scale * in_q2.scale)
            bias_offset += np.floor((o_q.zero_point * scale) + 0.5).astype(np.int32)
        if not np.all(bias_offset == 0):
            biases_q.offset = bias_offset
        out_in_qs = [in_q1, in_q2, biases_q]
    else:
        out_in_qs = [in_q1, in_q2]
    mul_biases_q = MultMulBiasScaleQType()
    mul_biases_q.scale = in_q1.scale * in_q2.scale / o_q.scale
    o_q.attr.ne16 = True
    # a 16 bit input requires a pre-normalization of the accumulator
    prenorm = min(np.min(mul_biases_q.qnorms), 8) if input_bits == 16 else 0
    mul_biases_q.pre_normalization = prenorm
    return QRec.scaled(in_qs=out_in_qs,
                       out_qs=[o_q],
                       mul_biases_q=mul_biases_q,
                       ne16=True)
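
# --- Illustrative sketch (not from this codebase): the output zero point
# correction added to bias_offset above. The kernel computes
# yq = M * acc + ZPo with M = (Si * Sw) / So; adding ZPo after the rescale is
# equivalent to adding ZPo / M = ZPo * So / (Si * Sw) to the int32 accumulator
# before it, which is why the correction is rounded at the accumulator scale.
# Names are local to this sketch.
import numpy as np

si, sw, so, zpo = 0.01, 0.005, 0.04, 17          # assumed scales / zero point
m = si * sw / so
acc = np.array([1234, -5678], dtype=np.int32)    # int32 accumulators
after = np.round(acc * m) + zpo                  # add ZPo after rescaling
pre = np.round((acc + np.round(zpo * so / (si * sw))) * m)  # fold into the bias
print(after, pre)                                # equal up to rounding
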
def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
    # copy in_qs because we may modify it
    in_qs = in_qs.copy()
    input_bits = 16 if input_dtype in (np.uint16, np.int16) else 8
    opts = kwargs['opts']
    fusion = kwargs.get('fusion', None)
    LOG.info('selecting USQ8 NE16 kernel filter quantizer')
    force_out_qs, _ = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    G = kwargs['G']
    weights_node = cls.get_weights_node(G, fusion if fusion else params)
    min_val, max_val = None, None
    weights_q = QType.from_array_sq(
        arr=weights_node.dqvalue,
        quantized_dimension=cls.get_quantized_dimension(params, opts),
        dtype=np.uint8,
        narrow_range=True,
        bit_pack=opts['weight_bits'],
        no_compression=True,
        bits=opts['weight_bits'])
    in_q = in_qs[0]
    in_q = limit_input_precision(params, input_bits, in_q, params.filter.sz,
                                 opts['narrow_weights'], opts['weight_bits'])
    # the input dtype is either uint8 or int8
    if in_q.dtype != input_dtype:
        if in_q.forced_dtype:
            return None
        cls.check_valid_ranges(params, stats, idx=0, dirs='in')
        in_q = QType.from_min_max_sq(stats['range_in'][0]['min'],
                                     stats['range_in'][0]['max'],
                                     dtype=input_dtype,
                                     asymmetric=False)
    min_val, max_val = cls.get_min_max(fusion, stats, kwargs['all_stats'], params)
    if force_out_q:
        o_q = deepcopy(force_out_q)
        o_q.dont_copy_attr = ['ne16']
        LOG.warning('node %s output forced to range %s/%s - actual range %s/%s',
                    params.name, o_q.min, o_q.max, min_val, max_val)
    else:
        force_output_size = opts.get('force_output_size', 8)
        output_dtype = np.uint8 if force_output_size == 8 else np.uint16
        o_q = QType.from_min_max_sq(min_val=min_val,
                                    max_val=max_val,
                                    dtype=output_dtype,
                                    dont_copy_attr=['ne16'],
                                    asymmetric=True)
    o_q.attr.ne16 = True
    biases_q = QType(dtype=np.int32,
                     scale=weights_q.scale * in_q.scale,
                     ne16_biases=(input_bits != 16))
    mul_biases_q = MultMulBiasScaleQType.from_filter(
        in_q, weights_q, o_q, params)
    # calculate the bias offset - this is added to the bias in the kernel
    # and is already in quantized form
    biases_q.offset = FilterMultNE16Base.calculate_bias_offset(
        params, in_q, weights_node, weights_q, o_q)
    # returning the new weights and biases qs will force backprop
    cls.check_order(params, AT_NE16_KER_IN_ORDER, AT_NE16_KER_OUT_ORDER)
    # a 16 bit input requires a pre-normalization of the accumulator
    prenorm = min(np.min(mul_biases_q.qnorms), 8) if input_bits == 16 else 0
    mul_biases_q.pre_normalization = prenorm
    # o_q.set_forced(flags=['dtype'])
    # in_q.set_forced(flags=['dtype'])
    return QRec.scaled(in_qs=[in_q, weights_q, biases_q],
                       out_qs=[o_q],
                       acc_q=biases_q,
                       calc_q=biases_q,
                       mul_biases_q=mul_biases_q,
                       ne16=True)
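
# --- Illustrative sketch (not from this codebase): one plausible reading of
# the pre_normalization set above for 16 bit inputs. A right shift of the
# accumulator by `prenorm` before the fixed point multiplier keeps the
# intermediate product in range; reducing the multiplier's norm by the same
# amount leaves the overall factor unchanged:
# ((acc >> p) * m) >> (n - p) == (acc * m) >> n, up to rounding.
# Names are local to this sketch.
import numpy as np

acc = np.int64(1 << 30)          # a large accumulator from 16 bit inputs
m, qnorm = 22000, 20             # assumed fixed point multiplier: M = m / 2**qnorm
prenorm = 8
full = (acc * m) >> qnorm        # needs a wide intermediate product
pre = ((acc >> prenorm) * m) >> (qnorm - prenorm)  # smaller intermediate
print(full, pre)                 # equal up to rounding
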