def _quantize(cls, params, in_qs, stats, **kwargs):
    """Build the POW2 symmetric quantization record for an activation node.

    The integer bits required by the output are derived from the kind of
    activation. Unless an output qtype is forced through ``kwargs`` the
    remaining bits of the POW2 container are used as fractional bits.

    Args:
        params: activation node (``activation``, ``activation_params`` and
            ``name`` are read).
        in_qs: input qtypes; only the first one is used.
        stats: node statistics; ``stats['range_out'][0]`` is read for the
            range-driven activations.
        **kwargs: forwarded to ``cls.get_pow2_opts`` / ``cls.get_pow2_bits``.

    Returns:
        A ``SymmetricQuantizationRecord`` with one input and one output qtype.

    Raises:
        ValueError: the activation is not handled by this quantizer.
    """
    forced_qs, out_dtype = cls.get_pow2_opts(**kwargs)
    forced_q = forced_qs and forced_qs[0]

    activation = params.activation
    if activation == "relu6":
        int_bits = calc_bits(6)
    elif activation == "relun":
        upper = params.activation_params
        # a list of bounds is reduced to its largest bound
        if isinstance(upper, list):
            upper = max(upper)
        int_bits = calc_bits(upper)
    elif activation in ("relu", "hswish", "hsigmoid", "leaky"):
        out_range = stats['range_out'][0]
        int_bits = bits(out_range['max'], out_range['min'])
    else:
        raise ValueError(
            f'no support for activation {params.activation} in POW2 quantizer'
        )

    in_q = in_qs[0]
    if forced_q is not None:
        # a forced output is accepted as is, we only warn about clipping
        if forced_q.bits - forced_q.q < int_bits:
            LOG.warning(
                'quantization is forcing node %s to have an output that may clip',
                params.name)
        out_q = forced_q
    else:
        frac_bits = max(cls.get_pow2_bits(**kwargs) - int_bits, 0)
        out_q = QType(q=frac_bits, dtype=out_dtype)
    return SymmetricQuantizationRecord(in_qs=[in_q], out_qs=[out_q])
def generate_tanh(var, scaling):
    """Wrap ``var`` in an ``HTanh`` expression state.

    Without ``scaling`` the expression is wrapped unchanged. With ``scaling``
    the input is first rescaled to a power-of-two scale of at most ``2**-7``
    (``floor(log2(var.scale))`` capped at ``-7``) with ``q = 0``.

    Args:
        var: expression state (``expr``, ``scale``, ``ibits``, ``q`` and
            ``length`` are read).
        scaling: truthy to emit the rescaled variant.

    Returns:
        An ``ExprState`` holding the ``HTanh`` expression.
    """
    if not scaling:
        return ExprState(HTanh(var.expr, None, None), var.ibits)

    # power-of-two exponent not above the current scale, never coarser
    # than 2**-7
    scale_exp = min(math.floor(math.log2(var.scale)), -7)
    new_scale = pow(2, scale_exp)

    # largest value representable on the input, expressed in the new scale
    cur_max_val = math.ceil(pow(2, var.ibits) * var.scale)
    new_scaled_max_val = math.ceil(cur_max_val / new_scale)
    assert calc_bits(
        new_scaled_max_val) + var.q <= 31, "risk of overflow in htanh"

    new_q = 0
    rescaled = ATScale.from_scales(var.expr,
                                   var.scale,
                                   new_scale,
                                   28 - var.length,
                                   to_q=new_q,
                                   from_q=var.q)
    return ExprState(HTanh(rescaled, new_q, new_scale),
                     abs(scale_exp) + 1,
                     q=new_q,
                     scale=new_scale)
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Build the POW2 symmetric quantization record for an activation node.

    Returns ``None`` whenever this quantizer cannot honour the request:
    an unfused node with an int32 input, an unsupported activation, or a
    forced output qtype that is incompatible with the activation.

    Args:
        params: activation node (``activation`` and ``activation_params``
            are read).
        in_qs: input qtypes; only the first one is used.
        stats: node statistics; ``stats['range_out'][0]`` is read for the
            range-driven activations.
        **kwargs: quantizer options; ``fusion`` is read here, the rest is
            forwarded to ``cls.get_pow2_opts`` / ``cls.get_pow2_bits``.

    Returns:
        The record produced by ``QRec.symmetric`` or ``None``.
    """
    forced_qs, out_dtype = cls.get_pow2_opts(**kwargs)
    forced_q = forced_qs and forced_qs[0]

    if not kwargs.get('fusion', None) and in_qs[0].dtype == np.int32:
        return None

    activation = params.activation

    if activation in ("sigmoid", "tanh"):
        # these produce a fixed Q7 (int8) or Q15 output
        if forced_q is None:
            out_q = QType(q=7 if out_dtype == np.int8 else 15,
                          dtype=out_dtype)
        else:
            expected_q = 7 if forced_q.dtype == np.int8 else 15
            if forced_q.q != expected_q:
                return None
            out_q = forced_q
        return QRec.symmetric(in_qs=[in_qs[0]], out_qs=[out_q])

    if activation == "relu6":
        int_bits = calc_bits(6)
    elif activation == "relun":
        upper = params.activation_params
        if isinstance(upper, list):
            upper = max(upper)
        int_bits = calc_bits(upper)
    elif activation in ("relu", "hswish", "hsigmoid", "leaky", "htanh"):
        cls.check_valid_ranges(params, stats, idx=0, dirs='out')
        out_range = stats['range_out'][0]
        int_bits = calc_bits(out_range['max'], out_range['min'])
    else:
        LOG.error(
            f'no support for activation {params.activation} in POW2 quantizer'
        )
        return None

    in_q = in_qs[0]
    if forced_q is None:
        frac_bits = max(cls.get_pow2_bits(**kwargs) - int_bits, 0)
        return QRec.symmetric(in_qs=[in_q],
                              out_qs=[QType(q=frac_bits, dtype=out_dtype)])
    # a forced output must leave enough integer bits for the activation
    if forced_q.bits - forced_q.q < int_bits:
        return None
    return QRec.symmetric(in_qs=[in_q], out_qs=[forced_q])
def compute_activation_out_maxq(node, num_bits):
    """Return the fractional bits left after a bounded relu, if any.

    Args:
        node: object exposing ``activation`` (and ``activation_params`` when
            the activation is ``"relun"``).
        num_bits: total number of bits available.

    Returns:
        ``num_bits`` minus the bits needed for the relu bound, or ``None``
        when the activation is not ``"relu6"``/``"relun"`` or no bound is set.
    """
    activation = node.activation
    if activation == "relu6":
        upper = 6
    elif activation == "relun":
        upper = node.activation_params
        # a list of bounds is reduced to its largest bound
        if isinstance(upper, list):
            upper = max(upper)
        if upper is None:
            return None
    else:
        return None
    return num_bits - calc_bits(upper)
def astats(size, do_bits=True):
    """Return a fixed statistics dictionary for a tensor of ``size`` elements.

    All values except ``size`` are constants (range -0.9..0.9, std 0.25).

    Args:
        size: value stored under the ``'size'`` key.
        do_bits: when true an ``'ibits'`` entry computed with ``calc_bits``
            for the fixed range is added.

    Returns:
        dict: the statistics dictionary.
    """
    stats = dict(
        mean=0,
        std=0.25,
        min=-0.9,
        max=0.9,
        size=size,
        wols=0,
        sols=0,
        min_out=0,
        max_out=0,
    )
    if not do_bits:
        return stats
    stats['ibits'] = calc_bits(0.9, -0.9)
    return stats
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Build the scaled quantization record for an LSTM node.

    Input qtypes are deep-copied before being modified. The cell state scale
    is recomputed from either ``params.cell_clip`` or the collected cell
    statistics, weight qtypes are set to int8 and the per-gate rescaling
    qtypes are derived from the input, state and weight scales.

    Side effects on ``params``/``G``: when input/output scales and every
    input/recurrent weight scale pair are close, ``rnn_same_inout_scale`` is
    switched on and the node options stored in ``G`` are refreshed.

    Args:
        params: LSTM node parameters.
        in_qs: input qtypes indexed like ``LSTMParameters.INPUT_NAMES``.
        stats: node statistics; ``range_out`` and ``range_cell`` are read.
        **kwargs: must contain ``G`` and ``opts`` (``opts['weight_bits']`` is
            read); the rest is forwarded to ``cls.get_mult_opts``.

    Returns:
        The record produced by ``QRec.scaled`` or ``None`` when the inputs
        cannot be forced symmetric.

    Raises:
        ValueError: ``stats`` has no ``range_cell`` entry.
    """
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0)
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    G = kwargs['G']
    opts = kwargs['opts']
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')
    o_q = QType.from_min_max_sq(min_val=stats['range_out'][0]['min'],
                                max_val=stats['range_out'][0]['max'],
                                dtype=out_dtype)
    if force_out_qs and force_out_qs[0]:
        LOG.warning(
            'on node %s output is being forced from scale %s -> %s',
            params.name, o_q.scale, force_out_qs[0].scale)
        o_q = force_out_qs[0]

    names = {
        val: idx
        for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
    }

    cell_range = stats.get('range_cell')
    if cell_range is None:
        # the exception used to be built but never raised, which turned a
        # missing statistic into an obscure TypeError on the next line
        raise ValueError(
            f'cell range not present in stats for {params.name}')
    cell_stat = max(abs(cell_range[var]) for var in ['min', 'max'])
    if params.cell_clip and not params.quant_c_state_with_stat:
        cell_max = params.cell_clip
        ratio_c = cell_max / cell_stat
        if not (ratio_c > 0.9 and ratio_c < 1.1):
            LOG.warning(
                f"C state is forced to a range [-{cell_max}:{cell_max}] different to the one calulated "
                f"from the inference statistic [-{cell_stat}:{cell_stat}], consider using nodeoption {params.name} "
                "QUANT_C_STATE_WITH_STAT 1 to force it to be the one calculated"
            )
    else:
        cell_max = cell_stat

    cell_int_bits = calc_bits(cell_max)
    in_qs[names['c_state']].recalculate_scale(-cell_max, cell_max)
    LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
              in_qs[names['c_state']].range)

    int2_scale = int3_scale = out_tanh_sig_scale = None
    if params.hard_act:
        # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
        # but also (internal_q * 2) + cell_bits = 32
        int_q = min((16 - cell_int_bits), 10)
        int2_scale = math.pow(2, -(int_q * 2))
        int3_scale = math.pow(2, -(int_q * 3))
    else:
        int_q = 12
        # output of LUT activations are always Q15
        out_tanh_sig_scale = math.pow(2, -15)
    int_scale = math.pow(2, -int_q)

    gates = ['i', 'o', 'c', 'f']
    scale_pairs = {
        chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
        for chan in gates
    }
    for scale_pair in scale_pairs.values():
        for weight_name in scale_pair:
            in_qs[names[weight_name]] = deepcopy(in_qs[names[weight_name]])
            in_qs[names[weight_name]].dtype = np.int8
            in_qs[names[weight_name]].bits = opts['weight_bits']

    w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                for namei, namer in scale_pairs.values()]
    if (abs(1 - in_qs[0].scale / o_q.scale) < 0.1) and \
            all((abs(1 - w_scale[0] / w_scale[1]) < 0.2) for w_scale in w_scales):
        LOG.info(
            "node %s has similar input and i_state scales --> "
            "will be generated the same_scale kernel with better performances",
            params.name)
        params.rnn_same_inout_scale = True
        G.node_options[NodeId(params)] = params.at_options

    if params.rnn_same_inout_scale:
        if not (abs(1 - in_qs[0].scale / o_q.scale) < 0.1) and \
                not all((abs(1 - w_scale[0] / w_scale[1]) < 0.1) for w_scale in w_scales):
            LOG.warning(
                "node %s has different input and i_state scales consider using the "
                "LSTM kernel with rnn_same_inout_scale=False (better accuracy)",
                params.name)
        # in and out and state are all in the same scale
        in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
        # i_state scale may be 1 since the value is 0
        # np.maximum(in_and_out_scale, in_qs[names['i_state']].scale)
        i_state_scale = in_scale = in_and_out_scale
        in_qs[0].scale = in_scale
        o_q.scale = in_scale
        scales = {
            k: np.maximum(in_qs[names[namei]].scale,
                          in_qs[names[namer]].scale)
            for k, (namei, namer) in scale_pairs.items()
        }
        for k, (namei, namer) in scale_pairs.items():
            in_qs[names[namei]].scale = scales[k]
            in_qs[names[namer]].scale = scales[k]
    else:
        in_scale = in_qs[0].scale
        i_state_scale = o_q.scale
        o_q.scale = i_state_scale

    if not params.rnn_states_as_inputs:
        in_qs[names['i_state']].scale = i_state_scale

    # compute scales for perceptrons
    r_pscales = {
        k: in_qs[names["r_2_%s_w" % k]].scale * i_state_scale
        for k in gates
    }
    scale_qtypes = {
        "r_2_%s_q" % k: MultMulBiasScaleQType(scale=r_pscale / int_scale)
        for k, r_pscale in r_pscales.items()
    }
    i_pscales = {
        k: in_qs[names["i_2_%s_w" % k]].scale * in_scale
        for k in gates
    }
    # if input and i_state have different scales -> scale the inputs before sum
    # otherwise do nothing and these scales will be ignored
    scale_qtypes.update({
        "i_2_%s_q" % k: MultMulBiasScaleQType(scale=i_pscale / r_pscale)
        for (k, i_pscale), r_pscale in zip(i_pscales.items(),
                                           r_pscales.values())
    })

    if params.hard_act:
        cell_in_scale = in_qs[names['c_state']].scale / int_scale
        cell_out_scale = int2_scale / in_qs[names['c_state']].scale
        state_out_scale = int3_scale / i_state_scale
    else:
        cell_in_scale = in_qs[
            names['c_state']].scale * out_tanh_sig_scale / int_scale
        cell_out_scale = int_scale / in_qs[names['c_state']].scale
        state_out_scale = out_tanh_sig_scale / i_state_scale

    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # TODO - Check cell clip here
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
        scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
        scale=state_out_scale)
    # set internal scale
    scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
    # set biases to output of perceptron
    for gate in gates:
        in_qs[names[f"{gate}_b"]].scale = r_pscales[gate]
        in_qs[names[f"{gate}_b"]].dtype = np.int32

    if params.lstm_output_c_state:
        out_qs = [o_q, in_qs[names['c_state']]]
    else:
        out_qs = [o_q]
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=out_qs,
        **scale_qtypes,
    )
def from_min_max(cls,
                 min_val,
                 max_val,
                 dtype=None,
                 bits=None,
                 scaled=False,
                 asymmetric=False,
                 narrow_range=False,
                 quantized_dimension=None,
                 scale_zero_as_one=False,
                 forced=False,
                 zero_point=None,
                 **kwargs):
    """Create a qtype able to represent the range ``min_val..max_val``.

    Ranges whose bounds are (almost) equal are widened first, then the range
    is extended so that zero is always representable. In ``scaled`` mode the
    scale and zero point come from ``cls.calculate_scale``; otherwise a
    fixed-point ``q`` is derived from the integer bits of the range.

    Args:
        min_val, max_val: range bounds, passed through ``cls.init_array``.
        dtype: container type; defaults to int8 when ``scaled`` else int16.
        bits: bits used in the container; defaults to the container width.
        scaled: select scaled rather than fixed-point quantization.
        asymmetric, narrow_range, quantized_dimension, scale_zero_as_one,
        forced, zero_point: forwarded to the scale calculation and/or the
            constructor.
        **kwargs: extra constructor arguments.

    Returns:
        A new instance of ``cls``.

    Raises:
        ValueError: ``bits`` exceeds the container, an option is not valid in
            unscaled mode, or the range does not fit in ``bits``.
    """
    min_val = cls.init_array(min_val)
    max_val = cls.init_array(max_val)

    # degenerate ranges (min == max) are widened
    degenerate = np.isclose(min_val, max_val)
    negative_point = np.logical_and(degenerate, min_val < 0)
    max_val = np.where(negative_point, -(min_val), max_val)
    positive_point = np.logical_and(degenerate, min_val > 0)
    min_val = np.where(positive_point, -(max_val), min_val)
    zero_point_range = np.logical_and(degenerate, min_val == 0)
    max_val = np.where(zero_point_range, 1, max_val)
    min_val = np.where(zero_point_range, 1, min_val)

    # zero must be representable
    min_val = np.where(min_val > 0, 0, min_val)
    max_val = np.where(max_val < 0, 0, max_val)

    # work out container
    if dtype is None:
        dtype = np.int8 if scaled else np.int16
    dtype_bits, signed = DTYPES[dtype]
    if bits is None:
        bits = dtype_bits
    elif bits > dtype_bits:
        raise ValueError(f'bits {bits} do not fit in dtype {dtype}')

    if not scaled:
        if asymmetric:
            raise ValueError(
                'asymmetric is not supported un unscaled mode')
        if quantized_dimension is not None:
            raise ValueError(
                'quantized dimension is not supported un unscaled mode')
        int_bits = calc_bits(max_val, min_val, signed=signed)
        if int_bits > bits:
            raise ValueError(
                f"{max_val}, {min_val} number cannot be represented with this many bits"
            )
        return cls(bits=bits,
                   q=bits - int_bits,
                   signed=signed,
                   dtype=dtype,
                   min_val=min_val,
                   max_val=max_val,
                   narrow_range=narrow_range,
                   forced=forced,
                   asymmetric=asymmetric,
                   **kwargs)

    qmin, qmax = cls.calculate_quantized_range(bits,
                                               narrow_range=narrow_range,
                                               signed=signed)
    scale, zero_point = cls.calculate_scale(
        min_val,
        max_val,
        qmin,
        qmax,
        dtype,
        asymmetric=asymmetric,
        scale_zero_as_one=scale_zero_as_one,
        narrow_range=narrow_range,
        zero_point=zero_point)
    # a single scale means there is no per-dimension quantization
    if len(scale) == 1:
        quantized_dimension = None
    return cls(bits=bits,
               signed=signed,
               dtype=dtype,
               scale=scale,
               zero_point=zero_point,
               quantized_dimension=quantized_dimension,
               min_val=min_val,
               max_val=max_val,
               narrow_range=narrow_range,
               forced=forced,
               asymmetric=asymmetric,
               **kwargs)
def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None):
    """Build the multiplicative quantization record for ``node``.

    The output qtype is chosen first (copied from the first input, fixed for
    softmax, or derived from ``astats['range_out'][0]``) and then a record
    type is selected from the class of ``node``.

    For RNN and LSTM nodes this method also modifies ``in_qs`` scales in
    place and rescales constant input nodes through ``self.rescale_constant``.

    Args:
        G: graph; its in-edges are read for RNN/LSTM nodes.
        node: node parameters to quantize.
        astats: activation statistics for the node.
        in_qs: input qtypes of the node.
        dtype: type used for weights and, by default, for the output.
        out_dtype: output type; defaults to ``dtype``.

    Returns:
        The quantization record for the node.
    """
    if out_dtype is None:
        out_dtype = dtype
    # --- select the output qtype ---
    if isinstance(node, (PoolingParameters, OutputParameters, SplitParameters)):
        # these nodes keep the quantization of their input
        o_q = in_qs[0]
    elif isinstance(node, SoftMaxParameters):
        o_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
    else:
        o_q = SymmetricMultQType.from_min_max(min_val=astats['range_out'][0]['min'],
                                              max_val=astats['range_out'][0]['max'],
                                              dtype=out_dtype)

    # --- select the record type ---
    if isinstance(node, (MatrixAddParameters, MatrixSubParameters)):
        qrec = MultAddQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    elif isinstance(node, ExpressionFusionParameters):
        # one output qtype per output range
        o_qs = [SymmetricMultQType.from_min_max(min_val=orange['min'],
                                                max_val=orange['max'],
                                                dtype=out_dtype)
                for orange in astats['range_out']]
        fusion_inputs = sorted([n for n in node.subgraph.inputs()
                                if isinstance(n, FusionInputParameters)],
                               key=lambda x: x.idx)
        fusion_outputs = sorted([n for n in node.subgraph.outputs()
                                 if isinstance(n, FusionOutputParameters)],
                                key=lambda x: x.idx)

        # map every fusion input/output node to the scale of its qtype
        node_scale_map = {fnode: in_qs[idx].scale
                          for idx, fnode in enumerate(fusion_inputs)}
        for idx, fnode in enumerate(fusion_outputs):
            node_scale_map[fnode] = o_qs[idx].scale
        inp, outp, expr = node.decompose(node_scale_map=node_scale_map)

        qrec = MultExpressionQuantizationRecord(in_qs=in_qs,
                                                out_qs=o_qs,
                                                inputs=inp,
                                                output_exprs=outp,
                                                intermediate_exprs=expr)
    elif isinstance(node, (MatrixBroadcastedLinearOpParameters, MatScaleFusionParameters, GlobalPoolParameters)):
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    elif isinstance(node, SplitParameters):
        # the same output qtype is repeated for every split
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q]*node.num_splits)
    elif isinstance(node, ConstantInputParameters):
        if node.value_quantization:
            qrec = MultConstantQuantizationRecord(out_qs=[node.value_quantization],
                                                  constants_are_quantized=True)
        else:
            qrec = MultConstantQuantizationRecord(out_qs=[o_q],
                                                  constants_are_quantized=False)
    elif isinstance(node, (FcParameters, Conv2DParameters)):
        weights_q = SymmetricMultQType.from_array(arr=node.weights,
                                                  quantized_dimension=self.get_quantized_dimension(
                                                      node),
                                                  dtype=dtype, narrow_range=self._narrow_weights)
        if node.has_bias:
            # biases use the product of the weight and input scales
            biases_q = SymmetricMultBiasesQType(
                dtype=np.int32, scale=weights_q.scale * in_qs[0].scale)
        else:
            biases_q = SymmetricMultBiasesQType(
                dtype=np.int32, scale=np.array([1], dtype=np.int32))
        mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, node)
        qrec = MultScalableFilterQuantizationRecord(in_qs=[in_qs[0]],
                                                    out_qs=[o_q],
                                                    weights_q=weights_q,
                                                    biases_q=biases_q,
                                                    mul_biases_q=mul_biases_q,
                                                    constants_are_quantized=False)
        LOG.debug("filter %s qrec %s", node.name, qrec)
    elif isinstance(node, RNNParameters):
        # constant inputs of the node keyed by their RNN input name
        input_nodes = {RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                       for edge in G.in_edges(node.name)
                       if isinstance(edge.from_node, ConstantInputParameters)}
        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        opts = self.get_options(node)
        # NOTE(review): any other value of opts['mode'] leaves qrec unbound
        # and the final return fails - confirm the option is validated upstream
        if opts['mode'] == "extended":
            in_w_scale = in_qs[names['i_2_i_w']].scale * in_qs[0].scale
            state_w_scale = in_qs[names['r_2_i_w']].scale
            i_2_a_q = MultMulBiasScaleQType(scale=in_w_scale/state_w_scale)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1/o_q.scale)
            self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
            qrec = MultScalableRnnQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                i_2_a_q=i_2_a_q,
                s_2_s_q=s_2_s_q,
                s_2_o_q=s_2_o_q
            )
        elif opts['mode'] == 'autotiler':
            # input, output and state share the larger of the in/out scales
            in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
            in_and_state_w_scale = np.maximum(
                in_qs[names['i_2_i_w']].scale, in_qs[names['r_2_i_w']].scale)
            in_qs[0].scale = in_and_state_scale
            o_q.scale = in_and_state_scale
            self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
            self.rescale_constant(input_nodes['i_2_i_w'], in_and_state_w_scale)
            self.rescale_constant(input_nodes['r_2_i_w'], in_and_state_w_scale)
            state_w_scale = in_and_state_scale * in_and_state_w_scale
            self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale/in_and_state_scale)
            qrec = MultScalableRnnQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                s_2_s_q=s_2_s_q,
            )
    elif isinstance(node, LSTMParameters):
        # constant inputs of the node keyed by their LSTM input name
        input_nodes = {LSTMParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                       for edge in G.in_edges(node.name)
                       if isinstance(edge.from_node, ConstantInputParameters)}
        names = {val: idx for idx, val in enumerate(LSTMParameters.INPUT_NAMES)}
        # the cell range comes from the clip value when set, else from stats
        if node.cell_clip:
            cell_max = node.cell_clip
        else:
            cell_max = max(abs(astats['range_cell'][var]) for var in ['min', 'max'])

        cell_int_bits = calc_bits(cell_max)

        in_qs[names['c_state']].recalculate_scale(-cell_max,
                                                  cell_max)
        LOG.debug("cell bits %d max %d cell range %d",
                  cell_int_bits,
                  cell_max,
                  in_qs[names['c_state']].range)
        # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
        # but also (internal_q * 2) + cell_bits = 32
        int_q = min((32-cell_int_bits)//2, 10)
        # in and out and state are all in the same scale
        in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
        in_and_state_scale = np.maximum(in_and_out_scale, in_qs[names['i_state']].scale)
        in_qs[0].scale = in_and_state_scale
        o_q.scale = in_and_state_scale
        self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
        # input and recurrent weights of a gate share the larger scale
        scale_pairs = {chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
                       for chan in ['i', 'o', 'c', 'f']}
        scales = {k: np.maximum(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                  for k, (namei, namer) in scale_pairs.items()}
        for k, (namei, namer) in scale_pairs.items():
            self.rescale_constant(input_nodes[namei], scales[k])
            self.rescale_constant(input_nodes[namer], scales[k])
        int_scale = pow(2, -int_q)
        int2_scale = pow(2, -(int_q*2))
        int3_scale = pow(2, -(int_q*3))
        # compute scales for perceptrons
        pscales = {k: scales[k] * in_and_state_scale for k in ['i', 'o', 'c', 'f']}
        scale_qtypes = {"r_2_%s_q" % k: MultMulBiasScaleQType(
            scale=pscale/int_scale) for k, pscale in pscales.items()}
        scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(
            scale=in_qs[names['c_state']].scale/int_scale)
        # TODO - Check cell clip here
        scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
            scale=int2_scale/in_qs[names['c_state']].scale)
        scale_qtypes['state_out_q'] = MultMulBiasScaleQType(scale=int3_scale/in_and_state_scale)
        # set internal scale
        scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
        # set biases to output of perceptron
        for k in ['i', 'o', 'c', 'f']:
            self.rescale_constant(input_nodes["%s_b" % k], pscales[k], dtype=np.int32)
        qrec = MultScalableLstmQuantizationRecord(
            in_qs=in_qs,
            out_qs=[o_q],
            **scale_qtypes,
        )
    else:
        # default: plain record with the computed output qtype
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    return qrec
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Build the POW2 symmetric quantization record for a filter node.

    The accumulator is 31 bits wide. When the integer part required by the
    accumulator/output ranges does not fit next to the fractional bits of
    input and weights, precision is removed from both proportionally.

    Args:
        params: filter node parameters.
        in_qs: input qtypes; only the first one is used (a copy is scaled to
            a power of two).
        stats: node statistics; ``range_out`` and optionally ``range_acc``
            are read.
        **kwargs: must contain ``G`` and ``opts`` (``opts['pow2_biases']`` is
            read); ``fusion`` and ``all_stats`` are read for fused nodes.

    Returns:
        The record produced by ``QRec.symmetric`` or ``None`` when the
        computed output scale is larger than the forced one.

    Raises:
        ValueError: more than 75% of the fractional bits would be lost.
    """
    force_out_qs, params_dtype = cls.get_pow2_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    fusion = kwargs.get('fusion', None)
    pow2_biases = kwargs.get('opts')['pow2_biases']
    G = kwargs['G']
    weights_node, biases_node = cls.get_weights_and_biases_nodes(
        G, fusion or params)

    range_acc = stats.get('range_acc', stats['range_out'][0])
    conv_active = fusion and fusion.fusion_type in (
        'conv_active_pool', 'conv_active')
    int_dtype = np.int32
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')

    if conv_active:
        # Take stats from activation after the convolution
        activation_id = NodeId(fusion, fusion.contained_nodes()[1])
        range_out = kwargs['all_stats'][activation_id]['range_out'][0]
        out_dtype = np.int32
    else:
        out_dtype = params_dtype
        range_out = stats['range_out'][0]

    in_q = deepcopy(in_qs[0]).scale_to_pow2()
    calc_width = 31

    o_q = QType.from_min_max_pow2(range_out['min'],
                                  range_out['max'],
                                  dtype=out_dtype)
    if force_out_q and o_q.scale > force_out_q.scale:
        return None

    weights_q = QType.from_array_pow2(arr=weights_node.dqvalue,
                                      dtype=params_dtype)
    calc_q = in_q.q + weights_q.q

    acc_bits = calc_bits(range_acc['max'], range_acc['min'])
    act_bits = calc_bits(range_out['min'], range_out['max'])
    needed_int_bits = max(acc_bits, act_bits)

    available_int_bits = calc_width - calc_q
    if available_int_bits < needed_int_bits:
        # we don't have enough space for the integer portion so reduce the precision of
        # the weights and input
        missing_bits = needed_int_bits - available_int_bits
        if missing_bits > calc_q * 0.75:
            raise ValueError(
                f'Quantizing {params.name} at this precision will loose more than 75% of fractional part'
            )

        # share the reduction in proportion to each operand's fractional bits
        prec_inp = min(math.floor(0.5 + missing_bits * in_q.q / calc_q),
                       in_q.q)
        prec_w = min(math.floor(0.5 + missing_bits * weights_q.q / calc_q),
                     weights_q.q)
        # anything left over after rounding is taken from the weights
        left = missing_bits - prec_inp - prec_w
        if left > 0:
            prec_w += left
        LOG.warning(
            'reducing weight and input precision (%s, %s) in %s to satisfy quantization constraints',
            prec_w, prec_inp, params.name)
        weights_q.q = weights_q.q - prec_w
        in_q.q = in_q.q - prec_inp
        calc_q = in_q.q + weights_q.q

    c_q = acc_q = QType(bits=calc_width, q=calc_q, signed=True)

    if conv_active:
        o_q = c_q

    if pow2_biases == 0:
        biases_dtype = params_dtype
    else:
        biases_dtype = {8: np.int8, 16: np.int16}.get(pow2_biases, np.int32)

    biases_q = QType.from_array_pow2(arr=biases_node.dqvalue,
                                     dtype=biases_dtype)
    # make sure that the biases are not stored more precisily than the accumulator. It's pointless and will
    # cause a negative shift
    if biases_q.q > acc_q.q:
        biases_q.q = acc_q.q

    has_mul_bias = isinstance(
        params, MultiplicativeBiasParameters) and params.has_mul_bias
    mb_q = QType.from_array_pow2(
        arr=params.mul_biases, dtype=int_dtype) if has_mul_bias else None
    return QRec.symmetric(in_qs=[in_q, weights_q, biases_q],
                          out_qs=[o_q],
                          calc_q=c_q,
                          acc_q=acc_q,
                          mul_biases_q=mb_q)
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Build the POW2 symmetric quantization record for a filter node.

    The accumulator is 32 bits wide. When the integer part required by the
    accumulator/output ranges does not fit, fractional bits are removed from
    the weights only.

    Args:
        params: filter node parameters.
        in_qs: input qtypes; only the first one is used (a copy is scaled to
            a power of two).
        stats: node statistics; ``range_out`` and ``range_acc`` are read.
        **kwargs: must contain ``G`` and ``opts`` (``opts['pow2_biases']`` is
            read); ``fusion`` and ``all_stats`` are read for fused nodes.

    Returns:
        The record produced by ``QRec.symmetric``.
    """
    force_out_qs, params_dtype = cls.get_pow2_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    fusion = kwargs.get('fusion', None)
    pow2_biases = kwargs.get('opts')['pow2_biases']
    G = kwargs['G']
    weights_node, biases_node = cls.get_weights_and_biases_nodes(
        G, fusion or params)

    range_acc = stats['range_acc']
    conv_active = fusion and fusion.fusion_type in (
        'conv_active_pool', 'conv_active')
    int_dtype = np.int32
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')

    if conv_active:
        # Take stats from activation after the convolution
        activation_id = NodeId(fusion, fusion.contained_nodes()[1])
        range_out = kwargs['all_stats'][activation_id]['range_out'][0]
        out_dtype = np.int32
    else:
        out_dtype = params_dtype
        range_out = stats['range_out'][0]

    in_q = deepcopy(in_qs[0]).scale_to_pow2()
    calc_width = 32

    # a forced output qtype wins over the one derived from the statistics
    if force_out_q:
        o_q = force_out_q
    else:
        o_q = QType.from_min_max_pow2(range_out['min'],
                                      range_out['max'],
                                      dtype=out_dtype)
    weights_q = QType.from_array_pow2(arr=weights_node.dqvalue,
                                      dtype=params_dtype)
    calc_q = in_q.q + weights_q.q

    acc_bits = calc_bits(range_acc['max'], range_acc['min'])
    act_bits = calc_bits(range_out['min'], range_out['max'])
    needed_int_bits = max(acc_bits, act_bits)

    available_int_bits = calc_width - calc_q
    if available_int_bits < needed_int_bits:
        # we don't have enough space for the integer portion so reduce the precision of
        # the weights
        missing_bits = needed_int_bits - available_int_bits
        # TODO - This needs improving
        assert weights_q.q >= missing_bits, "no space in weights to reduce precision"
        LOG.warning(
            'reducing weight precision in %s to satisfy quantization constraints',
            params.name)
        weights_q.q -= missing_bits
        calc_q = in_q.q + weights_q.q

    c_q = acc_q = QType(bits=calc_width, q=calc_q, signed=True)

    if conv_active:
        o_q = c_q

    if pow2_biases == 0:
        biases_dtype = params_dtype
    else:
        biases_dtype = {8: np.int8, 16: np.int16}.get(pow2_biases, np.int32)

    biases_q = QType.from_array_pow2(arr=biases_node.dqvalue,
                                     dtype=biases_dtype)
    # make sure that the biases are not stored more precisily than the accumulator. It's pointless and will
    # cause a negative shift
    if biases_q.q > acc_q.q:
        biases_q.q = acc_q.q

    has_mul_bias = isinstance(
        params, MultiplicativeBiasParameters) and params.has_mul_bias
    mb_q = QType.from_array_pow2(
        arr=params.mul_biases, dtype=int_dtype) if has_mul_bias else None
    return QRec.symmetric(in_qs=[in_q, weights_q, biases_q],
                          out_qs=[o_q],
                          calc_q=c_q,
                          acc_q=acc_q,
                          mul_biases_q=mb_q)
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Build the scaled quantization record for an LSTM with int16 state.

    Input qtypes are deep-copied before being modified. State and cell are
    quantized to int16, weights to int8, biases to int32 in the internal
    Q12 scale, and per-gate rescaling qtypes are derived from the input,
    state and weight scales.

    Args:
        params: LSTM node parameters.
        in_qs: input qtypes indexed like ``LSTMParameters.INPUT_NAMES``.
        stats: node statistics; ``range_out``, ``range_cell`` and the
            ``range_<gate>_gate_i`` / ``range_<gate>_gate_r`` entries are
            read.
        **kwargs: must contain ``G``; ``opts`` supplies ``weight_bits`` and
            optionally ``narrow_weights``.

    Returns:
        The record produced by ``QRec.scaled`` or ``None`` when the inputs
        cannot be forced symmetric or the forced output has a non-zero zero
        point.

    Raises:
        ValueError: ``stats`` has no ``range_cell`` entry.
    """
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    force_out_q = force_out_qs and force_out_qs[0]
    in_qs = cls.force_symmetric_and_dtype(in_qs, idx=0, dtype=np.int16)
    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    G = kwargs['G']
    opts = kwargs.get('opts', {})
    cls.check_valid_ranges(params, stats, idx=0, dirs='out')

    names = {
        val: idx
        for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
    }

    # the state input and the output share the same qtype
    o_q = in_qs[names['i_state']] = QType.from_min_max_sq(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=np.int16)
    if force_out_q:
        if force_out_q.zero_point != 0:
            return None
        LOG.warning(
            'on node %s output is being forced from scale %s -> %s',
            params.name, o_q.scale, force_out_qs[0].scale)
        o_q = force_out_qs[0]

    cell_range = stats.get('range_cell')
    if cell_range is None:
        raise ValueError(
            f'cell range not present in stats for {params.name}')

    # cell range in minimum 1.0
    cell_stat = max(1.0, *[abs(cell_range[var]) for var in ['min', 'max']])
    if params.cell_clip and not params.quant_c_state_with_stat:
        cell_max = params.cell_clip
        ratio_c = cell_max / cell_stat
        if not (ratio_c > 0.9 and ratio_c < 1.1):
            msg = (
                f"C state is forced to a range [-{cell_max}:{cell_max}] different to the one calulated "
                f"from the inference statistic [-{cell_stat}:{cell_stat}], consider using nodeoption {params.name} "
                "QUANT_C_STATE_WITH_STAT 1 to force it to be the one calculated"
            )
            LOG.warning('%s', msg)
    else:
        cell_max = cell_stat

    # this limit is driven by the c_in * f + c * i calculation
    # c * i will be in Q24 and we want c_in * f to be scaled to the same
    # abs(f) will be <=1 so the cell int bits cannot exceed 31 - 1 (overflow) - 24 = 6
    cell_limit = pow(2, 6)
    if cell_max > cell_limit:
        LOG.warning('Cell state exceeds %s and will be clipped', cell_limit)
        cell_max = cell_limit

    cell_int_bits = calc_bits(cell_max)
    in_qs[names['c_state']] = QType.from_min_max_sq(-cell_max,
                                                    cell_max,
                                                    dtype=np.int16)

    LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
              in_qs[names['c_state']].range)

    # set weight qtypes
    edges = G.indexed_in_edges(params.name)
    gates = ['i', 'o', 'c', 'f']
    scale_pairs = {
        chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
        for chan in gates
    }
    for scale_pair in scale_pairs.values():
        in_q = in_qs[names[scale_pair[0]]]
        in_qs[names[scale_pair[0]]] = QType.from_min_max_sq(
            in_q.min_val,
            in_q.max_val,
            dtype=np.int8,
            narrow_range=opts.get('narrow_weights'),
            dont_generate_value=True)
        in_qs[names[scale_pair[0]]].bits = opts['weight_bits']
        in_q = in_qs[names[scale_pair[1]]]
        in_qs[names[scale_pair[1]]] = QType.from_min_max_sq(
            in_q.min_val,
            in_q.max_val,
            dtype=np.int8,
            narrow_range=opts.get('narrow_weights'),
            concatenated_nodes=[
                edges[names[scale_pair[0]]].from_node.name
            ])
        in_qs[names[scale_pair[1]]].bits = opts['weight_bits']

    # get weight scales
    w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                for namei, namer in scale_pairs.values()]

    gate_sum_max = [(get_max(stats[f'range_{gate}_gate_i']),
                     get_max(stats[f'range_{gate}_gate_r']))
                    for gate in gates]

    # bits needed by the input and recurrent accumulators of every gate
    gate_sum_max_bits = [
        (np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
         np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
        for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)
    ]

    for gate, (max_i, max_r) in zip(gates, gate_sum_max_bits):
        if max_i > 30:
            LOG.warning(
                'max bits in accumulation input %s gate %s - there may be errors',
                max_i, gate)
        if max_r > 30:
            # report the recurrent accumulator width (max_i was logged here)
            LOG.warning(
                'max bits in accumulation state %s gate %s - there may be errors',
                max_r, gate)

    # LUT activations Q12 -> Q15
    act_in_q = 12
    act_out_q = 15
    int_scale = math.pow(2, -act_in_q)
    out_tanh_sig_scale = math.pow(2, -act_out_q)

    scale_qtypes = {}
    r_pscales = {}
    i_pscales = {}
    scale_qtypes['r_pscales'] = r_pscales
    scale_qtypes['i_pscales'] = i_pscales
    for gate, w_scale, max_bits in zip(gates, w_scales, gate_sum_max_bits):
        # TODO - decide to scale weights equal
        in_qs[names[f"{gate}_b"]] = QType(scale=int_scale, dtype=np.int32)

        i_pscales[gate] = w_scale[0] * in_qs[0].scale
        scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
            scale=i_pscales[gate] / int_scale)
        qscale.pre_normalization = int(max(8 - (31 - max_bits[0]), 0))

        r_pscales[gate] = w_scale[1] * o_q.scale
        scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
            scale=r_pscales[gate] / int_scale)
        qscale.pre_normalization = int(max(8 - (31 - max_bits[1]), 0))

    r_pscales['state_out_scale'] = o_q.scale
    r_pscales['int_scale'] = int_scale

    # ct = c_in * f + c * i
    # c * i = Q15 * Q15 -> Q30 -> norm(18) -> Q12
    # scale(c_in * f) = Q15 * Q15 prenorm 8 and scale -> Q12
    # ((c_in * f) + (c * i)) in Q12
    # scale -> cell_out
    # tan(ct) -> Q15
    # o * tan(ct) -> Q30
    # prenorm and scale

    # cell in to Q12
    cell_in_scale = (in_qs[names['c_state']].scale * out_tanh_sig_scale /
                     int_scale)
    # cell_out from Q12
    cell_out_scale = int_scale / in_qs[names['c_state']].scale
    # state out from Q30
    state_out_scale = math.pow(2, -(2 * act_out_q)) / o_q.scale

    r_pscales['act_out_scale'] = out_tanh_sig_scale
    r_pscales['c_before_scale'] = int_scale

    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # for 16 bit pre-normalize the scales to give us room
    scale_qtypes['cell_in_q'].pre_normalization = 8
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
        scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
        scale=state_out_scale)
    scale_qtypes['state_out_q'].pre_normalization = 8
    scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)

    if params.lstm_output_c_state:
        out_qs = [o_q, in_qs[names['c_state']]]
    else:
        out_qs = [o_q]
    return QRec.scaled(
        in_qs=in_qs,
        out_qs=out_qs,
        **scale_qtypes,
    )
def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
    """Build the multiplicative quantization record for an LSTM node.

    The scales held in ``in_qs`` are modified in place and the constant
    inputs of the node (state, weights, biases) are rescaled through
    ``cls.rescale_constant``. When input and output scales are close
    ``params.rnn_same_inout_scale`` is switched on and the node options
    stored in ``G`` are refreshed.

    Args:
        params: LSTM node parameters.
        in_qs: input qtypes indexed like ``LSTMParameters.INPUT_NAMES``.
        out_dtype: type of the output qtype.
        stats: node statistics; ``range_out`` is read, and ``range_cell``
            when ``params.cell_clip`` is not set.
        **kwargs: must contain ``qrecs`` and ``G``.

    Returns:
        A ``MultScalableLstmQuantizationRecord``.
    """
    qrecs = kwargs['qrecs']
    G = kwargs['G']
    o_q = SymmetricMultQType.from_min_max(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=out_dtype)
    # constant inputs of the node keyed by their LSTM input name
    input_nodes = {
        LSTMParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
        for edge in G.in_edges(params.name)
        if isinstance(edge.from_node, ConstantInputParameters)
    }
    names = {
        val: idx
        for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
    }
    # the cell range comes from the clip value when set, else from stats
    if params.cell_clip:
        cell_max = params.cell_clip
    else:
        cell_max = max(
            abs(stats['range_cell'][var]) for var in ['min', 'max'])

    cell_int_bits = calc_bits(cell_max)

    in_qs[names['c_state']].recalculate_scale(-cell_max, cell_max)
    LOG.debug("cell bits %d max %d cell range %d", cell_int_bits, cell_max,
              in_qs[names['c_state']].range)
    int2_scale = int3_scale = out_tanh_sig_scale = None
    if params.hard_act:
        # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
        # but also (internal_q * 2) + cell_bits = 32
        int_q = min((16 - cell_int_bits), 10)
        int2_scale = math.pow(2, -(int_q * 2))
        int3_scale = math.pow(2, -(int_q * 3))
    else:
        int_q = 12
        out_tanh_sig_scale = math.pow(
            2, -15)  # output of LUT activations are always Q15
    int_scale = math.pow(2, -int_q)

    # close input/output scales select the same-scale kernel
    if np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
        LOG.info(
            "node %s has similar input and i_state scales --> "
            "will be generated the same_scale kernel with better performances",
            params.name)
        params.rnn_same_inout_scale = True
        G.node_options[NodeId(params)] = params.at_options

    if params.rnn_same_inout_scale:
        # the option may have been set beforehand with differing scales
        if not np.isclose(in_qs[0].scale, o_q.scale, atol=1e-2):
            LOG.warning(
                "node %s has different input and i_state scales consider using the "
                "LSTM kernel with rnn_same_inout_scale=False (better accuracy)",
                params.name)
        # in and out and state are all in the same scale
        in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
        i_state_scale = in_scale = np.maximum(
            in_and_out_scale, in_qs[names['i_state']].scale)
        in_qs[0].scale = in_scale
        o_q.scale = in_scale
        cls.rescale_constant(input_nodes['i_state'], i_state_scale, qrecs)
    else:
        # the input keeps its scale, output and state share the larger one
        in_scale = in_qs[0].scale
        i_state_scale = np.maximum(o_q.scale,
                                   in_qs[names['i_state']].scale)
        o_q.scale = i_state_scale
    # input and recurrent weights of a gate share the larger scale
    scale_pairs = {
        chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
        for chan in ['i', 'o', 'c', 'f']
    }
    scales = {
        k: np.maximum(in_qs[names[namei]].scale,
                      in_qs[names[namer]].scale)
        for k, (namei, namer) in scale_pairs.items()
    }
    for k, (namei, namer) in scale_pairs.items():
        cls.rescale_constant(input_nodes[namei], scales[k], qrecs)
        cls.rescale_constant(input_nodes[namer], scales[k], qrecs)
    # compute scales for perceptrons
    pscales = {k: scales[k] * i_state_scale for k in ['i', 'o', 'c', 'f']}
    scale_qtypes = {
        "r_2_%s_q" % k: MultMulBiasScaleQType(scale=pscale / int_scale)
        for k, pscale in pscales.items()
    }

    # if input and i_state have different scales -> scale the inputs before sum
    # otherwise do nothing and these scales will be ignored
    scale_qtypes.update({
        "i_2_%s_q" % k:
        MultMulBiasScaleQType(scale=in_scale / i_state_scale)
        for k in ['i', 'o', 'c', 'f']
    })

    if params.hard_act:
        cell_in_scale = in_qs[names['c_state']].scale / int_scale
        cell_out_scale = int2_scale / in_qs[names['c_state']].scale
        state_out_scale = int3_scale / i_state_scale
    else:
        cell_in_scale = in_qs[
            names['c_state']].scale * out_tanh_sig_scale / int_scale
        cell_out_scale = int_scale / in_qs[names['c_state']].scale
        state_out_scale = out_tanh_sig_scale / i_state_scale

    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # TODO - Check cell clip here
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
        scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
        scale=state_out_scale)
    # set internal scale
    scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
    # set biases to output of perceptron
    for gate in ['i', 'o', 'c', 'f']:
        cls.rescale_constant(input_nodes["%s_b" % gate],
                             pscales[gate],
                             qrecs,
                             dtype=np.int32)
    return MultScalableLstmQuantizationRecord(
        in_qs=in_qs,
        out_qs=[o_q],
        **scale_qtypes,
    )
def _quantize_lstm(cls, params, in_qs, stats, input_bits, **kwargs):
    """Build the scaled quantization record for an LSTM node (NE16 variant).

    Args:
        params: LSTM node parameters. Reads ``name``, ``cell_clip``,
            ``quant_c_state_with_stat``, ``n_inputs``, ``n_states`` and
            ``lstm_output_c_state``.
        in_qs: Input quantization types, indexed in the order given by
            ``LSTMParameters.INPUT_NAMES``. The list is deep-copied before
            any entry is replaced, so the caller's list is not modified.
        stats: Mapping holding ``'range_out'``, ``'range_cell'`` and the
            per-gate ``'range_<gate>_gate_i'`` / ``'range_<gate>_gate_r'``
            entries.
        input_bits: ``16`` selects uint16 input/state and an int16 cell
            state; any other value selects uint8 / int8.
        **kwargs: Must contain ``'G'`` (the graph). ``'opts'`` is read with
            ``kwargs.get`` but its ``'narrow_state'``, ``'narrow_weights'``
            and ``'weight_bits'`` keys are indexed directly, so they are
            required in practice.

    Returns:
        The record produced by ``QRec.scaled(...)``, or ``None`` when any
        forced output qtype is requested or when ``in_qs`` is ``None``.

    Raises:
        ValueError: if ``stats`` has no ``'range_cell'`` entry.
    """
    gates = ['i', 'o', 'c', 'f']

    force_out_qs, _ = cls.get_mult_opts(**kwargs)
    # forced output qtypes are not supported by this quantizer
    if force_out_qs and any(forced is not None for forced in force_out_qs):
        return None

    opts = kwargs.get('opts', {})
    in_out_dtype = np.uint16 if input_bits == 16 else np.uint8

    if in_qs is None:
        return None
    in_qs = deepcopy(in_qs)
    G = kwargs['G']
    in_q = in_qs[0]

    cls.check_valid_ranges(params, stats, idx=0, dirs='out')

    in_edges = G.indexed_in_edges(params.name)

    names = {
        val: idx
        for idx, val in enumerate(LSTMParameters.INPUT_NAMES)
    }

    # the output and the input state share one qtype built from the
    # recorded output range
    o_q = in_qs[names['i_state']] = QType.from_min_max_sq(
        min_val=stats['range_out'][0]['min'],
        max_val=stats['range_out'][0]['max'],
        dtype=in_out_dtype,
        narrow_range=opts['narrow_state'])

    cell_range = stats.get('range_cell')
    if cell_range is None:
        raise ValueError(
            f'cell range not present in stats for {params.name}')

    # cell range is at minimum 1.0
    cell_stat = max(1.0, *[abs(cell_range[var]) for var in ['min', 'max']])
    if params.cell_clip and not params.quant_c_state_with_stat:
        cell_max = params.cell_clip
        ratio_c = cell_max / cell_stat
        if not 0.9 < ratio_c < 1.1:
            msg = (
                f"C state is forced to a range [-{cell_max}:{cell_max}] different to the one calulated "
                f"from the inference statistic [-{cell_stat}:{cell_stat}], consider using nodeoption {params.name} "
                "QUANT_C_STATE_WITH_STAT 1 to force it to be the one calculated"
            )
            LOG.warning('%s', msg)
    else:
        cell_max = cell_stat

    # this limit is driven by the c_in * f + c * i calculation
    # c * i will be in Q24 and we want c_in * f to be scaled to the same
    # abs(f) will be <=1 so the cell int bits cannot exceed
    # 31 - 1 (overflow) - 24 = 6
    cell_limit = pow(2, 6)
    if cell_max > cell_limit:
        LOG.warning('Cell state exceeds %s and will be clipped', cell_limit)
        cell_max = cell_limit

    cell_int_bits = calc_bits(cell_max)
    # cell stays signed since it is used in a Hadamard product with the
    # int32 streamout in NE16
    in_qs[names['c_state']] = QType.from_min_max_sq(
        -cell_max, cell_max,
        dtype=np.int16 if input_bits == 16 else np.int8)

    LOG.debug("cell bits %d max %d cell range %d",
              cell_int_bits, cell_max, in_qs[names['c_state']].range)

    # set weight qtypes
    int_num_inp = roundup(params.n_inputs, input_bits == 16)
    int_num_states = roundup(params.n_states, input_bits == 16)
    woffs = {}

    in_q = limit_input_precision(
        params, input_bits, in_q, int_num_inp,
        opts['narrow_weights'], opts['weight_bits'])
    o_q = limit_input_precision(
        params, input_bits, o_q, int_num_states,
        opts['narrow_weights'], opts['weight_bits'],
        extra_correction=-1 if opts.get('narrow_state') else 0)

    for gate in gates:
        i_idx = names[f'i_2_{gate}_w']
        r_idx = names[f'r_2_{gate}_w']

        # woffs[gate] holds [input-weight result, recurrent-weight result]
        woffs[gate] = woff_gate = [None, None]
        woff_gate[0] = calculatate_weight_q(
            in_qs, in_edges, i_idx, in_q.zero_point[0],
            (params.n_states, params.n_inputs),
            (params.n_states, int_num_inp),
            opts['weight_bits'], opts.get('narrow_weights'))
        woff_gate[1] = calculatate_weight_q(
            in_qs, in_edges, r_idx, o_q.zero_point[0],
            (params.n_states, params.n_states),
            (params.n_states, int_num_states),
            opts['weight_bits'], opts.get('narrow_weights'))

    # get weight scales
    scale_pairs = {
        chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
        for chan in gates
    }
    w_scales = [(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                for namei, namer in scale_pairs.values()]

    gate_sum_max = [(get_max_or_one(stats[f'range_{gate}_gate_i']),
                     get_max_or_one(stats[f'range_{gate}_gate_r']))
                    for gate in gates]

    # bits needed by the input and recurrent accumulators of every gate
    gate_sum_max_bits = [
        (np.ceil(np.log2(gsm_i / (in_qs[0].scale * i_w))),
         np.ceil(np.log2(gsm_r / (o_q.scale * r_w))))
        for (gsm_i, gsm_r), (i_w, r_w) in zip(gate_sum_max, w_scales)
    ]

    for gate, (max_i, max_r) in zip(gates, gate_sum_max_bits):
        if np.max(max_i) > 30:
            LOG.warning(
                'max bits in accumulation input %s gate %s - there may be errors',
                max_i, gate)
        if np.max(max_r) > 30:
            # report the recurrent (state) width that triggered the warning
            LOG.warning(
                'max bits in accumulation state %s gate %s - there may be errors',
                max_r, gate)

    # LUT activations Q12 -> Q15
    act_in_q = 12
    act_out_q = 15
    int_scale = math.pow(2, -act_in_q)
    out_tanh_sig_scale = math.pow(2, -act_out_q)

    scale_qtypes = {}
    r_pscales = {}
    i_pscales = {}
    scale_qtypes['r_pscales'] = r_pscales
    scale_qtypes['i_pscales'] = i_pscales

    for gate, w_scale in zip(gates, w_scales):
        # TODO - decide to scale weights equal
        i_pscales[gate] = w_scale[0] * in_q.scale
        r_pscales[gate] = w_scale[1] * o_q.scale

        # input path: 16 bit scales straight to the internal scale, otherwise
        # the input product is scaled to the recurrent product's scale
        if input_bits == 16:
            scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=i_pscales[gate] / int_scale)
            i_zp_b = woffs[gate][0]
        else:
            scale_qtypes[f"i_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
                scale=i_pscales[gate] / r_pscales[gate])
            i_zp_b = woffs[gate][0] * qscale.qbiases.astype(np.int32) + (
                1 << (qscale.qnorms.astype(np.int32) - 1))

        # recurrent path: always scaled to the internal scale
        scale_qtypes[f"r_2_{gate}_q"] = qscale = MultMulBiasScaleQType(
            scale=r_pscales[gate] / int_scale)
        if input_bits == 16:
            r_zp_b = woffs[gate][1]
            in_qs[names[f'{gate}_b']] = QType(
                dtype=np.int32,
                scale=r_pscales[gate],
                offset=r_zp_b,
                interleaved_values=[i_zp_b])
        else:
            r_zp_b = woffs[gate][1] * qscale.qbiases.astype(np.int32) + (
                1 << (qscale.qnorms.astype(np.int32) - 1))
            in_qs[names[f'{gate}_b']] = QType(
                dtype=np.int32,
                scale=r_pscales[gate] / qscale.qbiases,
                offset=r_zp_b,
                interleaved_values=[i_zp_b])

    # NOTE - for 16 bit pre-normalize the scales to give us room but make
    # sure it isn't negative
    if input_bits == 16:
        gate_prenorm = min(
            np.min([
                np.min(scale_qtypes[f"{inp}_2_{gate}_q"].qnorms)
                for gate in gates
                for inp in ['i', 'r']
            ]), 8)
        for gate in gates:
            for inp in ['i', 'r']:
                scale_qtypes[
                    f"{inp}_2_{gate}_q"].pre_normalization = gate_prenorm
    else:
        gate_prenorm = 0

    r_pscales['state_out_scale'] = o_q.scale
    r_pscales['int_scale'] = int_scale

    # ct = c_in * f + c * i
    # c * i = Q15 * Q15 -> Q30 -> norm(18) -> Q12
    # scale(c_in * f) = Qcell * Q15 (prenorm if 16bit) and scale -> Q12
    # ((c_in * f) + (c * i)) in Q12
    # scale -> cell_out
    # tan(ct) -> Q15
    # o * tan(ct) -> Q30
    # prenorm and scale

    # scale result of c_state_1 * f_gate -> Q15
    # NOTE(review): the out_tanh_sig_scale factors cancel, so this equals the
    # c_state scale - confirm that this is the intended rescale.
    cell_in_scale = (in_qs[names['c_state']].scale *
                     out_tanh_sig_scale / out_tanh_sig_scale)

    # cell_out from Q15 -> Q7/Q15 scaled
    cell_out_scale = out_tanh_sig_scale / in_qs[names['c_state']].scale

    state_out_scale = out_tanh_sig_scale / o_q.scale

    r_pscales['act_out_scale'] = out_tanh_sig_scale
    r_pscales['c_before_scale'] = int_scale

    scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(scale=cell_in_scale)
    # NOTE - for 16 bit pre-normalize the scales to give us room
    if input_bits == 16:
        scale_qtypes['cell_in_q'].pre_normalization = 8
    scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
        scale=cell_out_scale)
    scale_qtypes['state_out_q'] = MultMulBiasScaleQType(
        scale=state_out_scale)
    scale_qtypes['i_qtype'] = QType(q=act_in_q, dtype=np.int32)

    # optionally expose the cell state as a second output
    if params.lstm_output_c_state:
        out_qs = [o_q, in_qs[names['c_state']]]
    else:
        out_qs = [o_q]

    return QRec.scaled(
        in_qs=in_qs,
        out_qs=out_qs,
        ne16=True,
        gate_prenorm=gate_prenorm,
        **scale_qtypes,
    )