def _quantize(cls, params, in_qs, stats, **kwargs):
    """Build a record in which every input and the single output share a qtype.

    Args:
        params: node parameters; only ``params.name`` is read, for the
            assertion message.
        in_qs: qtypes proposed for the inputs.
        stats: not used by this handler.
        **kwargs: passed to ``cls.get_mult_opts``; the ``backwards`` entry
            selects backward propagation.

    Returns:
        A ``MultQuantizationRecord``, or ``None`` when going forwards with a
        forced output qtype that differs from at least one input qtype.
    """
    forced_qs, _ = cls.get_mult_opts(**kwargs)
    forced_q = forced_qs and forced_qs[0]

    if kwargs.get('backwards'):
        # going backwards only makes sense when the output qtype is imposed
        assert forced_q, f'going backwards at {params.name} but output is not forced'
        # NOTE: the very same forced qtype object is repeated for each input
        return MultQuantizationRecord(in_qs=[forced_qs[0]] * len(in_qs),
                                      out_qs=[deepcopy(forced_qs[0])])

    # going forwards with a forced output: cannot be satisfied unless every
    # input already has that qtype
    if forced_q and any(not (in_q == forced_q) for in_q in in_qs):
        return None

    # identical input qtypes are simply passed through to the output
    if all(in_qs[0] == other_q for other_q in in_qs[1:]):
        return MultQuantizationRecord(in_qs=in_qs, out_qs=[deepcopy(in_qs[0])])

    # the output cannot be forced at this point; the inputs differ, so all of
    # them are forced to the input qtype that has the largest scale
    widest_idx = max(range(len(in_qs)), key=lambda idx: in_qs[idx].scale)
    widest_q = in_qs[widest_idx]
    return MultQuantizationRecord(in_qs=[widest_q] * len(in_qs),
                                  out_qs=[deepcopy(widest_q)])
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Quantize a split node: every output carries the qtype of the input.

    Args:
        params: node parameters; ``params.name`` and ``params.num_splits``
            are read.
        in_qs: input qtypes; the first one is the qtype being split.
        stats: not used by this handler.
        **kwargs: passed to ``cls.get_mult_opts``; ``backwards`` selects
            backward propagation.

    Returns:
        A ``MultQuantizationRecord`` with ``params.num_splits`` output
        qtypes, or ``None`` (after logging an error) when the forced output
        qtypes disagree with each other or, going forwards, with the input.
    """
    input_q = in_qs[0]
    forced_qs, _ = cls.get_mult_opts(**kwargs)

    # only the outputs that actually have a forced qtype take part
    specified = [q for q in forced_qs if q is not None] if forced_qs else []
    forced_q = specified[0] if specified else None

    if forced_q:
        if not all(q == forced_q for q in specified):
            LOG.error(
                'split %s is being forced to have different output qtypes',
                params.name)
            return None

        if kwargs.get('backwards', None):
            # going backwards and forced: the forced qtype becomes our input
            forced_outs = [deepcopy(forced_q) for _ in range(params.num_splits)]
            return MultQuantizationRecord(in_qs=[forced_q], out_qs=forced_outs)

        if input_q != forced_q:
            LOG.error(
                'split %s is being forced to have different output to input',
                params.name)
            return None
        # forced but equal to the input: fall through to the normal case

    split_outs = [deepcopy(input_q) for _ in range(params.num_splits)]
    return MultQuantizationRecord(in_qs=in_qs, out_qs=split_outs)
def common_quantize(cls, in_qtype, out_qtype, node, **kwargs):
    """Import a quantize operation as a constant or as a ``QuantizeParameters`` node.

    Args:
        in_qtype: qtype of the value entering the operation.
        out_qtype: qtype produced by the operation (may be falsy).
        node: imported node; ``name``, ``input`` and ``output`` are read.
        **kwargs: must contain ``all_nodes``, ``opts`` and ``G``.

    Returns:
        The created parameters object.  It is also registered in
        ``all_nodes`` under the node's first output name.
    """
    all_nodes = kwargs['all_nodes']
    opts = kwargs['opts']
    G = kwargs['G']

    resolved_inputs = [all_nodes[tensor] for tensor in node.input]
    source = resolved_inputs[0]

    if cls.is_constant(source):
        # a constant input is folded: no quantize node is created
        LOG.info("reducing %s to a constant", node.name)
        val = source[0].value_as(out_qtype) if out_qtype else cls.get_constant(source)
        params = ConstantInputParameters(node.name,
                                         value=val,
                                         dims=Dim.unnamed(val.shape),
                                         qtype=out_qtype,
                                         constant_store=G.constant_store)
        record_in_q = out_qtype
    else:
        params = QuantizeParameters(node.name, from_qtype=in_qtype, to_qtype=out_qtype)
        G.add_edge(NNEdge(from_node=source[0], to_node=params,
                          from_idx=source[1], to_idx=0))
        record_in_q = in_qtype

    if opts.get('load_quantization'):
        G.quantization[NodeId(params)] = MultQuantizationRecord(
            in_qs=[record_in_q], out_qs=[out_qtype])

    all_nodes[node.output[0]] = (params, 0, deepcopy(source[2]))
    return params
def match(self, G: GraphView, set_identity: bool = True):
    """Fill in missing quantization records from neighbouring nodes.

    Visits every entry of ``G.quantization`` whose record is ``None`` or has
    empty input or output qtypes, and builds a ``MultQuantizationRecord``
    from the qtypes found on the node's incoming and/or outgoing edges.

    Args:
        G: graph whose ``quantization`` mapping is completed in place.
        set_identity: when true, ``self.set_identity(G)`` is called at the end.

    Returns:
        ``None`` when ``G.quantization`` is falsy, otherwise ``False``.

    Raises:
        NotImplementedError: when qtypes are available on one side only and
            the node is not a graph input/output on the other side.
    """
    if not G.quantization:
        return

    # materialised up front because G.quantization is written to in the loop
    incomplete = [
        nid for nid, qrec in G.quantization.sorted_iterator(G)
        if qrec is None or not (qrec.in_qs and qrec.out_qs)
    ]
    for nid in incomplete:
        if nid.fnode_name:
            LOG.warning("can't add quantization to fused node %s", nid.fnode_name)
            continue
        if nid.node_name not in G:
            # previous fusions may have removed nodes from the graph
            continue

        node = nid.get_node(G)
        pred_ids = [NodeId(pred) for pred in G.predecessors(node.name)]
        succ_ids = [NodeId(succ)
                    for succs in G.successors(node.name)
                    for succ in succs]
        go_back = not succ_ids or (
            pred_ids and all(pid in G.quantization for pid in pred_ids))
        go_forward = not pred_ids or (
            succ_ids and all(sid in G.quantization for sid in succ_ids))
        if not (go_back or go_forward):
            LOG.warning("node %s is not connected to anything and has no quantization", node.name)
            continue

        out_qtypes = None
        if go_forward:
            succ_qrecs = {G.quantization[sid] for sid in succ_ids}
            if any(not isinstance(rec, MultQuantizationRecord) for rec in succ_qrecs):
                continue
            out_qtypes = reduce_qtypes(
                [(edge.from_idx, G.quantization[NodeId(edge.to_node)].in_qs[edge.to_idx])
                 for edge in G.out_edges(node.name)])

        in_qtypes = None
        if go_back:
            pred_qrecs = {G.quantization[pid] for pid in pred_ids}
            if any(not isinstance(rec, MultQuantizationRecord) for rec in pred_qrecs):
                continue
            in_qtypes = reduce_qtypes(
                [(edge.to_idx, G.quantization[NodeId(edge.from_node)].out_qs[edge.from_idx])
                 for edge in G.in_edges(node.name)])

        if not in_qtypes:
            if pred_ids:
                raise NotImplementedError("propagating qrecs not implemented")
            LOG.info("setting quantization on input node %s", node.name)
            new_qrec = MultQuantizationRecord(in_qs=deepcopy(out_qtypes),
                                              out_qs=deepcopy(out_qtypes))
        elif not out_qtypes:
            if succ_ids:
                raise NotImplementedError("propagating qrecs not implemented")
            LOG.info("setting quantization on output node %s", node.name)
            new_qrec = MultQuantizationRecord(in_qs=deepcopy(in_qtypes),
                                              out_qs=deepcopy(in_qtypes))
        else:
            LOG.info("setting quantization on node %s", node.name)
            new_qrec = MultQuantizationRecord(in_qs=deepcopy(in_qtypes),
                                              out_qs=deepcopy(out_qtypes))
        G.quantization[nid] = new_qrec

    if set_identity:
        self.set_identity(G)
    return False
def hsigmoid_mult(params, in_tensors, qrec: MultQuantizationRecord, details=None):
    """Run the quantized hard-sigmoid kernel on the first input tensor.

    Args:
        params: node parameters, forwarded to the record helpers.
        in_tensors: input tensors; only the first prepared tensor is used.
        qrec: quantization record supplying input preparation, the
            ``scale_mul_biases_q`` scaler and output handling.
        details: ignored.

    Returns:
        Whatever ``qrec.get_outputs`` returns for the single result tensor.
    """
    del details
    prepared = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    offset, upper_bound, lower_bound = hsigmoid_mult_gen_factors(params, qrec)

    # work in int32, add the offset, then clamp between the two bounds
    shifted = prepared.astype(np.int32) + offset
    clamped = np.minimum(np.maximum(shifted, lower_bound), upper_bound)

    scaled = qrec.scale_mul_biases_q.apply_scales(clamped)
    return qrec.get_outputs(params, [scaled], ktype="symmetric")
def replace_function(self, G: GraphView, subgraph: GraphView):
    """Replace a matched subgraph by one ``ConvFusionParameters`` node.

    Args:
        G: graph being rewritten; its ``quantization`` mapping, when present,
            is updated so the contained nodes' records move under the fusion.
        subgraph: the matched nodes.

    Returns:
        The tuple ``(pnode, None, None)`` where ``pnode`` is the fusion node.

    Raises:
        DontReplaceError: when ``self.validate_match(subgraph)`` is falsy.
    """
    if not self.validate_match(subgraph):
        raise DontReplaceError()

    # number the nodes up to and including the convolution, which names the fusion
    for step, node in enumerate(subgraph.nodes()):
        node.step_idx = step
        if isinstance(node, Conv2DParameters):
            conv_name = node.name + "_fusion"
            break

    LOG.debug("fused nodes %s", ",".join((node.name for node in subgraph.nodes())))
    # simple node order is necessary because nodes() will not necessarily
    # be in order
    pnode = ConvFusionParameters(conv_name, fusion_type=self.fusion_type, subgraph=subgraph)

    if G.quantization:
        qrecs = G.quantization.get_all(pnode.contained_nodes())
        if qrecs:
            # Start from None: previously an unrecognised record class left
            # ``prec`` unbound and raised UnboundLocalError after the records
            # had already been moved under the fusion.
            prec = None
            first, last = qrecs[0], qrecs[-1]
            if isinstance(first, (SymmetricQuantizationRecord,
                                  SymmetricScalableFilterQuantizationRecord)):
                prec = SymmetricQuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            elif isinstance(first, (MultQuantizationRecord,
                                    MultScalableFilterQuantizationRecord)):
                prec = MultQuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            elif isinstance(first, (Float32QuantizationRecord,
                                    Float32ScalableFilterQuantizationRecord)):
                prec = Float32QuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            for node in pnode.contained_nodes():
                G.quantization.move_to_fusion(node, pnode)
            G.quantization[NodeId(pnode)] = prec
    return pnode, None, None
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Quantize an activation node.

    Args:
        params: activation parameters; their class selects special handling.
        in_qs: input qtypes; may be replaced (see below).
        stats: statistics; ``stats['range_out'][0]`` supplies ``min``/``max``
            when the output is not forced.
        **kwargs: passed to ``cls.get_mult_opts``; ``fusion`` is consulted
            when the output qtype is forced.

    Returns:
        A ``MultQuantizationRecord`` with a single output qtype.
    """
    force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
    forced_q = force_out_qs and force_out_qs[0]

    if isinstance(params, (HSwishActivationParameters, HSigmoidActivationParameters)):
        first_q = in_qs[0]
        representable_max = first_q.scale * 2 ** (first_q.bits - 1)
        # widen the input to [-6, 6] when it cannot represent 6
        if representable_max < 6:
            in_qs = [QType.from_min_max_sq(-6, 6, dtype=first_q.dtype)]

    if not forced_q:
        out_range = stats['range_out'][0]
        o_q = QType.from_min_max_sq(out_range['min'], out_range['max'],
                                    dtype=out_dtype)
    else:
        fusion = kwargs.get('fusion', None)
        keeps_own_input = (SigmoidActivationParameters,
                           TanHActivationParameters,
                           HSwishActivationParameters,
                           HSigmoidActivationParameters)
        # inside a conv fusion the forced qtype is pushed to the input as well,
        # except for the activation classes listed above
        if (fusion
                and fusion.fusion_type in ['conv_active_pool', 'conv_active']
                and not isinstance(params, keeps_own_input)):
            in_qs = [deepcopy(forced_q)]
        o_q = deepcopy(forced_q)

    return MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
def replace_function(self, G: GraphView, subgraph: GraphView):
    """Replace a relu / constant / multiply subgraph by a hard-sigmoid activation.

    Args:
        G: graph being rewritten; when it carries quantization, the constant's
            record is deleted and a record for the new activation is stored.
        subgraph: the matched nodes.

    Returns:
        The tuple ``(activation, None, None)``.

    Raises:
        NotImplementedError: when the relu's record is not one of the
            symmetric, mult or float32 record classes.
    """
    relu_node = constant_node = mul_node = None
    for candidate in subgraph.nodes():
        if isinstance(candidate, ReluActivationParameters):
            relu_node = candidate
        elif isinstance(candidate, ConstantInputParameters):
            constant_node = candidate
        elif isinstance(candidate, MatrixMulParameters):
            mul_node = candidate

    activation = HSigmoidActivationParameters(
        mul_node.name + "_fused_close_hsigmoid", offset=0)

    if G.quantization:
        relu_qrec = G.quantization[NodeId(relu_node)]
        mul_qrec = G.quantization[NodeId(mul_node)]
        del G.quantization[NodeId(constant_node)]

        # the fused record spans from the relu's inputs to the multiply's outputs
        for record_cls in (SymmetricQuantizationRecord,
                           MultQuantizationRecord,
                           Float32QuantizationRecord):
            if isinstance(relu_qrec, record_cls):
                fused_qrec = record_cls(in_qs=relu_qrec.in_qs, out_qs=mul_qrec.out_qs)
                break
        else:
            raise NotImplementedError()
        G.quantization[NodeId(activation)] = fused_qrec

    return activation, None, None
def _import_nodes(self, G, graph, handlers, all_nodes, outputs, opts):
    """Run the matching handler on every node of ``graph`` and wire graph outputs.

    Args:
        G: graph under construction.
        graph: imported model graph; ``graph.nodes`` is iterated.
        handlers: mapping from operation name to handler (or, for custom
            operations, to a mapping keyed by custom operation name).
        all_nodes: shared tensor-name lookup handed to the handlers.
        outputs: mapping from tensor name to ``(output_node, input_index)``.
        opts: import options; ``load_quantization`` enables record creation
            for output nodes.

    Raises:
        ValueError: when no handler is registered for a node.
    """
    for node in graph.nodes:
        handler = handlers.get(node.op_name, None)
        if not handler:
            # NOTE(review): lookup uses op_name but the message prints op_type
            raise ValueError("no handler found for %s" % node.op_type)
        if node.is_custom:
            # custom operations have a second level keyed by their own name
            handler = handler.get(node.custom_op_name, None)
            if not handler:
                raise ValueError("no handler found for custom operation %s" %
                                 node.custom_op_name)

        params = handler.handle(node, all_nodes=all_nodes, G=G, opts=opts, importer=self)
        if params is None:
            continue

        for out_idx, tensor_name in enumerate(node.output):
            target = outputs.get(tensor_name)
            if not target:
                continue
            out_node, out_node_idx = target[0], target[1]
            G.add_edge(NNEdge(from_node=params, to_node=out_node,
                              from_idx=out_idx, to_idx=out_node_idx))
            if opts.get('load_quantization'):
                # the output node gets a copy of the producer's output qtype
                qtype = deepcopy(G.quantization[NodeId(params)].out_qs[out_idx])
                G.quantization[NodeId(out_node)] = MultQuantizationRecord(
                    in_qs=[qtype], out_qs=[qtype])
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Quantize a node whose output is given the fixed range [-1.0, 1.0].

    Args:
        params: not used by this handler.
        in_qs: input qtypes, passed through unchanged.
        stats: not used by this handler.
        **kwargs: passed to ``cls.get_mult_opts``.

    Returns:
        A ``MultQuantizationRecord``, or ``None`` when an output qtype is
        forced (this handler cannot honour one).
    """
    forced_qs, out_dtype = cls.get_mult_opts(**kwargs)
    if forced_qs and forced_qs[0]:
        return None
    fixed_range_q = QType.from_min_max_sq(min_val=-1.0, max_val=1.0, dtype=out_dtype)
    return MultQuantizationRecord(in_qs=in_qs, out_qs=[fixed_range_q])
def replace_function(self, G: NNGraph, subgraph: GraphView):
    """Replace a matched subgraph by a ``linear_active`` fusion node.

    Args:
        G: graph being rewritten; its ``quantization`` mapping, when present,
            is updated so the contained nodes' records move under the fusion.
        subgraph: the matched nodes.

    Returns:
        The tuple ``(pnode, None, None)`` where ``pnode`` is the fusion node.
    """
    # number the nodes up to and including the linear layer, which names the fusion
    for step, node in enumerate(subgraph.nodes()):
        node.step_idx = step
        if isinstance(node, FcParameters):
            linear_name = node.name + "_fusion"
            break

    LOG.info("fusing nodes %s", ",".join(
        (node.name for node in subgraph.nodes())))
    # simple node order is necessary because nodes() will not necessarily
    # be in order
    pnode = ConvFusionParameters(linear_name, fusion_type="linear_active", subgraph=subgraph)

    if G.quantization:
        qrecs = G.quantization.get_all(pnode.contained_nodes())
        if qrecs:
            # Start from None: previously an unrecognised record class left
            # ``prec`` unbound and raised UnboundLocalError after the records
            # had already been moved under the fusion.
            prec = None
            first, last = qrecs[0], qrecs[-1]
            if isinstance(first, (SymmetricQuantizationRecord,
                                  SymmetricScalableFilterQuantizationRecord)):
                prec = SymmetricQuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            elif isinstance(first, (MultQuantizationRecord,
                                    MultScalableFilterQuantizationRecord)):
                prec = MultQuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            elif isinstance(first, (Float32QuantizationRecord,
                                    Float32ScalableFilterQuantizationRecord)):
                prec = Float32QuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            for node in pnode.contained_nodes():
                G.quantization.move_to_fusion(node, pnode)
            G.quantization[NodeId(pnode)] = prec
    return pnode, None, None
def replace_function(self, G: NNGraph, subgraph: GraphView):
    """Replace a two-node subgraph by an ``ActivationFusion`` node.

    The fusion is named after the first node of the subgraph, and its fusion
    type is that node's ``op_name`` followed by ``"_active"``.

    Args:
        G: graph being rewritten; its ``quantization`` mapping, when present,
            is updated so the subgraph nodes' records move under the fusion.
        subgraph: the matched nodes; the first two are given step indexes.

    Returns:
        The fusion node.
    """
    nodes = list(subgraph.nodes())
    head = nodes[0]
    pnode = ActivationFusion(head.name + "fusion", head.op_name + "_active", subgraph)
    nodes[0].step_idx = 0
    nodes[1].step_idx = 1
    LOG.debug("fused nodes %s", ",".join((node.name for node in nodes)))

    if G.quantization:
        qrecs = G.quantization.get_all(subgraph.nodes())
        if qrecs:
            # Start from None: previously an unrecognised record class left
            # ``prec`` unbound and raised UnboundLocalError after the records
            # had already been moved under the fusion.
            prec = None
            first, last = qrecs[0], qrecs[-1]
            if isinstance(first, (SymmetricQuantizationRecord,
                                  SymmetricScalableFilterQuantizationRecord)):
                prec = SymmetricQuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            elif isinstance(first, (MultQuantizationRecord,
                                    MultScalableFilterQuantizationRecord)):
                prec = MultQuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            elif isinstance(first, (Float32QuantizationRecord,
                                    Float32ScalableFilterQuantizationRecord)):
                prec = Float32QuantizationRecord(in_qs=first.in_qs, out_qs=last.out_qs)
            for node in subgraph.nodes():
                G.quantization.move_to_fusion(node, pnode)
            G.quantization[NodeId(pnode)] = prec
    return pnode
def quantize_fusion(self, G, node, in_qs, dtype):
    """Quantize the nodes contained in a fusion, chaining their qtypes.

    Each contained node is quantized with ``self.calculate_q``, using the
    previous node's output qtypes as its inputs, and its record is stored in
    ``self.qrecs`` under ``NodeId(node, contained_node)``.

    Args:
        G: graph, forwarded to ``calculate_q``.
        node: fusion node; ``contained_nodes()`` and ``fusion_type`` are read.
        in_qs: input qtypes of the fusion.
        dtype: dtype forwarded to ``calculate_q``.

    Returns:
        A ``MultQuantizationRecord`` from the fusion inputs to the output
        qtypes of the last contained node.
    """
    chained_qs = in_qs
    remaining = node.contained_nodes()

    if node.fusion_type in ['conv_active_pool', 'conv_active']:
        # the convolution is quantized with the statistics of the activation
        # that follows it and an int8 output dtype
        conv_node, act_node = remaining[0], remaining[1]
        act_stats = self._activation_stats.get(NodeId(node, act_node))
        conv_qrec = self.calculate_q(G, conv_node, act_stats, chained_qs,
                                     dtype, out_dtype=np.int8)
        self.qrecs[NodeId(node, conv_node)] = conv_qrec
        chained_qs = conv_qrec.out_qs
        remaining = remaining[1:]

    for inner in remaining:
        inner_stats = self._activation_stats.get(NodeId(node, inner))
        inner_qrec = self.calculate_q(G, inner, inner_stats, chained_qs, dtype)
        self.qrecs[NodeId(node, inner)] = inner_qrec
        chained_qs = inner_qrec.out_qs

    return MultQuantizationRecord(in_qs=in_qs, out_qs=chained_qs)
def match(self, G: GraphView, set_identity: bool = True):
    """Fuse convolutions with the activation/pooling nodes that follow them.

    For every ``Conv2DParameters`` node, ``self.get_node_list`` proposes a
    chain of nodes.  Accepted chains are replaced in ``G`` by a single
    ``ConvFusionParameters`` node, and any quantization records are moved
    under the fusion.

    Args:
        G: graph rewritten in place.
        set_identity: when true, ``self.set_identity(G)`` is called at the end.

    Returns:
        ``True`` if at least one fusion was created, otherwise ``False``.
    """
    has_modified_graph = False
    conv_nodes = [candidate for candidate in G.nodes()
                  if isinstance(candidate, Conv2DParameters)]
    for conv_node in conv_nodes:
        node_list = self.get_node_list(G, conv_node)
        if node_list is None or len(node_list.order) < 2:
            continue

        if node_list.fusion_type == 'conv_active_pool':
            # an average pool is left out: only the first two nodes are fused
            # NOTE(review): fusion_type is not changed when the pool is dropped
            if node_list.pool.pool_type == "average":
                node_list.order = node_list.order[:2]
                node_list.pool = None
        elif node_list.fusion_type == 'conv_pool_active':
            # an average pool is only fused when followed by a relu
            if (node_list.pool.pool_type == "average"
                    and node_list.active.activation != "relu"):
                continue

        LOG.info("fusing nodes %s", ",".join((node.name for node in node_list.order)))
        has_modified_graph = True

        # chain the nodes, in order, inside the fusion subgraph
        subgraph = GraphView()
        last_node = None
        for current in node_list.order:
            if last_node is not None:
                subgraph.add_edge(NNEdge(from_node=last_node, to_node=current))
            last_node = current

        input_mapping = [[(node_list.conv, idx)] for idx in range(3)]
        output_mapping = [(last_node, 0)]
        pnode = ConvFusionParameters(
            node_list.conv.name + '_fusion',
            fusion_type=node_list.fusion_type,
            subgraph=subgraph,
            in_dims_hint=node_list.conv.in_dims_hint,
            out_dims_hint=node_list.conv.out_dims_hint,
            input_mapping=input_mapping,
            output_mapping=output_mapping)

        if G.quantization:
            qrecs = G.quantization.get_all(pnode.contained_nodes())
            if qrecs:
                # the fusion record goes from the first node's inputs to the
                # last node's outputs; it stays None for unknown record classes
                prec = None
                record_families = (
                    ((SymmetricQuantizationRecord,
                      SymmetricScalableFilterQuantizationRecord),
                     SymmetricQuantizationRecord),
                    ((MultQuantizationRecord,
                      MultScalableFilterQuantizationRecord),
                     MultQuantizationRecord),
                    ((Float32QuantizationRecord,
                      Float32ScalableFilterQuantizationRecord),
                     Float32QuantizationRecord),
                )
                for accepted, record_cls in record_families:
                    if isinstance(qrecs[0], accepted):
                        prec = record_cls(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
                        break
                for contained in pnode.contained_nodes():
                    G.quantization.move_to_fusion(contained, pnode)
                G.quantization[NodeId(pnode)] = prec

        # capture the boundary edges before the nodes are removed
        in_edges = G.in_edges(node_list.conv.name)
        out_edges = G.out_edges(last_node.name)
        for fused in node_list.order:
            G.remove(fused)
        for edge in in_edges:
            G.add_edge(NNEdge(edge.from_node, pnode,
                              from_idx=edge.from_idx, to_idx=edge.to_idx))
        for edge in out_edges:
            G.add_edge(NNEdge(pnode, edge.to_node,
                              from_idx=edge.from_idx, to_idx=edge.to_idx))

    if set_identity:
        self.set_identity(G)

    return has_modified_graph
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Quantize a node whose output carries the qtype of its first input.

    Args:
        params: node parameters; only ``params.name`` is read, for the
            assertion message.
        in_qs: input qtypes.
        stats: not used by this handler.
        **kwargs: passed to ``cls.get_mult_opts``; ``backwards`` selects
            backward propagation.

    Returns:
        A ``MultQuantizationRecord``, or ``None`` when going forwards with a
        forced output qtype that differs from at least one input qtype.
    """
    forced_qs, _ = cls.get_mult_opts(**kwargs)
    forced_q = forced_qs and forced_qs[0]

    if kwargs.get('backwards'):
        # going backwards only makes sense when the output qtype is imposed
        assert forced_q, f'going backwards at {params.name} but output is not forced'
        # NOTE: one copy of the forced qtype is repeated for every input
        return MultQuantizationRecord(in_qs=[deepcopy(forced_q)] * len(in_qs),
                                      out_qs=[deepcopy(forced_q)])

    # going forwards with a forced output that an input does not match
    # cannot be satisfied
    if forced_q and any(not (in_q == forced_q) for in_q in in_qs):
        return None

    return MultQuantizationRecord(in_qs=in_qs, out_qs=[deepcopy(in_qs[0])])
def average_execute_mult(cls, params, in_tensors, qrec: MultQuantizationRecord):
    """Quantized mean over ``params.axis`` using the record's output scaler.

    Args:
        params: node parameters; ``axis``, ``keep_dims`` and ``out_dims`` are read.
        in_tensors: input tensors; only the first prepared tensor is used.
        qrec: quantization record supplying input preparation, scaling and
            output handling.

    Returns:
        Whatever ``qrec.get_outputs`` returns for the reshaped result.
    """
    # Prepare the quantization levels
    prepared = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    out_dims = params.out_dims[0]
    qrec.set_scale(in_idx=0, out_idx=0)

    reduced_sum = np.sum(prepared,
                         dtype=np.int32,
                         axis=tuple(params.axis),
                         keepdims=params.keep_dims)
    # number of elements that were summed for each output value
    reduced_sizes = [dim for idx, dim in enumerate(prepared.shape)
                     if idx in params.axis]
    element_count = reduce(lambda x, y: x * y, reduced_sizes)

    # divide with 7 extra fractional bits, then normalise them away again
    widened = ((reduced_sum << 7) / element_count).astype(np.int32)
    averaged = at_norm(widened, 7)
    scaled = qrec.scale_mul_biases_q.apply_scales(averaged)
    return qrec.get_outputs(params, [scaled.reshape(out_dims.shape)], ktype="symmetric")
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Quantize a node whose output is int16 with scale 2**-15 over [-1, 1].

    Args:
        params: not used by this handler.
        in_qs: input qtypes; only a copy of the first one is kept.
        stats: not used by this handler.
        **kwargs: passed to ``cls.get_mult_opts``.

    Returns:
        A ``MultQuantizationRecord``, or ``None`` when an output qtype is
        forced (this handler cannot honour one).
    """
    forced_qs, _ = cls.get_mult_opts(**kwargs)
    if forced_qs and forced_qs[0]:
        return None

    # the input qtype is copied as-is; rescaling it to a power of two is
    # currently disabled
    copied_in_q = deepcopy(in_qs[0])
    fixed_out_q = QType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
    return MultQuantizationRecord(in_qs=[copied_in_q], out_qs=[fixed_out_q])
def average_execute(cls, params, in_tensors, qrec: MultQuantizationRecord):
    """Quantized mean over ``params.axis`` using a per-element reciprocal.

    Args:
        params: node parameters; ``axis``, ``keep_dims`` and ``out_dims`` are read.
        in_tensors: input tensors; only the first prepared tensor is used.
        qrec: quantization record supplying input preparation, output
            clipping (``out_qs[0].clip``) and output handling.

    Returns:
        Whatever ``qrec.get_outputs`` returns for the clipped, reshaped result.
    """
    # Prepare the quantization levels
    prepared = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    out_dims = params.out_dims[0]

    reduced_sum = np.sum(prepared,
                         dtype=np.int32,
                         axis=tuple(params.axis),
                         keepdims=params.keep_dims)
    # per-element shift: 31 minus gap_clb of each sum
    # (presumably a count of leading bits - TODO confirm against gap_clb)
    shifts = (np.array([31], dtype=np.int32) - gap_clb(reduced_sum.flatten())).astype(np.int32)

    reduced_sizes = [dim for idx, dim in enumerate(prepared.shape)
                     if idx in params.axis]
    element_count = reduce(lambda x, y: x * y, reduced_sizes)

    # fixed-point reciprocal of the element count, one per summed value
    reciprocal = ((1 << shifts) // element_count).reshape(reduced_sum.shape)
    averaged = at_norm((reciprocal * reduced_sum), shifts.reshape(reduced_sum.shape))

    clipped = qrec.out_qs[0].clip(averaged)
    return qrec.get_outputs(params, [clipped.reshape(out_dims.shape)], ktype="symmetric")
def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None):
    """Create the quantization record for one node.

    Args:
        G: unused.
        node: node parameters; their class selects the kind of record.
        astats: activation statistics with ``min`` and ``max`` entries, read
            when the output qtype is derived from the observed range.
        in_qs: input qtypes.
        dtype: dtype used for weights and, by default, for the output.
        out_dtype: output dtype; defaults to ``dtype``.

    Returns:
        The quantization record built for ``node``.
    """
    del G
    if out_dtype is None:
        out_dtype = dtype

    # --- output qtype ---
    if isinstance(node, (PoolingParameters, OutputParameters)):
        # these nodes keep the qtype of their first input
        o_q = in_qs[0]
    elif isinstance(node, SoftMaxParameters):
        o_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
    else:
        o_q = SymmetricMultQType.from_min_max(min_val=astats['min'],
                                              max_val=astats['max'],
                                              dtype=out_dtype)

    # --- record, selected by node class (order of the checks matters) ---
    if isinstance(node, (MatrixAddParameters, MatrixSubParameters)):
        return MultAddQuantizationRecord(in_qs=in_qs, out_qs=[o_q])

    if isinstance(node, (MatrixBroadcastedLinearOpParameters,
                         MatScaleFusionParameters,
                         GlobalPoolParameters)):
        return MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])

    if isinstance(node, ConstantInputParameters):
        return MultConstantQuantizationRecord(out_qs=[o_q],
                                              constants_are_quantized=False)

    if isinstance(node, (FcParameters, Conv2DParameters)):
        weights_q = SymmetricMultQType.from_array(
            arr=node.weights,
            quantized_dimension=self.get_quantized_dimension(node),
            dtype=dtype,
            narrow_range=self._narrow_weights)
        # biases use the product of the weight and input scales, or a unit
        # scale when the filter has no bias
        if node.has_bias:
            bias_scale = weights_q.scale * in_qs[0].scale
        else:
            bias_scale = np.array([1], dtype=np.int32)
        biases_q = SymmetricMultBiasesQType(dtype=np.int32, scale=bias_scale)
        mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, node)
        filter_qrec = MultScalableFilterQuantizationRecord(in_qs=[in_qs[0]],
                                                           out_qs=[o_q],
                                                           weights_q=weights_q,
                                                           biases_q=biases_q,
                                                           mul_biases_q=mul_biases_q,
                                                           constants_are_quantized=False)
        LOG.debug("filter %s qrec %s", node.name, filter_qrec)
        return filter_qrec

    return MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Quantize a node from its output range, unless the output is forced.

    Args:
        params: not used by this handler.
        in_qs: input qtypes, passed through unchanged.
        stats: statistics; ``stats['range_out'][0]`` supplies ``min``/``max``
            when the output is not forced.
        **kwargs: passed to ``cls.get_mult_opts``.

    Returns:
        A ``MultQuantizationRecord`` with a single output qtype.
    """
    forced_qs, out_dtype = cls.get_mult_opts(**kwargs)
    forced_q = forced_qs and forced_qs[0]
    if not forced_q:
        out_range = stats['range_out'][0]
        o_q = QType.from_min_max_sq(out_range['min'], out_range['max'],
                                    dtype=out_dtype)
    else:
        # a forced output qtype is adopted as a private copy
        o_q = deepcopy(forced_q)
    return MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
def av_global_pool_mult(params, in_tensors, qrec: MultQuantizationRecord, details=None):
    """Quantized global average pool over the ``w`` and ``h`` axes.

    Args:
        params: node parameters; ``in_dims`` and ``out_dims`` are read.
        in_tensors: input tensors; only the first prepared tensor is used.
        qrec: quantization record supplying input preparation, scaling and
            output handling.
        details: not used.

    Returns:
        Whatever ``qrec.get_outputs`` returns for the reshaped result.
    """
    # Prepare the quantization levels
    prepared = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")[0]
    in_dims = params.in_dims[0]
    out_dims = params.out_dims[0]
    qrec.set_scale(in_idx=0, out_idx=0)

    spatial_axes = (in_dims.get_order_idx('w'), in_dims.get_order_idx('h'))
    channel_sums = np.sum(prepared, dtype=np.int32, axis=spatial_axes)

    # divide with 7 extra fractional bits, then normalise them away again
    widened = (channel_sums << 7) // (in_dims.h * in_dims.w)
    averaged = at_norm(widened, 7)
    scaled = qrec.scale_mul_biases_q.apply_scales(averaged)
    return qrec.get_outputs(params, [scaled.reshape(out_dims.shape)], ktype="symmetric")
def set_c_state_as_output(self, G):
    """Expose output index 1 of this node through a new graph output.

    A new output node is added to ``G`` and connected to output 1 of this
    node.  When this node has a quantization record, the qtype of its
    ``c_state`` input is appended to the record's output qtypes and also
    used for the new output node's record.

    Args:
        G: graph modified in place; ``G.add_dimensions()`` is called last.
    """
    c_state_out = G.add_output()

    own_qrec = G.quantization and G.quantization.get(NodeId(self))
    if own_qrec:
        # the new output carries the same qtype as the c_state input
        state_q = own_qrec.in_qs[self.INPUT_NAMES.index('c_state')]
        own_qrec.out_qs.append(state_q)
        G.quantization[NodeId(c_state_out)] = MultQuantizationRecord(
            in_qs=[state_q], out_qs=[state_q])

    G.add_edge(NNEdge(self, c_state_out, from_idx=1))
    G.add_dimensions()
def piecewise_mult(params, in_tensors, qrec: MultQuantizationRecord, details=None):
    """Run a quantized two-input element-wise operation.

    The operation and its kind are looked up in ``PIECEWISE_OPS`` by the
    class of ``params``.

    Args:
        params: node parameters; their class selects the operation.
        in_tensors: the two input tensors.
        qrec: quantization record supplying input preparation, the scalers
            and output clipping/handling.
        details: ignored.

    Returns:
        Whatever ``qrec.get_outputs`` returns for the clipped result.
    """
    del details
    in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric")
    op_entry = PIECEWISE_OPS[params.__class__]
    op = op_entry['op']

    if op_entry['is_mult']:
        # multiplicative: both operands go in as int32, one output rescale
        qrec.set_scale(in_idx=(0, 1), out_idx=0)
        lhs = in_tensors[0].astype(np.int32)
        rhs = in_tensors[1].astype(np.int32)
        raw = op(lhs, rhs, np.int32)
    else:
        # additive: larger scale should be scaled
        qrec.set_add_scale()
        if qrec.scaled_idx:
            lhs = in_tensors[0].astype(np.int32)
            rhs = qrec.scale_in_mul_biases_q.apply_scales(in_tensors[1])
        else:
            lhs = qrec.scale_in_mul_biases_q.apply_scales(in_tensors[0])
            rhs = in_tensors[1].astype(np.int32)
        raw = op(lhs, rhs, None)

    res = qrec.scale_mul_biases_q.apply_scales(raw)
    return qrec.get_outputs(params, [qrec.out_qs[0].clip(res)], ktype="symmetric")
def _quantize(cls, params, in_qs, stats, **kwargs):
    """Quantize a node whose dtypes come from ``params`` itself.

    The input qtype has scale 1 and ``params.input_dtype``; the output qtype
    covers [-1, 1] in ``params.output_dtype`` with ``narrow_range=True``.

    Args:
        params: node parameters; ``output_dtype`` and ``input_dtype`` are read.
        in_qs: not used by this handler.
        stats: not used by this handler.
        **kwargs: passed to ``cls.get_mult_opts``.

    Returns:
        A ``MultQuantizationRecord``, or ``None`` when an output qtype is
        forced (this handler cannot honour one).
    """
    forced_qs, _ = cls.get_mult_opts(**kwargs)
    if forced_qs and forced_qs[0]:
        return None

    out_dtype = params.output_dtype
    in_dtype = params.input_dtype
    unit_in_q = QType(scale=1, dtype=in_dtype)
    unit_range_out_q = QType.from_min_max_sq(-1, 1, dtype=out_dtype, narrow_range=True)
    return MultQuantizationRecord(in_qs=[unit_in_q], out_qs=[unit_range_out_q])
def quantize_forward_fusion(self, pparams, in_qs, out_dtype, **kwargs):
    """Quantize the nodes contained in a fusion, going forwards.

    Each contained node is handed to the handler registered for its class in
    ``self.handlers[0]`` (or the ``'__default__'`` entry), with the previous
    node's output qtypes as inputs.  Every record is stored in ``self.qrecs``
    under ``NodeId(pparams, contained_node)``.

    Args:
        pparams: fusion parameters; ``contained_nodes()`` is iterated.
        in_qs: input qtypes of the fusion.
        out_dtype: output dtype forwarded to ``self.handle``.
        **kwargs: must contain ``all_stats``; everything is forwarded to
            ``self.handle``.

    Returns:
        A ``MultQuantizationRecord`` from the fusion inputs to the output
        qtypes of the last contained node.
    """
    chained_qs = in_qs
    for inner in pparams.contained_nodes():
        inner_id = NodeId(pparams, inner)
        handler = self.handlers[0].get(inner.__class__,
                                       self.handlers[0]['__default__'])
        inner_stats = kwargs['all_stats'].get(inner_id)
        inner_qrec = self.handle(handler, inner, chained_qs, out_dtype,
                                 stats=inner_stats, fusion=pparams, **kwargs)
        assert inner_qrec, "handler did not return a result"
        self.qrecs[NodeId(pparams, inner)] = inner_qrec
        chained_qs = inner_qrec.out_qs
    return MultQuantizationRecord(in_qs=in_qs, out_qs=chained_qs)
def fuse_activation(cls, tfl_opts, name, params, **kwargs):
    """Append the activation fused into an imported operator, if there is one.

    Args:
        tfl_opts: operator options; ``FusedActivationFunction()`` is read.
        name: base name; the activation is called ``name + "_activation"``.
        params: parameters of the node the activation follows.
        **kwargs: must contain ``G`` and ``opts``.

    Returns:
        The activation parameters when an activation node was added,
        otherwise ``params`` unchanged.
    """
    G = kwargs['G']
    opts = kwargs['opts']

    node_qrec = None
    if opts.get('load_quantization') and NodeId(params) in G.quantization:
        node_qrec = G.quantization[NodeId(params)]

    aparams = None
    fused_function = tfl_opts.FusedActivationFunction()
    if fused_function != ActivationFunctionType.NONE:
        aparams = ActivationParameters.get_activation(
            cls.TF_ACTIVATIONS[tfl_opts.FusedActivationFunction()],
            name + "_activation")
    elif (node_qrec is not None
          and isinstance(node_qrec, MultQuantizationRecordBase)
          and node_qrec.out_qs[0].min_val == 0):
        # no activation declared but the output qtype starts at 0: this may
        # be an omitted relu; a maximum that rounds to 6 is taken as relu6
        if np.all(np.round(node_qrec.out_qs[0].max_val) == 6):
            activation_kind = 'relu6'
        else:
            activation_kind = 'relu'
        aparams = ActivationParameters.get_activation(
            activation_kind, name + "_activation")

    if not aparams:
        return params

    G.add_edge(NNEdge(from_node=params, to_node=aparams))
    if opts.get('load_quantization'):
        # the activation's input and output qtypes are separate copies of
        # the producing node's first output qtype
        node_qrec = G.quantization[NodeId(params)]
        ina_qtype = deepcopy(node_qrec.out_qs[0])
        outa_qtype = deepcopy(ina_qtype)
        G.quantization[NodeId(aparams)] = MultQuantizationRecord(
            in_qs=[ina_qtype], out_qs=[outa_qtype])
    return aparams
def match(self, G: GraphView, set_identity: bool = True):
    """Fuse a pad node with the add (and optional activation) that follows it.

    For every ``PadParameters`` node, ``self.get_node_list`` proposes the
    nodes to fuse.  Accepted proposals are replaced in ``G`` by a single
    ``PaddedAddFusionParameters`` node, and any quantization records are
    moved under the fusion.

    Args:
        G: graph rewritten in place.
        set_identity: when true, ``self.set_identity(G)`` is called at the end.

    Returns:
        ``True`` if at least one fusion was created, otherwise ``False``.
    """
    has_modified_graph = False
    pad_nodes = [candidate for candidate in G.nodes()
                 if isinstance(candidate, PadParameters)]
    for pad_node in pad_nodes:
        node_list = self.get_node_list(G, pad_node)
        if node_list is None or len(node_list.order) < 2:
            continue

        LOG.info("fusing nodes %s", ",".join(
            (node.name for node in node_list.order)))
        has_modified_graph = True

        subgraph = GraphView()
        # which input of the add receives the padded tensor
        padded_input_idx = G.out_edges(node_list.pad.name)[0].to_idx
        pad_first = padded_input_idx == 0
        subgraph.add_edge(
            NNEdge(from_node=node_list.pad,
                   to_node=node_list.add,
                   to_idx=padded_input_idx))
        last_node = node_list.add
        node_list.add.force_quantized_index = 0
        if node_list.active:
            subgraph.add_edge(
                NNEdge(from_node=node_list.add, to_node=node_list.active))
            last_node = node_list.active

        if pad_first:
            input_mapping = [[(node_list.pad, 0)], [(node_list.add, 1)]]
        else:
            input_mapping = [[(node_list.add, 0)], [(node_list.pad, 1)]]
        output_mapping = [(last_node, 0)]

        pnode = PaddedAddFusionParameters(
            "PADDED_" + node_list.add.name,
            fusion_type=node_list.fusion_type,
            subgraph=subgraph,
            input_mapping=input_mapping,
            output_mapping=output_mapping)

        if G.quantization:
            qrecs = G.quantization.get_all(pnode.contained_nodes())
            if qrecs:
                # the fusion record goes from the first node's inputs to the
                # last node's outputs; it stays None for unknown record classes
                prec = None
                record_families = (
                    ((SymmetricQuantizationRecord,
                      SymmetricScalableFilterQuantizationRecord),
                     SymmetricQuantizationRecord),
                    ((MultQuantizationRecord,
                      MultScalableFilterQuantizationRecord),
                     MultQuantizationRecord),
                    ((Float32QuantizationRecord,
                      Float32ScalableFilterQuantizationRecord),
                     Float32QuantizationRecord),
                )
                for accepted, record_cls in record_families:
                    if isinstance(qrecs[0], accepted):
                        prec = record_cls(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
                        break
                for contained in pnode.contained_nodes():
                    G.quantization.move_to_fusion(contained, pnode)
                G.quantization[NodeId(pnode)] = prec

        # capture the boundary edges, in fusion input order, before removal
        if pad_first:
            in_edges = (G.in_edges(node_list.pad.name)
                        + G.indexed_in_edges(node_list.add.name)[1:])
        else:
            in_edges = (G.indexed_in_edges(node_list.add.name)[0:1]
                        + G.in_edges(node_list.pad.name))
        out_edges = G.out_edges(last_node.name)

        for fused in node_list.order:
            G.remove(fused)
        for edge in in_edges:
            G.add_edge(
                NNEdge(edge.from_node,
                       pnode,
                       from_idx=edge.from_idx,
                       to_idx=edge.to_idx))
        for edge in out_edges:
            G.add_edge(
                NNEdge(pnode,
                       edge.to_node,
                       from_idx=edge.from_idx,
                       to_idx=edge.to_idx))

    if set_identity:
        self.set_identity(G)

    return has_modified_graph
def match(self, G: GraphView, set_identity: bool = True):
    """Fuse matrix multiplications with a following bias add and/or activation.

    For every ``MatMulOpParameters`` node, ``self.get_node_list`` proposes
    the nodes to fuse.  Accepted proposals are replaced in ``G`` by a single
    ``MatMulOpFusionParameters`` node; when an add is part of the proposal,
    its constant input is connected to input 2 of the fusion.

    Args:
        G: graph rewritten in place.
        set_identity: when true, ``self.set_identity(G)`` is called at the end.

    Returns:
        ``True`` if at least one fusion was created, otherwise ``False``.
    """
    has_modified_graph = False
    matmul_nodes = [candidate for candidate in G.nodes()
                    if isinstance(candidate, MatMulOpParameters)]
    for matmul_node in matmul_nodes:
        node_list = self.get_node_list(G, matmul_node)
        if node_list is None or len(node_list.order) < 2:
            continue

        LOG.info("fusing nodes %s", ",".join(
            (node.name for node in node_list.order)))
        has_modified_graph = True

        subgraph = GraphView()
        if node_list.active is not None:
            subgraph.add_edge(
                NNEdge(from_node=node_list.matmul,
                       to_node=node_list.active))

        # two matmul operands, plus the bias on input 2 when there is an add
        input_mapping = [[(node_list.matmul, idx)] for idx in range(2)]
        if node_list.add:
            input_mapping += [[(node_list.matmul, 2)]]
        if node_list.active:
            output_mapping = [(node_list.active, 0)]
        else:
            output_mapping = [(node_list.matmul, 0)]

        pnode = MatMulOpFusionParameters(node_list.matmul.name + '_fusion',
                                         fusion_type=node_list.fusion_type,
                                         subgraph=subgraph,
                                         input_mapping=input_mapping,
                                         output_mapping=output_mapping)

        if G.quantization:
            qrecs = G.quantization.get_all(pnode.contained_nodes())
            if qrecs:
                # the fusion record goes from the first node's inputs to the
                # last node's outputs; it stays None for unknown record classes
                prec = None
                record_families = (
                    ((SymmetricQuantizationRecord,
                      SymmetricScalableFilterQuantizationRecord),
                     SymmetricQuantizationRecord),
                    ((MultQuantizationRecord,
                      MultScalableFilterQuantizationRecord),
                     MultQuantizationRecord),
                    ((Float32QuantizationRecord,
                      Float32ScalableFilterQuantizationRecord),
                     Float32QuantizationRecord),
                )
                for accepted, record_cls in record_families:
                    if isinstance(qrecs[0], accepted):
                        prec = record_cls(in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
                        break
                for contained in pnode.contained_nodes():
                    G.quantization.move_to_fusion(contained, pnode)
                G.quantization[NodeId(pnode)] = prec

        # capture the boundary edges before the nodes are removed
        in_edges = G.in_edges(node_list.matmul.name)
        if node_list.add:
            constant_edges = [
                add_edge for add_edge in G.in_edges(node_list.add.name)
                if isinstance(add_edge.from_node, ConstantInputParameters)
            ]
            bias_edge = constant_edges[0]
        out_edges = G.out_edges(node_list.order[-1].name)

        for fused in node_list.order:
            G.remove(fused)
        for edge in in_edges:
            G.add_edge(
                NNEdge(edge.from_node,
                       pnode,
                       from_idx=edge.from_idx,
                       to_idx=edge.to_idx))
        if node_list.add:
            G.add_edge(
                NNEdge(bias_edge.from_node,
                       pnode,
                       from_idx=bias_edge.from_idx,
                       to_idx=2))
        for edge in out_edges:
            G.add_edge(
                NNEdge(pnode,
                       edge.to_node,
                       from_idx=edge.from_idx,
                       to_idx=edge.to_idx))

    if set_identity:
        self.set_identity(G)

    return has_modified_graph
def _quantize(cls, params, in_qs, out_dtype, stats, **kwargs):
    """Quantize a node from the observed range of its first output.

    Args:
        params: not used by this handler.
        in_qs: input qtypes, passed through unchanged.
        out_dtype: dtype of the output qtype.
        stats: statistics; ``stats['range_out'][0]`` supplies ``min``/``max``.
        **kwargs: not used by this handler.

    Returns:
        A ``MultQuantizationRecord`` with a single output qtype.
    """
    observed_min = stats['range_out'][0]['min']
    observed_max = stats['range_out'][0]['max']
    range_q = SymmetricMultQType.from_min_max(min_val=observed_min,
                                              max_val=observed_max,
                                              dtype=out_dtype)
    return MultQuantizationRecord(in_qs=in_qs, out_qs=[range_q])
def calculate_q(self, G, node, astats, in_qs, dtype, out_dtype=None):
    """Build the quantization record for ``node`` according to its type.

    Args:
        G: graph containing ``node``; only used (via ``G.in_edges``) to find
            the constant inputs of RNN and LSTM nodes.
        node: the node to quantize; the record type is chosen with
            ``isinstance`` checks on it.
        astats: statistics for the node. ``astats['range_out']`` is read as a
            list of dicts with ``'min'`` and ``'max'`` entries; for LSTM nodes
            without ``cell_clip``, ``astats['range_cell']`` is read as well.
        in_qs: list of input qtypes. NOTE: for RNN nodes in ``autotiler``
            mode and for LSTM nodes, ``in_qs[0].scale`` is overwritten in
            place, and for LSTM nodes the cell state qtype is rescaled in
            place.
        dtype: dtype used for filter weights and, by default, for outputs.
        out_dtype: dtype of the output qtype; defaults to ``dtype``.

    Returns:
        The quantization record created for ``node``.

    Raises:
        ValueError: if ``node`` is an RNN node whose quantization mode option
            is neither ``"extended"`` nor ``"autotiler"``.
    """
    if out_dtype is None:
        out_dtype = dtype

    # Output qtype. Pooling, output and split nodes reuse the qtype object of
    # their first input (the same object, not a copy). Softmax gets a fixed
    # int16 qtype. Everything else is derived from the recorded range of
    # output 0.
    if isinstance(node, (PoolingParameters, OutputParameters, SplitParameters)):
        o_q = in_qs[0]
    elif isinstance(node, SoftMaxParameters):
        o_q = SymmetricMultQType(min_val=-1, max_val=1, dtype=np.int16, scale=2**(-15))
    else:
        o_q = SymmetricMultQType.from_min_max(min_val=astats['range_out'][0]['min'],
                                              max_val=astats['range_out'][0]['max'],
                                              dtype=out_dtype)

    # The add/sub check comes before the broader
    # MatrixBroadcastedLinearOpParameters check further down.
    if isinstance(node, (MatrixAddParameters, MatrixSubParameters)):
        qrec = MultAddQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    elif isinstance(node, ExpressionFusionParameters):
        # one output qtype per recorded output range
        o_qs = [SymmetricMultQType.from_min_max(min_val=orange['min'],
                                                max_val=orange['max'],
                                                dtype=out_dtype)
                for orange in astats['range_out']]
        # fusion inputs/outputs ordered by their idx so they line up with
        # in_qs and o_qs
        fusion_inputs = sorted([n for n in node.subgraph.inputs()
                                if isinstance(n, FusionInputParameters)],
                               key=lambda x: x.idx)
        fusion_outputs = sorted([n for n in node.subgraph.outputs()
                                 if isinstance(n, FusionOutputParameters)],
                                key=lambda x: x.idx)

        # map every fusion input/output node to the scale of its qtype
        node_scale_map = {fnode: in_qs[idx].scale
                          for idx, fnode in enumerate(fusion_inputs)}
        for idx, fnode in enumerate(fusion_outputs):
            node_scale_map[fnode] = o_qs[idx].scale
        inp, outp, expr = node.decompose(node_scale_map=node_scale_map)

        qrec = MultExpressionQuantizationRecord(in_qs=in_qs,
                                                out_qs=o_qs,
                                                inputs=inp,
                                                output_exprs=outp,
                                                intermediate_exprs=expr)
    elif isinstance(node, (MatrixBroadcastedLinearOpParameters,
                           MatScaleFusionParameters, GlobalPoolParameters)):
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    elif isinstance(node, SplitParameters):
        # every split output refers to the same qtype object as the input
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q]*node.num_splits)
    elif isinstance(node, ConstantInputParameters):
        if node.value_quantization:
            # the constant carries its own qtype; mark it as already quantized
            qrec = MultConstantQuantizationRecord(out_qs=[node.value_quantization],
                                                  constants_are_quantized=True)
        else:
            qrec = MultConstantQuantizationRecord(out_qs=[o_q],
                                                  constants_are_quantized=False)
    elif isinstance(node, (FcParameters, Conv2DParameters)):
        weights_q = SymmetricMultQType.from_array(arr=node.weights,
                                                  quantized_dimension=self.get_quantized_dimension(
                                                      node),
                                                  dtype=dtype, narrow_range=self._narrow_weights)
        if node.has_bias:
            # bias scale is the product of the weight and input scales
            biases_q = SymmetricMultBiasesQType(
                dtype=np.int32, scale=weights_q.scale * in_qs[0].scale)
        else:
            biases_q = SymmetricMultBiasesQType(
                dtype=np.int32, scale=np.array([1], dtype=np.int32))
        mul_biases_q = MultMulBiasScaleQType.from_filter(in_qs[0], weights_q, o_q, node)
        qrec = MultScalableFilterQuantizationRecord(in_qs=[in_qs[0]],
                                                    out_qs=[o_q],
                                                    weights_q=weights_q,
                                                    biases_q=biases_q,
                                                    mul_biases_q=mul_biases_q,
                                                    constants_are_quantized=False)
        LOG.debug("filter %s qrec %s", node.name, qrec)
    elif isinstance(node, RNNParameters):
        # constant inputs of the node keyed by their input name
        input_nodes = {RNNParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                       for edge in G.in_edges(node.name)
                       if isinstance(edge.from_node, ConstantInputParameters)}
        # input name -> index into in_qs
        names = {val: idx for idx, val in enumerate(RNNParameters.INPUT_NAMES)}
        # quantization_mode: extended, autotiler
        # state_width: 16bit or 8bit
        opts = self.get_options(node)
        if opts['mode'] == "extended":
            in_w_scale = in_qs[names['i_2_i_w']].scale * in_qs[0].scale
            state_w_scale = in_qs[names['r_2_i_w']].scale
            i_2_a_q = MultMulBiasScaleQType(scale=in_w_scale/state_w_scale)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale)
            s_2_o_q = MultMulBiasScaleQType(scale=1/o_q.scale)
            # NOTE(review): rescale_constant presumably requantizes the bias
            # constant to the given scale - confirm against its definition
            self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
            qrec = MultScalableRnnQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                i_2_a_q=i_2_a_q,
                s_2_s_q=s_2_s_q,
                s_2_o_q=s_2_o_q
            )
        elif opts['mode'] == 'autotiler':
            # input and output are forced to one shared scale, and both
            # weight tensors to another; the qtypes are modified in place
            in_and_state_scale = np.maximum(in_qs[0].scale, o_q.scale)
            in_and_state_w_scale = np.maximum(
                in_qs[names['i_2_i_w']].scale, in_qs[names['r_2_i_w']].scale)
            in_qs[0].scale = in_and_state_scale
            o_q.scale = in_and_state_scale
            self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
            self.rescale_constant(input_nodes['i_2_i_w'], in_and_state_w_scale)
            self.rescale_constant(input_nodes['r_2_i_w'], in_and_state_w_scale)
            state_w_scale = in_and_state_scale * in_and_state_w_scale
            self.rescale_constant(input_nodes['i_b'], state_w_scale, dtype=np.int32)
            s_2_s_q = MultMulBiasScaleQType(scale=state_w_scale/in_and_state_scale)
            qrec = MultScalableRnnQuantizationRecord(
                in_qs=in_qs,
                out_qs=[o_q],
                s_2_s_q=s_2_s_q,
            )
        else:
            # Without this branch qrec would stay unassigned and the final
            # return would fail with an UnboundLocalError that hides the
            # actual cause.
            raise ValueError(
                "unknown RNN quantization mode %r for node %s" % (opts['mode'], node.name))
    elif isinstance(node, LSTMParameters):
        # constant inputs of the node keyed by their input name
        input_nodes = {LSTMParameters.INPUT_NAMES[edge.to_idx]: edge.from_node
                       for edge in G.in_edges(node.name)
                       if isinstance(edge.from_node, ConstantInputParameters)}
        # input name -> index into in_qs
        names = {val: idx for idx, val in enumerate(LSTMParameters.INPUT_NAMES)}
        # cell range: the clip value when set, otherwise the largest absolute
        # recorded bound
        if node.cell_clip:
            cell_max = node.cell_clip
        else:
            cell_max = max(abs(astats['range_cell'][var]) for var in ['min', 'max'])

        cell_int_bits = calc_bits(cell_max)

        in_qs[names['c_state']].recalculate_scale(-cell_max,
                                                  cell_max)
        LOG.debug("cell bits %d max %d cell range %d",
                  cell_int_bits,
                  cell_max,
                  in_qs[names['c_state']].range)
        # worst case is (internal_q * 3) + 2 = 32 (1 for 1 and 1 for sign) i.e. 10
        # but also (internal_q * 2) + cell_bits = 32
        int_q = min((32-cell_int_bits)//2, 10)
        # in and out and state are all in the same scale
        in_and_out_scale = np.maximum(in_qs[0].scale, o_q.scale)
        in_and_state_scale = np.maximum(in_and_out_scale, in_qs[names['i_state']].scale)
        in_qs[0].scale = in_and_state_scale
        o_q.scale = in_and_state_scale
        self.rescale_constant(input_nodes['i_state'], in_and_state_scale)
        # for each gate, the input and recurrent weights share the larger of
        # their two scales
        scale_pairs = {chan: ('i_2_%s_w' % chan, 'r_2_%s_w' % chan)
                       for chan in ['i', 'o', 'c', 'f']}
        scales = {k: np.maximum(in_qs[names[namei]].scale, in_qs[names[namer]].scale)
                  for k, (namei, namer) in scale_pairs.items()}
        for k, (namei, namer) in scale_pairs.items():
            self.rescale_constant(input_nodes[namei], scales[k])
            self.rescale_constant(input_nodes[namer], scales[k])
        int_scale = pow(2, -int_q)
        int2_scale = pow(2, -(int_q*2))
        int3_scale = pow(2, -(int_q*3))
        # compute scales for perceptrons
        pscales = {k: scales[k] * in_and_state_scale for k in ['i', 'o', 'c', 'f']}
        scale_qtypes = {"r_2_%s_q" % k: MultMulBiasScaleQType(
            scale=pscale/int_scale) for k, pscale in pscales.items()}
        scale_qtypes['cell_in_q'] = MultMulBiasScaleQType(
            scale=in_qs[names['c_state']].scale/int_scale)
        # TODO - Check cell clip here
        scale_qtypes['cell_out_q'] = MultMulBiasScaleQType(
            scale=int2_scale/in_qs[names['c_state']].scale)
        scale_qtypes['state_out_q'] = MultMulBiasScaleQType(scale=int3_scale/in_and_state_scale)
        # set internal scale
        scale_qtypes['i_qtype'] = QType(q=int_q, bits=32, signed=True)
        # set biases to output of perceptron
        for k in ['i', 'o', 'c', 'f']:
            self.rescale_constant(input_nodes["%s_b" % k], pscales[k], dtype=np.int32)
        qrec = MultScalableLstmQuantizationRecord(
            in_qs=in_qs,
            out_qs=[o_q],
            **scale_qtypes,
        )
    else:
        # any other node type: plain record with the computed output qtype
        qrec = MultQuantizationRecord(in_qs=in_qs, out_qs=[o_q])
    return qrec