def quantize_weight_eightbit(input_node, quantization_mode):
    """Returns replacement nodes for input_node using the Dequantize op."""
    base_name = input_node.name + "_"
    quint8_const_name = base_name + "quint8_const"
    min_name = base_name + "min"
    max_name = base_name + "max"
    float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
    min_value = np.min(float_tensor.flatten())
    max_value = np.max(float_tensor.flatten())
    # Make sure that the range includes zero.
    if min_value > 0.0:
        min_value = 0.0
    # min_value == max_value is a tricky case. It can occur for general
    # tensors, and of course for scalars. The quantized ops cannot deal
    # with this case, so we set max_value to something else.
    # It's a tricky question what is the numerically best solution to
    # deal with this degeneracy.
    # TODO(petewarden): Better use a tolerance than a hard comparison?
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0

    sess = session.Session()
    with sess.as_default():
        quantize_op = array_ops.quantize_v2(
            float_tensor,
            min_value,
            max_value,
            dtypes.quint8,
            mode=quantization_mode)
        quint8_tensor = quantize_op[0].eval()
        min_value = quantize_op[1].eval()
        max_value = quantize_op[2].eval()
    shape = tensor_util.TensorShapeProtoToList(
        input_node.attr["value"].tensor.tensor_shape)
    quint8_const_node = create_constant_node(
        quint8_const_name, quint8_tensor, dtypes.quint8, shape=shape)
    dtype = dtypes.as_dtype(input_node.attr["dtype"].type)
    min_node = create_constant_node(min_name, min_value, dtypes.float32)
    max_node = create_constant_node(max_name, max_value, dtypes.float32)
    dequantize_node = create_node("Dequantize", input_node.name,
                                  [quint8_const_name, min_name, max_name])
    set_attr_dtype(dequantize_node, "T", dtypes.quint8)
    set_attr_string(dequantize_node, "mode", quantization_mode)
    return [quint8_const_node, min_node, max_node, dequantize_node]
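The range handling above (forcing the range to include zero and widening a degenerate min == max range) can be exercised outside the graph rewriter. The following is a minimal NumPy-only sketch; the helper name `adjust_quant_range` is illustrative and not part of the original code:

import numpy as np


def adjust_quant_range(float_tensor):
    """Hypothetical standalone helper mirroring the min/max handling above."""
    min_value = float(np.min(float_tensor))
    max_value = float(np.max(float_tensor))
    # The quantized range must include zero so that 0.0 is exactly representable.
    if min_value > 0.0:
        min_value = 0.0
    # Widen a degenerate range (all-equal tensors, scalars) so the quantized
    # ops get a non-zero span to work with.
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0
    return min_value, max_value


print(adjust_quant_range(np.array([0.5, 2.0])))    # (0.0, 2.0): range pulled down to include zero
print(adjust_quant_range(np.zeros(3)))             # (0.0, 1.0): degenerate zero range widened
print(adjust_quant_range(np.array([-2.0, -2.0])))  # (-2.0, -1.0): degenerate negative range widened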
def intel_cpu_quantize_weight_eightbit(input_node, quantization_mode="SCALED"):
    """Returns replacement of constant weight node.

    This function creates (i) a quantized constant node, (ii) a float min node,
    (iii) a float max node, and (iv) a dequantize node."""
    base_name = input_node.name + "_"
    qint8_const_name = base_name + "qint8_const"
    min_name = base_name + "min"
    max_name = base_name + "max"
    float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
    min_value = np.min(float_tensor.flatten())
    max_value = np.max(float_tensor.flatten())
    # Same processing of min-max as in quantize_weight_eightbit function.
    if min_value > 0.0:
        min_value = 0.0
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0

    sess = session.Session()
    with sess.as_default():
        quantize_op = array_ops.quantize_v2(
            float_tensor,
            min_value,
            max_value,
            dtypes.qint8,
            mode=quantization_mode,
            round_mode="HALF_TO_EVEN")
        qint8_tensor = quantize_op[0].eval()
        # Updated min-max values should be passed to the next feeding node.
        min_value = quantize_op[1].eval()
        max_value = quantize_op[2].eval()
    shape = tensor_util.TensorShapeProtoToList(
        input_node.attr["value"].tensor.tensor_shape)
    qint8_const_node = create_constant_node(
        qint8_const_name, qint8_tensor, dtypes.qint8, shape=shape)
    min_node = create_constant_node(min_name, min_value, dtypes.float32)
    max_node = create_constant_node(max_name, max_value, dtypes.float32)
    dequantize_node = create_node("Dequantize", input_node.name,
                                  [qint8_const_name, min_name, max_name])
    set_attr_dtype(dequantize_node, "T", dtypes.quint8)
    set_attr_string(dequantize_node, "mode", b'SCALED')
    return [qint8_const_node, min_node, max_node, dequantize_node]
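The practical difference between the two variants is the quantization scheme: quantize_weight_eightbit maps the [min, max] range onto unsigned quint8 with an affine (MIN_FIRST-style) mapping, while the Intel variant uses the symmetric SCALED mode onto signed qint8 with HALF_TO_EVEN rounding. The NumPy sketch below illustrates the two mappings approximately; it is not a bit-exact reimplementation of QuantizeV2, and both helper names are assumptions:

import numpy as np


def scaled_qint8(x, min_value, max_value):
    """Roughly what SCALED-mode qint8 quantization does: a symmetric
    mapping where the largest magnitude of the range maps to +/-127."""
    scale = 127.0 / max(abs(min_value), abs(max_value))
    return np.clip(np.round(x * scale), -127, 127).astype(np.int8)


def min_first_quint8(x, min_value, max_value):
    """Roughly what MIN_FIRST-mode quint8 quantization does: an affine
    mapping where min_value maps to 0 and max_value maps to 255."""
    scale = 255.0 / (max_value - min_value)
    return np.clip(np.round((x - min_value) * scale), 0, 255).astype(np.uint8)


w = np.array([-1.0, -0.25, 0.0, 0.5, 1.0])
print(scaled_qint8(w, w.min(), w.max()))      # e.g. [-127  -32    0   64  127]
print(min_first_quint8(w, w.min(), w.max()))  # e.g. [   0   96  128  191  255]

Note that under the symmetric SCALED mapping a float value of 0.0 always quantizes to the integer 0, which is the property the weight-quantization paths above rely on.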
def quantize_weight_eightbit(input_node, quantization_mode):
    base_name = input_node.name + "_"
    quint8_const_name = base_name + "quint8_const"
    min_name = base_name + "min"
    max_name = base_name + "max"
    float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
    min_value = np.min(float_tensor.flatten())
    max_value = np.max(float_tensor.flatten())
    if min_value > 0.0:
        min_value = 0.0
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0

    sess = session.Session()
    with sess.as_default():
        quantize_op = array_ops.quantize_v2(
            float_tensor,
            min_value,
            max_value,
            dtypes.quint8,
            mode=quantization_mode)
        quint8_tensor = quantize_op[0].eval()
    shape = tensor_util.TensorShapeProtoToList(
        input_node.attr["value"].tensor.tensor_shape)
    quint8_const_node = create_constant_node(
        quint8_const_name, quint8_tensor, dtypes.quint8, shape=shape)
    min_node = create_constant_node(min_name, min_value, dtypes.float32)
    max_node = create_constant_node(max_name, max_value, dtypes.float32)
    dequantize_node = create_node("Dequantize", input_node.name,
                                  [quint8_const_name, min_name, max_name])
    set_attr_dtype(dequantize_node, "T", dtypes.quint8)
    set_attr_string(dequantize_node, "mode", quantization_mode)
    return [quint8_const_node, min_node, max_node, dequantize_node]
def _intel_cpu_quantize_weight_eightbit(self,
                                        parent,
                                        input_node,
                                        per_channel,
                                        quantization_mode=b"SCALED"):
    base_name = input_node.name + "_"
    qint8_const_name = base_name + "qint8_const"
    min_name = base_name + "min"
    max_name = base_name + "max"
    float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
    epsilon = 1e-4  # Needs to be set empirically if accuracy is not satisfactory
    range_coefficent = 127 / (2**self.weight_bit - 1)
    if parent in ("Conv2D", "MatMul"):
        if per_channel:
            ranges = np.abs(float_tensor).max(axis=(0, 1, 2))
            ranges *= range_coefficent
            min_value = -ranges
            max_value = ranges
            # nudging min-max values outside epsilon radius around zero
            ranges[ranges < epsilon] = epsilon
            min_value[np.abs(min_value) < epsilon] = -epsilon
            max_value[np.abs(max_value) < epsilon] = epsilon
            qint8_tensor = (float_tensor * 127.0 / ranges).astype(np.int8)
        else:
            min_value = np.min(float_tensor.flatten())
            max_value = np.max(float_tensor.flatten())
            min_value *= range_coefficent
            max_value *= range_coefficent
            # Same processing of min-max as in quantize_weight_eightbit
            # function.
            if min_value > 0.0:
                min_value = 0.0
            if min_value == max_value:
                if abs(min_value) < 0.000001:
                    max_value = min_value + 1.0
                elif min_value > 0:
                    max_value = 2 * min_value
                else:
                    max_value = min_value / 2.0
            sess = tf.compat.v1.Session()
            with sess.as_default():
                quantize_op = array_ops.quantize_v2(
                    float_tensor,
                    min_value,
                    max_value,
                    dtypes.qint8,
                    mode=quantization_mode,
                    round_mode="HALF_TO_EVEN")
                qint8_tensor = quantize_op[0].numpy() \
                    if tf.executing_eagerly() else quantize_op[0].eval()
                # Updated min-max values should be passed to the next
                # feeding node.
                min_value = quantize_op[1].numpy() \
                    if tf.executing_eagerly() else quantize_op[1].eval()
                max_value = quantize_op[2].numpy() \
                    if tf.executing_eagerly() else quantize_op[2].eval()
            sess.close()
    elif parent == "DepthwiseConv2dNative":
        # get the max values based on dim 0 and 1 for depthwise conv
        # since the output channel will be dim 2 * dim 3
        ranges = np.abs(float_tensor).max(axis=(0, 1))
        ranges = ranges.flatten()
        min_value = -ranges
        max_value = ranges
        # nudging min-max values outside epsilon radius around zero
        ranges[ranges < epsilon] = epsilon
        min_value[np.abs(min_value) < epsilon] = -epsilon
        max_value[np.abs(max_value) < epsilon] = epsilon
        # Since the output channel will be 1 dim, which is dim 2 * dim 3,
        # qint8_tensor needs to be 3 dim when dividing by range,
        # where the 3rd dim should match the dim of ranges.
        a, b, c, d = float_tensor.shape
        qint8_tensor = (float_tensor.reshape(a, b, c * d) * 127.0 /
                        ranges).astype(np.int8)
        # get the shape back to 4 dim
        qint8_tensor = qint8_tensor.reshape(a, b, c, d)
    shape = tensor_util.TensorShapeProtoToList(
        input_node.attr["value"].tensor.tensor_shape)
    qint8_const_node = helper.create_constant_node(
        qint8_const_name, qint8_tensor, dtypes.qint8, shape=shape)
    min_node = helper.create_constant_node(
        min_name, min_value, dtypes.float32, device=self.device)
    max_node = helper.create_constant_node(
        max_name, max_value, dtypes.float32, device=self.device)
    self.add_output_graph_node(qint8_const_node)
    self.add_output_graph_node(min_node)
    self.add_output_graph_node(max_node)
    return qint8_const_node.name, min_node.name, max_node.name
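The per_channel branch above computes one symmetric range per output channel of an HWIO Conv2D kernel instead of a single range for the whole tensor. A NumPy-only sketch of that idea follows; the helper name is illustrative and it omits the weight_bit scaling and the graph-node creation:

import numpy as np


def per_channel_qint8(weights, epsilon=1e-4):
    """Symmetric per-output-channel quantization of an HWIO Conv2D kernel,
    following the same recipe as the per_channel branch above."""
    # One range per output channel (last axis of an HWIO kernel).
    ranges = np.abs(weights).max(axis=(0, 1, 2))
    # Nudge near-zero channels away from zero to avoid dividing by ~0.
    ranges[ranges < epsilon] = epsilon
    q = (weights * 127.0 / ranges).astype(np.int8)
    # Quantized kernel plus per-channel min/max vectors for the graph consts.
    return q, -ranges, ranges


kernel = np.random.uniform(-0.5, 0.5, size=(3, 3, 16, 32)).astype(np.float32)
q, mins, maxs = per_channel_qint8(kernel)
print(q.shape, mins.shape, maxs.shape)  # (3, 3, 16, 32) (32,) (32,)

Per-channel ranges keep a channel with small weights from being crushed by another channel with large weights, which is why the DepthwiseConv2dNative branch goes to the trouble of reshaping so that each depthwise output channel gets its own range.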
def do_transformation(self):
    g = GraphAnalyzer()
    g.graph = self.model
    graph_info = g.parse_graph()

    for i in self.rnn_details.keys():  # pragma: no cover
        start_node_name = graph_info[i[0]].node.input[0]

        min_str = i[0] + '_eightbit_min_' + \
            start_node_name + '__print__;__min:'
        input_min_values = []
        input_max_values = []
        output_min_values = []
        output_max_values = []
        max_str = i[0] + '_eightbit_max_' + \
            start_node_name + '__print__;__max:'
        output_str = i[0] + \
            '_eightbit_requant_range__print__;__requant_min_max:'
        for j in self.calibration_data:
            if j.find(min_str) != -1:
                input_min_values.append(
                    float(j.split('[')[-1].split(']')[0]))
            if j.find(max_str) != -1:
                input_max_values.append(
                    float(j.split('[')[-1].split(']')[0]))
            if j.find(output_str) != -1:
                output_min_values.append(
                    float(j.split(':')[-1][1:].split(']')[0]))
                output_max_values.append(float(j.split('][')[-1][:-1]))
        min_input = min(input_min_values)
        max_input = max(input_max_values)
        min_output = min(output_min_values)
        max_output = max(output_max_values)

        q_max_in_node = Helper.create_constant_node(
            i[0] + '_quant_max', max_input, dtypes.float32)
        q_min_in_node = Helper.create_constant_node(
            i[0] + '_quant_min', min_input, dtypes.float32)
        q_enter_min_node = Helper.create_node(
            'Enter', q_min_in_node.name + '_enter', [q_min_in_node.name])
        Helper.set_attr_string(q_enter_min_node, 'frame_name',
                               self.rnn_details[i].encode())
        Helper.set_attr_dtype(q_enter_min_node, 'T', dtypes.float32)
        Helper.set_attr_bool(q_enter_min_node, 'is_constant', True)
        Helper.set_attr_int(q_enter_min_node, 'parallel_iterations', 32)
        q_enter_max_node = Helper.create_node(
            'Enter', q_max_in_node.name + '_enter', [q_max_in_node.name])
        Helper.set_attr_dtype(q_enter_max_node, 'T', dtypes.float32)
        Helper.set_attr_string(q_enter_max_node, 'frame_name',
                               self.rnn_details[i].encode())
        Helper.set_attr_bool(q_enter_max_node, 'is_constant', True)
        Helper.set_attr_int(q_enter_max_node, 'parallel_iterations', 32)

        split_node_name = graph_info[i[0]].node.input[1]
        enter_node_name = graph_info[Helper.node_name_from_input(
            split_node_name)].node.input[1]
        weight_node_name = graph_info[Helper.node_name_from_input(
            enter_node_name)].node.input[0]
        weight_node = graph_info[Helper.node_name_from_input(
            weight_node_name)].node

        if weight_node.attr['dtype'].type == dtypes.qint8:
            qint8_const_name = weight_node_name
        else:
            base_name = weight_node_name + "_"
            qint8_const_name = base_name + "qint8_const"
            min_name = base_name + "min"
            max_name = base_name + "max"

        need_to_create_const_node = bool(qint8_const_name not in graph_info)
        if need_to_create_const_node:
            float_tensor = tensor_util.MakeNdarray(
                weight_node.attr["value"].tensor)
            min_value = np.min(float_tensor.flatten())
            max_value = np.max(float_tensor.flatten())
            # Same processing of min-max as in quantize_weight_eightbit
            # function.
            if min_value > 0.0:
                min_value = 0.0
            if min_value == max_value:
                if abs(min_value) < 0.000001:
                    max_value = min_value + 1.0
                elif min_value > 0:
                    max_value = 2 * min_value
                else:
                    max_value = min_value / 2.0
            sess = tf.compat.v1.Session()
            with sess.as_default():
                quantize_op = array_ops.quantize_v2(
                    float_tensor,
                    min_value,
                    max_value,
                    dtypes.qint8,
                    mode='SCALED',
                    round_mode="HALF_TO_EVEN")
                qint8_tensor = quantize_op[0].numpy() \
                    if tf.executing_eagerly() else quantize_op[0].eval()
                # Updated min-max values should be passed to the next
                # feeding node.
                min_value = quantize_op[1].numpy() \
                    if tf.executing_eagerly() else quantize_op[1].eval()
                max_value = quantize_op[2].numpy() \
                    if tf.executing_eagerly() else quantize_op[2].eval()
            sess.close()

            shape = tensor_util.TensorShapeProtoToList(
                weight_node.attr["value"].tensor.tensor_shape)
            qint8_const_node = Helper.create_constant_node(
                qint8_const_name, qint8_tensor, dtypes.qint8, shape=shape)
            min_node = Helper.create_constant_node(min_name, min_value,
                                                   dtypes.float32)
            max_node = Helper.create_constant_node(max_name, max_value,
                                                   dtypes.float32)
            enter_min_node = Helper.create_node('Enter', min_name + '_enter',
                                                [min_name])
            Helper.set_attr_string(enter_min_node, 'frame_name',
                                   self.rnn_details[i].encode())
            Helper.set_attr_dtype(enter_min_node, 'T', dtypes.float32)
            Helper.set_attr_bool(enter_min_node, 'is_constant', True)
            Helper.set_attr_int(enter_min_node, 'parallel_iterations', 32)
            enter_max_node = Helper.create_node('Enter', max_name + '_enter',
                                                [max_name])
            Helper.set_attr_dtype(enter_max_node, 'T', dtypes.float32)
            Helper.set_attr_string(enter_max_node, 'frame_name',
                                   self.rnn_details[i].encode())
            Helper.set_attr_bool(enter_max_node, 'is_constant', True)
            Helper.set_attr_int(enter_max_node, 'parallel_iterations', 32)
        else:
            qint8_const_node = graph_info[qint8_const_name].node
            min_node = graph_info[min_name].node
            max_node = graph_info[max_name].node

        quant_input = [
            start_node_name, q_enter_min_node.name, q_enter_max_node.name
        ]
        quantize_node = Helper.create_node('QuantizeV2', i[0] + '_quantize',
                                           quant_input)
        Helper.set_attr_dtype(quantize_node, "T", dtypes.quint8)
        Helper.set_attr_string(quantize_node, "mode", b"MIN_FIRST")
        g.add_node(quantize_node, start_node_name, [i[0]])
        g.add_node(q_enter_max_node, None, [quantize_node.name])
        g.add_node(q_enter_min_node, None, [quantize_node.name])
        g.add_node(q_max_in_node, None, [q_enter_max_node.name])
        g.add_node(q_min_in_node, None, [q_enter_min_node.name])

        bias_node = graph_info[graph_info[i[0]].outputs[0]].node
        if graph_info[bias_node.name].outputs:
            last_node_name = [
                graph_info[graph_info[bias_node.name].outputs[0]].node.name
            ]
        else:
            last_node_name = []

        quantized_matmul_input = [
            quantize_node.name,
            Helper.node_name_from_input(graph_info[i[0]].node.input[1]),
            bias_node.input[1]
        ]
        quantized_matmul_input.append(quantize_node.name + ':1')
        quantized_matmul_input.append(quantize_node.name + ':2')
        quantized_matmul_input.append(enter_min_node.name)
        quantized_matmul_input.append(enter_max_node.name)
        quantized_matmul_with_bias_node = Helper.create_node(
            'QuantizedMatMulWithBias', i[0] + '_quantized_mat_mul',
            quantized_matmul_input)
        Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'T1',
                              dtypes.quint8)
        Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'T2',
                              dtypes.qint8)
        Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'Tbias',
                              dtypes.float32)
        Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'Toutput',
                              dtypes.qint32)
        Helper.set_attr_bool(quantized_matmul_with_bias_node, 'transpose_a',
                             False)
        Helper.set_attr_bool(quantized_matmul_with_bias_node, 'transpose_b',
                             False)
        Helper.set_attr_string(quantized_matmul_with_bias_node,
                               'input_quant_mode', b"MIN_FIRST")
        g.add_node(quantized_matmul_with_bias_node, quantize_node.name,
                   [bias_node.name])

        if qint8_const_node.name not in graph_info:
            g.add_node(qint8_const_node, None, [enter_node_name])
            enter_node = graph_info[enter_node_name].node
            split_node = graph_info[Helper.node_name_from_input(
                split_node_name)].node
            Helper.set_attr_dtype(enter_node, 'T', dtypes.qint8)
            Helper.set_attr_dtype(split_node, 'T', dtypes.qint8)
            graph_info[
                enter_node.name].node.input[0] = qint8_const_node.name
        elif qint8_const_node.name in graph_info:
            pass
        else:
            g.add_node(qint8_const_node, None,
                       [quantized_matmul_with_bias_node.name])

        if need_to_create_const_node:
            g.add_node(enter_min_node, None,
                       [quantized_matmul_with_bias_node.name])
            g.add_node(enter_max_node, None,
                       [quantized_matmul_with_bias_node.name])
            g.add_node(min_node, None, [enter_min_node.name])
            g.add_node(max_node, None, [enter_max_node.name])

        # create requantize node
        requantize_min_node = Helper.create_constant_node(
            i[0] + 'requant_w_min', min_output, dtypes.float32)
        requantize_max_node = Helper.create_constant_node(
            i[0] + 'requant_w_max', max_output, dtypes.float32)

        enter_req_min_node = Helper.create_node(
            'Enter', requantize_min_node.name + '_enter',
            [requantize_min_node.name])
        Helper.set_attr_string(enter_req_min_node, 'frame_name',
                               self.rnn_details[i].encode())
        Helper.set_attr_dtype(enter_req_min_node, 'T', dtypes.float32)
        Helper.set_attr_bool(enter_req_min_node, 'is_constant', True)
        Helper.set_attr_int(enter_req_min_node, 'parallel_iterations', 32)

        enter_req_max_node = Helper.create_node(
            'Enter', requantize_max_node.name + '_enter',
            [requantize_max_node.name])
        Helper.set_attr_dtype(enter_req_max_node, 'T', dtypes.float32)
        Helper.set_attr_string(enter_req_max_node, 'frame_name',
                               self.rnn_details[i].encode())
        Helper.set_attr_bool(enter_req_max_node, 'is_constant', True)
        Helper.set_attr_int(enter_req_max_node, 'parallel_iterations', 32)

        requantize_input = [
            quantized_matmul_with_bias_node.name,
            quantized_matmul_with_bias_node.name + ':1',
            quantized_matmul_with_bias_node.name + ':2',
            enter_req_min_node.name, enter_req_max_node.name
        ]
        requantize_node = Helper.create_node('Requantize',
                                             i[0] + '_requantize',
                                             requantize_input)
        Helper.set_attr_dtype(requantize_node, 'out_type', dtypes.qint8)
        Helper.set_attr_dtype(requantize_node, 'Tinput', dtypes.qint32)
        g.add_node(requantize_node, quantized_matmul_with_bias_node.name,
                   [bias_node.name])

        dequantize_input = [
            requantize_node.name, requantize_node.name + ':1',
            requantize_node.name + ':2'
        ]
        dequantize_node = Helper.create_node('Dequantize',
                                             i[0] + '_dequantize',
                                             dequantize_input)
        Helper.set_attr_dtype(dequantize_node, "T", dtypes.qint8)
        Helper.set_attr_dtype(dequantize_node, "dtype", dtypes.float32)
        Helper.set_attr_string(dequantize_node, "mode", b"MIN_FIRST")

        g.add_node(enter_req_min_node, None, [requantize_node.name])
        g.add_node(enter_req_max_node, None, [requantize_node.name])
        g.add_node(requantize_min_node, None, [enter_req_min_node.name])
        g.add_node(requantize_max_node, None, [enter_req_max_node.name])
        g.add_node(dequantize_node, requantize_node.name, last_node_name)

        if last_node_name:
            graph_info[
                last_node_name[0]].node.input[0] = dequantize_node.name
        g.remove_node(bias_node.name)
        g.remove_node(i[0])
        # g.remove_node(weight_node_name)

    return g.dump_graph()
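The calibration parsing at the top of do_transformation scans print-debug strings of the form `...__print__;__min:[...]`, `...__print__;__max:[...]`, and `...__requant_min_max:[...][...]`. The following self-contained sketch shows how those splits extract the numbers; the sample log lines and node names are hypothetical and only illustrate the format the loop assumes:

# Hypothetical calibration lines in the format the loop above expects.
calibration_data = [
    ";lstm_1_eightbit_min_input_node__print__;__min:[-2.125]",
    ";lstm_1_eightbit_max_input_node__print__;__max:[3.5]",
    ";lstm_1_eightbit_requant_range__print__;__requant_min_max:[-6.25][7.75]",
]

min_str = "lstm_1" + '_eightbit_min_' + "input_node" + '__print__;__min:'
max_str = "lstm_1" + '_eightbit_max_' + "input_node" + '__print__;__max:'
output_str = "lstm_1" + '_eightbit_requant_range__print__;__requant_min_max:'

input_min_values, input_max_values = [], []
output_min_values, output_max_values = [], []
for j in calibration_data:
    if j.find(min_str) != -1:
        # Everything between the last '[' and the following ']' is the value.
        input_min_values.append(float(j.split('[')[-1].split(']')[0]))
    if j.find(max_str) != -1:
        input_max_values.append(float(j.split('[')[-1].split(']')[0]))
    if j.find(output_str) != -1:
        # The requant line carries two bracketed values: [min][max].
        output_min_values.append(float(j.split(':')[-1][1:].split(']')[0]))
        output_max_values.append(float(j.split('][')[-1][:-1]))

print(min(input_min_values), max(input_max_values))    # -2.125 3.5
print(min(output_min_values), max(output_max_values))  # -6.25 7.75

These aggregated input and requantization ranges are what feed the `_quant_min`/`_quant_max` and `requant_w_min`/`requant_w_max` constant nodes created inside the loop.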