Example No. 1
def quantize_weight_eightbit(input_node, quantization_mode):
    """Returns replacement nodes for input_node using the Dequantize op."""
    base_name = input_node.name + "_"
    quint8_const_name = base_name + "quint8_const"
    min_name = base_name + "min"
    max_name = base_name + "max"
    float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
    min_value = np.min(float_tensor.flatten())
    max_value = np.max(float_tensor.flatten())
    # Make sure that the range includes zero.
    if min_value > 0.0:
        min_value = 0.0
    # min_value == max_value is a tricky case. It can occur for general
    # tensors, and of course for scalars. The quantized ops cannot deal
    # with this case, so we set max_value to something else.
    # It is not obvious what the numerically best way to handle this
    # degeneracy is.
    # TODO(petewarden): Better use a tolerance than a hard comparison?
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0

    with session.Session() as sess, sess.as_default():
        quantize_op = array_ops.quantize_v2(float_tensor,
                                            min_value,
                                            max_value,
                                            dtypes.quint8,
                                            mode=quantization_mode)
        quint8_tensor = quantize_op[0].eval()
        min_value = quantize_op[1].eval()
        max_value = quantize_op[2].eval()
    shape = tensor_util.TensorShapeProtoToList(
        input_node.attr["value"].tensor.tensor_shape)
    quint8_const_node = create_constant_node(quint8_const_name,
                                             quint8_tensor,
                                             dtypes.quint8,
                                             shape=shape)

    min_node = create_constant_node(min_name, min_value, dtypes.float32)
    max_node = create_constant_node(max_name, max_value, dtypes.float32)
    dequantize_node = create_node("Dequantize", input_node.name,
                                  [quint8_const_name, min_name, max_name])
    set_attr_dtype(dequantize_node, "T", dtypes.quint8)
    set_attr_string(dequantize_node, "mode", quantization_mode)
    return [quint8_const_node, min_node, max_node, dequantize_node]
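The range handling above is easiest to see on concrete numbers. Below is a minimal standalone sketch of just that logic; adjust_quant_range is an illustrative name, not part of the original module.

def adjust_quant_range(min_value, max_value):
    # Mirrors the range handling in quantize_weight_eightbit: the range must
    # include zero, and a degenerate min == max range is widened so the
    # quantized ops get a usable, non-zero scale.
    if min_value > 0.0:
        min_value = 0.0
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0
    return min_value, max_value

print(adjust_quant_range(0.0, 0.0))    # (0.0, 1.0)   all-zero tensor
print(adjust_quant_range(3.0, 3.0))    # (0.0, 3.0)   positive scalar: min is pulled to zero first
print(adjust_quant_range(-2.0, -2.0))  # (-2.0, -1.0) negative scalar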
Example No. 2
def intel_cpu_quantize_weight_eightbit(input_node, quantization_mode="SCALED"):
    """Returns replacement of constant weight node.

    This function creates (i) a quantized constant node, (ii) a float min node
    (iii) a float max node, and (iv) a dequantize node."""
    base_name = input_node.name + "_"
    qint8_const_name = base_name + "qint8_const"
    min_name = base_name + "min"
    max_name = base_name + "max"
    float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
    min_value = np.min(float_tensor.flatten())
    max_value = np.max(float_tensor.flatten())
    # Same processing of min-max as in quantize_weight_eightbit function.
    if min_value > 0.0:
        min_value = 0.0
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0

    with session.Session() as sess, sess.as_default():
        quantize_op = array_ops.quantize_v2(float_tensor,
                                            min_value,
                                            max_value,
                                            dtypes.qint8,
                                            mode=quantization_mode,
                                            round_mode="HALF_TO_EVEN")
        qint8_tensor = quantize_op[0].eval()
        # Updated min-max values should be passed to the next feeding node.
        min_value = quantize_op[1].eval()
        max_value = quantize_op[2].eval()
    shape = tensor_util.TensorShapeProtoToList(
        input_node.attr["value"].tensor.tensor_shape)
    qint8_const_node = create_constant_node(qint8_const_name,
                                            qint8_tensor,
                                            dtypes.qint8,
                                            shape=shape)
    min_node = create_constant_node(min_name, min_value, dtypes.float32)
    max_node = create_constant_node(max_name, max_value, dtypes.float32)

    dequantize_node = create_node("Dequantize", input_node.name,
                                  [qint8_const_name, min_name, max_name])
    set_attr_dtype(dequantize_node, "T", dtypes.qint8)  # must match the qint8 const input
    set_attr_string(dequantize_node, "mode", b'SCALED')
    return [qint8_const_node, min_node, max_node, dequantize_node]
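For intuition, the sketch below approximates in plain numpy what SCALED-mode qint8 quantization produces for a weight tensor: a symmetric range [-r, r] in which 0.0 maps exactly to integer 0. It is an illustrative reference only, not the exact QuantizeV2 kernel, and scaled_qint8_reference is a made-up name.

import numpy as np

def scaled_qint8_reference(float_tensor):
    # Pick a symmetric range [-r, r], then scale and round into the signed
    # 8-bit range; zero maps exactly to zero, which is what the b'SCALED'
    # Dequantize mode expects.
    r = max(abs(float(float_tensor.min())), abs(float(float_tensor.max())), 1e-6)
    qint8 = np.clip(np.round(float_tensor * 127.0 / r), -127, 127).astype(np.int8)
    return qint8, -r, r

weights = np.array([[-0.5, 0.25], [0.75, -1.0]], dtype=np.float32)
q, mn, mx = scaled_qint8_reference(weights)
print(q)        # int8 values in [-127, 127]
print(mn, mx)   # symmetric range handed to the Dequantize node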
Example No. 3
def quantize_weight_eightbit(input_node, quantization_mode):
    """Returns replacement nodes for input_node using the Dequantize op."""
    base_name = input_node.name + "_"
    quint8_const_name = base_name + "quint8_const"
    min_name = base_name + "min"
    max_name = base_name + "max"
    float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
    min_value = np.min(float_tensor.flatten())
    max_value = np.max(float_tensor.flatten())
    if min_value > 0.0:
        min_value = 0.0
    if min_value == max_value:
        if abs(min_value) < 0.000001:
            max_value = min_value + 1.0
        elif min_value > 0:
            max_value = 2 * min_value
        else:
            max_value = min_value / 2.0

    with session.Session() as sess, sess.as_default():
        quantize_op = array_ops.quantize_v2(float_tensor,
                                            min_value,
                                            max_value,
                                            dtypes.quint8,
                                            mode=quantization_mode)
        quint8_tensor = quantize_op[0].eval()
    shape = tensor_util.TensorShapeProtoToList(
        input_node.attr["value"].tensor.tensor_shape)
    quint8_const_node = create_constant_node(quint8_const_name,
                                             quint8_tensor,
                                             dtypes.quint8,
                                             shape=shape)
    min_node = create_constant_node(min_name, min_value, dtypes.float32)
    max_node = create_constant_node(max_name, max_value, dtypes.float32)
    dequantize_node = create_node("Dequantize", input_node.name,
                                  [quint8_const_name, min_name, max_name])
    set_attr_dtype(dequantize_node, "T", dtypes.quint8)
    set_attr_string(dequantize_node, "mode", quantization_mode)
    return [quint8_const_node, min_node, max_node, dequantize_node]
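As a quick sanity check of what the generated Dequantize node does, the TF2 eager snippet below quantizes a small weight matrix with the public tf.quantization API and dequantizes it again; the round trip should approximate the original values. The MIN_FIRST mode is an illustrative choice here, not taken from the example above.

import numpy as np
import tensorflow as tf

# Round-trip check: Dequantize(Quantize(w)) should approximate w when the
# [min, max] range is chosen as in quantize_weight_eightbit.
w = np.array([[-1.0, 0.3], [0.7, 2.0]], dtype=np.float32)
min_value, max_value = float(w.min()), float(w.max())
if min_value > 0.0:
    min_value = 0.0
q = tf.quantization.quantize(w, min_value, max_value, tf.quint8, mode="MIN_FIRST")
w_hat = tf.quantization.dequantize(q.output, q.output_min, q.output_max, mode="MIN_FIRST")
print(np.abs(w - w_hat.numpy()).max())  # small quantization error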
Example No. 4
    def _intel_cpu_quantize_weight_eightbit(self,
                                            parent,
                                            input_node,
                                            per_channel,
                                            quantization_mode=b"SCALED"):
        """Quantizes a constant weight node and emits qint8 const, min, and max nodes."""
        base_name = input_node.name + "_"
        qint8_const_name = base_name + "qint8_const"
        min_name = base_name + "min"
        max_name = base_name + "max"
        float_tensor = tensor_util.MakeNdarray(input_node.attr["value"].tensor)
        epsilon = 1e-4  # Needs to be set empirically if accuracy is not satisfactory
        range_coefficient = 127 / (2**self.weight_bit - 1)
        if parent in ("Conv2D", "MatMul"):
            if per_channel:
                ranges = np.abs(float_tensor).max(axis=(0, 1, 2))
                ranges *= range_coefficient
                min_value = -ranges
                max_value = ranges
                # nudging min-max values outside epsilon radius around zero
                ranges[ranges < epsilon] = epsilon
                min_value[np.abs(min_value) < epsilon] = -epsilon
                max_value[np.abs(max_value) < epsilon] = epsilon
                qint8_tensor = (float_tensor * 127.0 / ranges).astype(np.int8)
            else:
                min_value = np.min(float_tensor.flatten())
                max_value = np.max(float_tensor.flatten())
                min_value *= range_coefficient
                max_value *= range_coefficient
                # Same processing of min-max as in quantize_weight_eightbit
                # function.
                if min_value > 0.0:
                    min_value = 0.0
                if min_value == max_value:
                    if abs(min_value) < 0.000001:
                        max_value = min_value + 1.0
                    elif min_value > 0:
                        max_value = 2 * min_value
                    else:
                        max_value = min_value / 2.0

                sess = tf.compat.v1.Session()
                with sess.as_default():
                    quantize_op = array_ops.quantize_v2(
                        float_tensor,
                        min_value,
                        max_value,
                        dtypes.qint8,
                        mode=quantization_mode,
                        round_mode="HALF_TO_EVEN")
                    # Updated min-max values should be passed to the next
                    # feeding node.
                    if tf.executing_eagerly():
                        qint8_tensor = quantize_op[0].numpy()
                        min_value = quantize_op[1].numpy()
                        max_value = quantize_op[2].numpy()
                    else:
                        qint8_tensor = quantize_op[0].eval()
                        min_value = quantize_op[1].eval()
                        max_value = quantize_op[2].eval()
                sess.close()
        elif parent == "DepthwiseConv2dNative":
            # Get the max values over dims 0 and 1 for depthwise conv,
            # since the output channel count is dim 2 * dim 3.
            ranges = np.abs(float_tensor).max(axis=(0, 1))
            ranges = ranges.flatten()
            min_value = -ranges
            max_value = ranges
            # nudging min-max values outside epsilon radius around zero
            ranges[ranges < epsilon] = epsilon
            min_value[np.abs(min_value) < epsilon] = -epsilon
            max_value[np.abs(max_value) < epsilon] = epsilon
            # Since the output channels collapse into a single dimension
            # (dim 2 * dim 3), reshape the tensor to 3-D before dividing by
            # the per-channel ranges, whose length matches that last dim.
            a, b, c, d = float_tensor.shape
            qint8_tensor = (float_tensor.reshape(a, b, c * d) * 127.0 /
                            ranges).astype(np.int8)
            # Reshape back to the original 4-D shape.
            qint8_tensor = qint8_tensor.reshape(a, b, c, d)
        shape = tensor_util.TensorShapeProtoToList(
            input_node.attr["value"].tensor.tensor_shape)
        qint8_const_node = helper.create_constant_node(qint8_const_name,
                                                       qint8_tensor,
                                                       dtypes.qint8,
                                                       shape=shape)

        min_node = helper.create_constant_node(min_name,
                                               min_value,
                                               dtypes.float32,
                                               device=self.device)

        max_node = helper.create_constant_node(max_name,
                                               max_value,
                                               dtypes.float32,
                                               device=self.device)

        self.add_output_graph_node(qint8_const_node)
        self.add_output_graph_node(min_node)
        self.add_output_graph_node(max_node)

        return qint8_const_node.name, min_node.name, max_node.name
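The two per-channel branches above differ only in which axes the ranges are reduced over and in the depthwise reshape trick. The short numpy sketch below mirrors that arithmetic on made-up kernel shapes, outside of any graph rewriting.

import numpy as np

np.random.seed(0)
epsilon = 1e-4
conv_kernel = np.random.randn(3, 3, 8, 16).astype(np.float32)  # Conv2D weight, per-channel on the last dim
dw_kernel = np.random.randn(3, 3, 8, 2).astype(np.float32)     # depthwise weight, channels = dim 2 * dim 3

# Conv2D / MatMul path: one symmetric range per output channel (last dim).
ranges = np.abs(conv_kernel).max(axis=(0, 1, 2))
ranges[ranges < epsilon] = epsilon
q_conv = (conv_kernel * 127.0 / ranges).astype(np.int8)

# DepthwiseConv2dNative path: flatten the last two dims so each of the
# dim 2 * dim 3 output channels gets its own range, then restore the shape.
a, b, c, d = dw_kernel.shape
dw_ranges = np.abs(dw_kernel).max(axis=(0, 1)).flatten()
dw_ranges[dw_ranges < epsilon] = epsilon
q_dw = (dw_kernel.reshape(a, b, c * d) * 127.0 / dw_ranges).astype(np.int8)
q_dw = q_dw.reshape(a, b, c, d)

print(q_conv.shape, q_dw.shape)  # (3, 3, 8, 16) (3, 3, 8, 2)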
Example No. 5
    def do_transformation(self):
        """Quantizes the recorded RNN MatMul patterns using the calibration data."""
        g = GraphAnalyzer()
        g.graph = self.model
        graph_info = g.parse_graph()

        for i in self.rnn_details.keys():  # pragma: no cover
            start_node_name = graph_info[i[0]].node.input[0]
            min_str = i[0] + '_eightbit_min_' + \
                start_node_name + '__print__;__min:'
            input_min_values = []
            input_max_values = []
            output_min_values = []
            output_max_values = []
            max_str = i[0] + '_eightbit_max_' + \
                start_node_name + '__print__;__max:'
            output_str = i[0] + \
                '_eightbit_requant_range__print__;__requant_min_max:'
            for j in self.calibration_data:
                if j.find(min_str) != -1:
                    input_min_values.append(
                        float(j.split('[')[-1].split(']')[0]))
                if j.find(max_str) != -1:
                    input_max_values.append(
                        float(j.split('[')[-1].split(']')[0]))

                if j.find(output_str) != -1:
                    output_min_values.append(
                        float(j.split(':')[-1][1:].split(']')[0]))
                    output_max_values.append(float(j.split('][')[-1][:-1]))
            min_input = min(input_min_values)
            max_input = max(input_max_values)
            min_output = min(output_min_values)
            max_output = max(output_max_values)
            q_max_in_node = Helper.create_constant_node(
                i[0] + '_quant_max', max_input, dtypes.float32)

            q_min_in_node = Helper.create_constant_node(
                i[0] + '_quant_min', min_input, dtypes.float32)
            q_enter_min_node = Helper.create_node(
                'Enter', q_min_in_node.name + '_enter', [q_min_in_node.name])
            Helper.set_attr_string(q_enter_min_node, 'frame_name',
                                   self.rnn_details[i].encode())
            Helper.set_attr_dtype(q_enter_min_node, 'T', dtypes.float32)
            Helper.set_attr_bool(q_enter_min_node, 'is_constant', True)
            Helper.set_attr_int(q_enter_min_node, 'parallel_iterations', 32)
            q_enter_max_node = Helper.create_node(
                'Enter', q_max_in_node.name + '_enter', [q_max_in_node.name])
            Helper.set_attr_dtype(q_enter_max_node, 'T', dtypes.float32)
            Helper.set_attr_string(q_enter_max_node, 'frame_name',
                                   self.rnn_details[i].encode())
            Helper.set_attr_bool(q_enter_max_node, 'is_constant', True)
            Helper.set_attr_int(q_enter_max_node, 'parallel_iterations', 32)

            split_node_name = graph_info[i[0]].node.input[1]
            enter_node_name = graph_info[Helper.node_name_from_input(
                split_node_name)].node.input[1]
            weight_node_name = graph_info[Helper.node_name_from_input(
                enter_node_name)].node.input[0]
            weight_node = graph_info[Helper.node_name_from_input(
                weight_node_name)].node
            if weight_node.attr['dtype'].type == dtypes.qint8:
                qint8_const_name = weight_node_name
            else:
                base_name = weight_node_name + "_"
                qint8_const_name = base_name + "qint8_const"
                min_name = base_name + "min"
                max_name = base_name + "max"

            need_to_create_const_node = bool(
                qint8_const_name not in graph_info)
            if need_to_create_const_node:
                float_tensor = tensor_util.MakeNdarray(
                    weight_node.attr["value"].tensor)

                min_value = np.min(float_tensor.flatten())
                max_value = np.max(float_tensor.flatten())
                # Same processing of min-max as in quantize_weight_eightbit
                # function.
                if min_value > 0.0:
                    min_value = 0.0
                if min_value == max_value:
                    if abs(min_value) < 0.000001:
                        max_value = min_value + 1.0
                    elif min_value > 0:
                        max_value = 2 * min_value
                    else:
                        max_value = min_value / 2.0

                sess = tf.compat.v1.Session()
                with sess.as_default():
                    quantize_op = array_ops.quantize_v2(
                        float_tensor,
                        min_value,
                        max_value,
                        dtypes.qint8,
                        mode='SCALED',
                        round_mode="HALF_TO_EVEN")
                    # Updated min-max values should be passed to the next
                    # feeding node.
                    if tf.executing_eagerly():
                        qint8_tensor = quantize_op[0].numpy()
                        min_value = quantize_op[1].numpy()
                        max_value = quantize_op[2].numpy()
                    else:
                        qint8_tensor = quantize_op[0].eval()
                        min_value = quantize_op[1].eval()
                        max_value = quantize_op[2].eval()
                sess.close()

                shape = tensor_util.TensorShapeProtoToList(
                    weight_node.attr["value"].tensor.tensor_shape)
                qint8_const_node = Helper.create_constant_node(
                    qint8_const_name, qint8_tensor, dtypes.qint8, shape=shape)

                min_node = Helper.create_constant_node(min_name, min_value,
                                                       dtypes.float32)

                max_node = Helper.create_constant_node(max_name, max_value,
                                                       dtypes.float32)
                enter_min_node = Helper.create_node('Enter',
                                                    min_name + '_enter',
                                                    [min_name])
                Helper.set_attr_string(enter_min_node, 'frame_name',
                                       self.rnn_details[i].encode())
                Helper.set_attr_dtype(enter_min_node, 'T', dtypes.float32)
                Helper.set_attr_bool(enter_min_node, 'is_constant', True)
                Helper.set_attr_int(enter_min_node, 'parallel_iterations', 32)
                enter_max_node = Helper.create_node('Enter',
                                                    max_name + '_enter',
                                                    [max_name])
                Helper.set_attr_dtype(enter_max_node, 'T', dtypes.float32)
                Helper.set_attr_string(enter_max_node, 'frame_name',
                                       self.rnn_details[i].encode())
                Helper.set_attr_bool(enter_max_node, 'is_constant', True)
                Helper.set_attr_int(enter_max_node, 'parallel_iterations', 32)
            else:
                qint8_const_node = graph_info[qint8_const_name].node
                min_node = graph_info[min_name].node
                max_node = graph_info[max_name].node
            quant_input = [
                start_node_name, q_enter_min_node.name, q_enter_max_node.name
            ]
            quantize_node = Helper.create_node('QuantizeV2',
                                               i[0] + '_quantize', quant_input)
            Helper.set_attr_dtype(quantize_node, "T", dtypes.quint8)
            Helper.set_attr_string(quantize_node, "mode", b"MIN_FIRST")
            g.add_node(quantize_node, start_node_name, [i[0]])
            g.add_node(q_enter_max_node, None, [quantize_node.name])
            g.add_node(q_enter_min_node, None, [quantize_node.name])
            g.add_node(q_max_in_node, None, [q_enter_max_node.name])
            g.add_node(q_min_in_node, None, [q_enter_min_node.name])

            bias_node = graph_info[graph_info[i[0]].outputs[0]].node
            if graph_info[bias_node.name].outputs:
                last_node_name = [
                    graph_info[graph_info[bias_node.name].outputs[0]].node.name
                ]
            else:
                last_node_name = []
            quantized_matmul_input = [
                quantize_node.name,
                Helper.node_name_from_input(graph_info[i[0]].node.input[1]),
                bias_node.input[1]
            ]
            quantized_matmul_input.append(quantize_node.name + ':1')
            quantized_matmul_input.append(quantize_node.name + ':2')

            quantized_matmul_input.append(enter_min_node.name)
            quantized_matmul_input.append(enter_max_node.name)
            quantized_matmul_with_bias_node = Helper.create_node(
                'QuantizedMatMulWithBias', i[0] + '_quantized_mat_mul',
                quantized_matmul_input)
            Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'T1',
                                  dtypes.quint8)
            Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'T2',
                                  dtypes.qint8)
            Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'Tbias',
                                  dtypes.float32)
            Helper.set_attr_dtype(quantized_matmul_with_bias_node, 'Toutput',
                                  dtypes.qint32)
            Helper.set_attr_bool(quantized_matmul_with_bias_node,
                                 'transpose_a', False)
            Helper.set_attr_bool(quantized_matmul_with_bias_node,
                                 'transpose_b', False)
            Helper.set_attr_string(quantized_matmul_with_bias_node,
                                   'input_quant_mode', b"MIN_FIRST")
            g.add_node(quantized_matmul_with_bias_node, quantize_node.name,
                       [bias_node.name])

            if qint8_const_node.name not in graph_info:
                g.add_node(qint8_const_node, None, [enter_node_name])
                enter_node = graph_info[enter_node_name].node
                split_node = graph_info[Helper.node_name_from_input(
                    split_node_name)].node
                Helper.set_attr_dtype(enter_node, 'T', dtypes.qint8)
                Helper.set_attr_dtype(split_node, 'T', dtypes.qint8)
                graph_info[
                    enter_node.name].node.input[0] = qint8_const_node.name

            if need_to_create_const_node:
                g.add_node(enter_min_node, None,
                           [quantized_matmul_with_bias_node.name])
                g.add_node(enter_max_node, None,
                           [quantized_matmul_with_bias_node.name])
                g.add_node(min_node, None, [enter_min_node.name])
                g.add_node(max_node, None, [enter_max_node.name])

            # create requantize node
            requantize_min_node = Helper.create_constant_node(
                i[0] + 'requant_w_min', min_output, dtypes.float32)
            requantize_max_node = Helper.create_constant_node(
                i[0] + 'requant_w_max', max_output, dtypes.float32)

            enter_req_min_node = Helper.create_node(
                'Enter', requantize_min_node.name + '_enter',
                [requantize_min_node.name])
            Helper.set_attr_string(enter_req_min_node, 'frame_name',
                                   self.rnn_details[i].encode())
            Helper.set_attr_dtype(enter_req_min_node, 'T', dtypes.float32)
            Helper.set_attr_bool(enter_req_min_node, 'is_constant', True)
            Helper.set_attr_int(enter_req_min_node, 'parallel_iterations', 32)

            enter_req_max_node = Helper.create_node(
                'Enter', requantize_max_node.name + '_enter',
                [requantize_max_node.name])
            Helper.set_attr_dtype(enter_req_max_node, 'T', dtypes.float32)
            Helper.set_attr_string(enter_req_max_node, 'frame_name',
                                   self.rnn_details[i].encode())
            Helper.set_attr_bool(enter_req_max_node, 'is_constant', True)
            Helper.set_attr_int(enter_req_max_node, 'parallel_iterations', 32)
            requantize_input = [
                quantized_matmul_with_bias_node.name,
                quantized_matmul_with_bias_node.name + ':1',
                quantized_matmul_with_bias_node.name + ':2',
                enter_req_min_node.name, enter_req_max_node.name
            ]
            requantize_node = Helper.create_node('Requantize',
                                                 i[0] + '_requantize',
                                                 requantize_input)
            Helper.set_attr_dtype(requantize_node, 'out_type', dtypes.qint8)
            Helper.set_attr_dtype(requantize_node, 'Tinput', dtypes.qint32)

            g.add_node(requantize_node, quantized_matmul_with_bias_node.name,
                       [bias_node.name])
            dequantize_input = [
                requantize_node.name, requantize_node.name + ':1',
                requantize_node.name + ':2'
            ]
            dequantize_node = Helper.create_node('Dequantize',
                                                 i[0] + '_dequantize',
                                                 dequantize_input)
            Helper.set_attr_dtype(dequantize_node, "T", dtypes.qint8)
            Helper.set_attr_dtype(dequantize_node, "dtype", dtypes.float32)
            Helper.set_attr_string(dequantize_node, "mode", b"MIN_FIRST")

            g.add_node(enter_req_min_node, None, [requantize_node.name])
            g.add_node(enter_req_max_node, None, [requantize_node.name])
            g.add_node(requantize_min_node, None, [enter_req_min_node.name])
            g.add_node(requantize_max_node, None, [enter_req_max_node.name])
            g.add_node(dequantize_node, requantize_node.name, last_node_name)
            if last_node_name:
                graph_info[
                    last_node_name[0]].node.input[0] = dequantize_node.name
            g.remove_node(bias_node.name)
            g.remove_node(i[0])

            # g.remove_node(weight_node_name)

        return g.dump_graph()
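The calibration-data parsing at the top of the loop is string-driven; the standalone sketch below applies the same split logic to hypothetical log lines. The node and tensor names are invented; only the __print__;__min: / __max: / __requant_min_max: format and the splitting mirror the code above.

# Hypothetical calibration log lines; only the format and split logic mirror
# the transformation above.
calibration_data = [
    ";lstm_matmul_eightbit_min_input_node__print__;__min:[-2.5]",
    ";lstm_matmul_eightbit_max_input_node__print__;__max:[3.75]",
    ";lstm_matmul_eightbit_requant_range__print__;__requant_min_max:[-1.0][6.0]",
]
min_str = "lstm_matmul" + "_eightbit_min_" + "input_node" + "__print__;__min:"
max_str = "lstm_matmul" + "_eightbit_max_" + "input_node" + "__print__;__max:"
output_str = "lstm_matmul" + "_eightbit_requant_range__print__;__requant_min_max:"

for j in calibration_data:
    if j.find(min_str) != -1:
        print("input min:", float(j.split('[')[-1].split(']')[0]))        # -2.5
    if j.find(max_str) != -1:
        print("input max:", float(j.split('[')[-1].split(']')[0]))        # 3.75
    if j.find(output_str) != -1:
        print("requant min:", float(j.split(':')[-1][1:].split(']')[0]))  # -1.0
        print("requant max:", float(j.split('][')[-1][:-1]))              # 6.0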