Example #1
    def test_sparse_qlinear_serdes(self):
        # Note: At the moment, for sparse kernels,
        # fbgemm supports only statically quantized sparse linear and
        # qnnpack supports only dynamically quantized sparse linear.
        # Hence we have two different tests:
        # fbgemm tests the static flow, qnnpack tests the dynamic flow.
        # These should be unified later on and the tests fixed accordingly.
        model_class = SparseQuantizedModel
        fqn_to_check = "linear"
        if qengine_is_fbgemm():
            sparse_mapping = tq.get_default_static_sparse_quant_module_mappings(
            )
            ref_mapping = tq.get_default_static_quant_module_mappings()
            qconfig_dict = {nn.Linear: tq.get_default_qconfig("fbgemm")}
        elif qengine_is_qnnpack():
            sparse_mapping = tq.get_default_dynamic_sparse_quant_module_mappings(
            )
            ref_mapping = tq.get_default_dynamic_quant_module_mappings()
            qconfig_dict = {nn.Linear: tq.qconfig.default_dynamic_qconfig}
        else:
            return

        _sparse_layer_test_helper(
            model_class=model_class,
            sparse_mapping=sparse_mapping,
            ref_mapping=ref_mapping,
            qconfig_dict=qconfig_dict,
            fqn_to_check=fqn_to_check,
            test_class=self,
            test_scripting=True,
        )
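
SparseQuantizedModel and the qengine_is_* helpers are defined elsewhere in the test file. For reference, a minimal sketch of what such a model could look like (an assumption for illustration, not the original definition):

import torch.nn as nn

class SparseQuantizedModel(nn.Module):
    # A single nn.Linear whose weight later receives a 1x4 block-sparse pattern.
    def __init__(self, input_channels, output_channels):
        super().__init__()
        self.linear = nn.Linear(input_channels, output_channels)

    def forward(self, x):
        return self.linear(x)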
Example #2
    def test_qlinear_packed_params_qnnpack(self):
        torch.manual_seed(0)
        with override_quantized_engine('qnnpack'):
            with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
                self.test_qlinear_packed_params(
                    allow_non_zero_zero_points=True)
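
override_quantized_engine and override_cpu_allocator_for_qnnpack come from PyTorch's quantization test utilities; the former simply swaps torch.backends.quantized.engine for the duration of the with-block. A rough sketch of the idea (a hypothetical re-implementation for illustration, not the actual helper):

from contextlib import contextmanager

import torch

@contextmanager
def _override_quantized_engine(engine):
    # Temporarily select a quantized backend ('fbgemm', 'qnnpack', ...) and restore the old one.
    previous = torch.backends.quantized.engine
    torch.backends.quantized.engine = engine
    try:
        yield
    finally:
        torch.backends.quantized.engine = previous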
Example #3
    def test_qlinear_packed_params(self, allow_non_zero_zero_points=False):
        # copied from https://pytorch.org/docs/stable/sparse.html#csr-tensor-operations,
        # so row/col block indices match that example, but with blocks and
        # scaled rows
        weight_fp32 = torch.Tensor([
            [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0],
            [6, 6, 6, 6, 12, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ])

        row_block_size = 1
        col_block_size = 4
        out_features = weight_fp32.shape[0]
        in_features = weight_fp32.shape[1]

        scales = [2.0, 6.0, 12.0]
        zero_points = [((i + 1) if allow_non_zero_zero_points else 0)
                       for i in range(out_features)]
        dtype = torch.qint8

        wide_weight_fp32 = torch.zeros(
            (3, 4008))  # 4000 is the tile width for FBGEMM
        wide_weight_fp32[0][0] = 4
        wide_weight_fp32[0][4004] = 6
        wide_weight_fp32[1][0] = 8
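        # Note: with col_block_size == 4, column 4004 falls into column block
        # 4004 // 4 == 1001, which is what per_tensor_large checks for below.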

        per_tensor_small = (
            torch.quantize_per_tensor(weight_fp32, scales[0], zero_points[0],
                                      dtype),
            True,
            [0, 1, 3, 3],
            [2, 0, 1],
            [
                x + (1 if allow_non_zero_zero_points else 0)
                for x in [1, 1, 1, 1, 3, 3, 3, 3, 6, 6, 6, 6]
            ],
        )

        per_channel_small = (
            torch.quantize_per_channel(
                weight_fp32,
                torch.Tensor(scales),
                torch.Tensor(zero_points).to(torch.int),
                0,  # axis = 0
                dtype,
            ),
            False,
            [0, 1, 3, 3],
            [2, 0, 1],
            [
                x + ([1, 2, 2][i // 4] if allow_non_zero_zero_points else 0)
                for (i, x) in enumerate([1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2])
            ],
        )

        per_tensor_large = (
            torch.quantize_per_tensor(
                wide_weight_fp32,
                scales[0],
                zero_points[0],
                dtype,
            ),
            True,
            [0, 2, 3, 3],
            [0, 1001, 0],
            [
                x + (1 if allow_non_zero_zero_points else 0)
                for x in [2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0]
            ],
        )

        for (weight, is_per_tensor_quantized, expected_row_block_indices,
             expected_col_block_indices, expected_weights) in [
                 per_tensor_small, per_channel_small, per_tensor_large
             ]:
            lin = Linear(
                out_features=weight.shape[0],
                in_features=weight.shape[1],
                row_block_size=row_block_size,
                col_block_size=col_block_size,
                bias=True,
                dtype=dtype,
            )

            bias = torch.ones(size=(weight.shape[0], ))

            lin.set_weight_bias(weight, bias, row_block_size, col_block_size)

            serialized = lin._packed_params._packed_params.__getstate__()

            (
                _,  # version
                bias_,
                out_features_block_size_,
                in_features_block_size_,
                weight_scales_,
                weight_zero_points_,
                quantization_scheme_,
                row_block_indices_,
                col_block_indices_,
                weights_,
                output_channels_,
                input_channels_) = serialized[0]

            # Test Serialization
            self.assertEqual(bias_, bias)
            self.assertEqual(out_features_block_size_, row_block_size)
            self.assertEqual(in_features_block_size_, col_block_size)
            self.assertEqual(
                weight_scales_,
                [scales[0]] if is_per_tensor_quantized else scales)
            self.assertEqual(
                weight_zero_points_,
                [zero_points[0]] if is_per_tensor_quantized else zero_points)
            self.assertEqual(quantization_scheme_, is_per_tensor_quantized)
            self.assertEqual(row_block_indices_, expected_row_block_indices)
            self.assertEqual(col_block_indices_, expected_col_block_indices)
            self.assertEqual(
                weights_.tolist(),
                [v + 128
                 for v in expected_weights])  # weights are serialized as +128
            self.assertEqual(output_channels_, weight.shape[0])
            self.assertEqual(input_channels_, weight.shape[1])

            # Test Unpacking
            (weights_, bias_, out_features_block_size_,
             in_features_block_size_) = lin._weight_bias()
            self.assertEqual(torch.dequantize(weights_),
                             torch.dequantize(weight))
            self.assertEqual(bias_, bias)
            self.assertEqual(out_features_block_size_, row_block_size)
            self.assertEqual(in_features_block_size_, col_block_size)

            # Test Deserialization
            with tempfile.TemporaryFile() as file_buff:
                torch.save(lin, file_buff)
                file_buff.seek(0)
                lin2 = torch.load(file_buff)
                self.assertEqual(lin._weight_bias(), lin2._weight_bias())
                # Serialize -> Deserialize -> Serialize should match Serialize
                self.assertEqual(
                    serialized,
                    lin2._packed_params._packed_params.__getstate__())

                # Test that op output is preserved by serialize -> deserialize
                if qengine_is_qnnpack():
                    x = torch.rand(size=(1, weight.shape[1]))
                    y1 = lin(x)
                    y2 = lin2(x)
                    self.assertEqual(y1, y2)
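
The expected row/col block indices above are just the BSR (block compressed sparse row) layout of the weight with 1x4 blocks. A quick standalone sanity check of the per_tensor_small case, assuming a PyTorch version that provides Tensor.to_sparse_bsr:

import torch

weight_fp32 = torch.tensor([
    [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0],
    [6, 6, 6, 6, 12, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
], dtype=torch.float32)

bsr = weight_fp32.to_sparse_bsr((1, 4))        # 1x4 blocks, as in the test
print(bsr.crow_indices().tolist())             # [0, 1, 3, 3] -> expected_row_block_indices
print(bsr.col_indices().tolist())              # [2, 0, 1]    -> expected_col_block_indices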
Example #4
    def test_sparse_qlinear(self):
        batch_size = 12
        input_channels = 16
        output_channels = 4
        decimal_val = 4
        row_block_size = 1
        col_block_size = 4

        # The x86 implementation of sparse ops in qnnpack only supports
        # the 1x4 block pattern; the ARM kernels support both 1x4 and 8x1.
        # This distinction exists only because the x86 implementations are
        # there to enable testing of the integration path.
        # We do plan to add 8x1 as well so that testing does not have to
        # special-case like this; at the moment it is deprioritized in
        # favor of other higher-priority work.
        if qengine_is_qnnpack() and not (row_block_size == 1
                                         and col_block_size == 4):
            return
        # ONEDNN does not support this yet
        if qengine_is_onednn():
            return

        dense_prepack = torch.ops.quantized.linear_prepack
        dense_qlinear = torch.ops.quantized.linear
        dense_qlinear_dynamic = torch.ops.quantized.linear_dynamic

        sparse_prepack = torch.ops.sparse.qlinear_prepack
        sparse_qlinear = torch.ops.sparse.qlinear
        sparse_qlinear_dynamic = torch.ops.sparse.qlinear_dynamic

        X_scale = 0.2
        X_zp = 2
        X_fp32 = torch.randn(batch_size, input_channels, dtype=torch.float32)
        float_bias = torch.randn(output_channels, dtype=torch.float32)

        W_scales = torch.rand(output_channels, dtype=torch.float32)
        W_zps = torch.zeros(output_channels, dtype=torch.int32)
        W_fp32 = torch.randn(output_channels,
                             input_channels,
                             dtype=torch.float32)

        with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
            X_q = torch.quantize_per_tensor(X_fp32,
                                            scale=X_scale,
                                            zero_point=X_zp,
                                            dtype=torch.quint8)

            for use_channelwise, dynamic_mode in product([True, False],
                                                         [True, False]):
                if qengine_is_fbgemm() and dynamic_mode:
                    logging.info(
                        "dynamic sparse qlinear is only available in qnnpack")
                    continue
                if qengine_is_qnnpack() and not dynamic_mode:
                    logging.info(
                        "static sparse qlinear is only available in fbgemm")
                    continue
                if use_channelwise:
                    W_q = torch.quantize_per_channel(W_fp32,
                                                     scales=W_scales,
                                                     zero_points=W_zps,
                                                     axis=0,
                                                     dtype=torch.qint8)
                else:
                    W_q = torch.quantize_per_tensor(W_fp32,
                                                    scale=W_scales[0],
                                                    zero_point=W_zps[0],
                                                    dtype=torch.qint8)

                Y_scale = 1.1234
                Y_zp = 5
                W_prepack_dense = dense_prepack(W_q, float_bias)
                W_prepack_sparse = sparse_prepack(W_q, float_bias,
                                                  row_block_size,
                                                  col_block_size)

                if dynamic_mode:
                    Y = sparse_qlinear_dynamic(X_fp32, W_prepack_sparse)
                    Y_ref = dense_qlinear_dynamic(X_fp32, W_prepack_dense)

                    np.testing.assert_array_almost_equal(Y_ref.numpy(),
                                                         Y.numpy(),
                                                         decimal=decimal_val)
                else:
                    Y_q = sparse_qlinear(X_q, W_prepack_sparse, Y_scale, Y_zp)
                    Y_q_ref = dense_qlinear(X_q, W_prepack_dense, Y_scale,
                                            Y_zp)

                    np.testing.assert_array_almost_equal(
                        Y_q_ref.int_repr().numpy(),
                        Y_q.int_repr().numpy(),
                        decimal=decimal_val)
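
For reference, torch.quantize_per_tensor above follows the usual affine mapping q = clamp(round(x / scale) + zero_point, qmin, qmax). A tiny standalone check with made-up values (illustrative only):

import torch

x = torch.tensor([0.5, -1.0, 2.0])
q = torch.quantize_per_tensor(x, scale=0.2, zero_point=2, dtype=torch.quint8)
# round(0.5 / 0.2) + 2 == 4 (round half to even),
# round(-1.0 / 0.2) + 2 == -3 -> clamped to 0,
# round(2.0 / 0.2) + 2 == 12
assert q.int_repr().tolist() == [4, 0, 12]
assert torch.allclose(q.dequantize(), torch.tensor([0.4, -0.4, 2.0]))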
Example #5
def _sparse_layer_test_helper(
    model_class,
    sparse_mapping,
    ref_mapping,
    qconfig_dict,
    fqn_to_check,
    test_class,
    test_scripting,
):
    # SET UP TEST PARAMETERS, INPUTS AND WEIGHTS
    # ------------------------------------------
    batch_size = 12
    input_channels = 4
    output_channels = 7
    model = model_class(input_channels, output_channels)

    # For sparse kernels both the activation and weight ZP = 0
    X_scale = 0.2
    X_zp = 2
    W_scale = 1e-2
    W_zp = 0

    X_fp32 = torch.randn(batch_size, input_channels, dtype=torch.float32)
    float_bias = torch.randn(output_channels, dtype=torch.float32)

    # generate a weight which we'll insert into the model
    W_fp32 = torch.randn(output_channels, input_channels, dtype=torch.float32)
    mask = torch.randint(0, 2, W_fp32.shape)
    W_fp32 *= mask
    with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
        X_q = torch.quantize_per_tensor(X_fp32,
                                        scale=X_scale,
                                        zero_point=X_zp,
                                        dtype=torch.quint8)
        X_fp32 = X_q.dequantize()

        W_q = torch.quantize_per_tensor(W_fp32, W_scale, W_zp, torch.qint8)

        # PREPARE MODELS FOR QUANTIZATION
        # -------------------------------
        model.linear.weight = nn.Parameter(W_q.dequantize())
        model.eval()

        # Add `sparse_params` to the model. The test for correct
        # sparse_param addition is in the sparsifier tests
        model.linear.sparse_params = {"sparse_block_shape": (1, 4)}

        # generate model versions
        qmodel = copy.deepcopy(model)
        sqmodel = copy.deepcopy(model)

        # generate model versions and apply qconfigs
        tq.propagate_qconfig_(qmodel, qconfig_dict)
        tq.propagate_qconfig_(sqmodel, qconfig_dict)

        tq.prepare(qmodel, inplace=True)
        tq.prepare(sqmodel, inplace=True)

        # calibrate
        with torch.no_grad():
            qmodel(X_fp32)
            sqmodel(X_fp32)

        # ACTUAL TESTING BEGINS HERE
        # --------------------------

        # Make sure the quantization parameters are computed the same way
        qparams = qmodel.linear.qconfig.weight().calculate_qparams()
        sqparams = sqmodel.linear.qconfig.weight().calculate_qparams()
        test_class.assertEqual(qparams, sqparams)

        sqmodule_to_check = fqn_to_module(sqmodel, fqn_to_check)
        sqmodule_start_class = sqmodule_to_check.__class__
        sqmodule_expected_converted_class = sparse_mapping[
            sqmodule_start_class]

        qmodule_to_check = fqn_to_module(qmodel, fqn_to_check)
        qmodule_start_class = qmodule_to_check.__class__
        qmodule_expected_converted_class = ref_mapping[qmodule_start_class]

        # need to determine whether dynamic quantization is being performed,
        # since the input dtype will be different at the end
        is_dynamic = isinstance(qmodule_to_check.activation_post_process,
                                tq.PlaceholderObserver)

        tq.convert(sqmodel, inplace=True, mapping=sparse_mapping)
        tq.convert(qmodel, inplace=True, mapping=ref_mapping)

        # re-fetch the modules: the references obtained above do not
        # update to the post-convert modules
        sqmodule_to_check = fqn_to_module(sqmodel, fqn_to_check)
        qmodule_to_check = fqn_to_module(qmodel, fqn_to_check)

        # check that the modules were converted as expected
        assert isinstance(sqmodule_to_check,
                          sqmodule_expected_converted_class), "Convert failed"
        assert isinstance(qmodule_to_check,
                          qmodule_expected_converted_class), "Mapping failed"

        row_block_size, col_block_size = sqmodel.linear._packed_params._weight_bias(
        )[2:]
        assert row_block_size == 1 and col_block_size == 4

        # only run during serialization/deserialization tests;
        # makes sure script/save/load doesn't corrupt the sqmodel
        if test_scripting:
            scripted_sqmodel = torch.jit.script(sqmodel)
            scripted_sqmodel.eval()
            buffer = io.BytesIO()
            torch.jit.save(scripted_sqmodel, buffer)
            buffer.seek(0)
            sqmodel = torch.jit.load(buffer)

        # use correct input dtype
        if is_dynamic:
            Y_ref = qmodel(X_fp32)
            Y_hat = sqmodel(X_fp32)
            test_class.assertEqual(Y_ref, Y_hat)
        else:
            Y_ref = qmodel(X_q)
            Y_hat = sqmodel(X_q)
            test_class.assertEqual(Y_ref.dequantize(), Y_hat.dequantize())
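
fqn_to_module resolves a dotted fully-qualified name such as "linear" or "block.linear" to the corresponding submodule. A minimal re-implementation of the idea (hypothetical, for illustration; the test imports the real helper):

from typing import Optional

import torch.nn as nn

def fqn_to_module(model: Optional[nn.Module], fqn: str) -> Optional[nn.Module]:
    # Walk the dotted path down the module tree, returning None if any step is missing.
    for name in fqn.split("."):
        if model is None or not hasattr(model, name):
            return None
        model = getattr(model, name)
    return model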
Example #6
    def test_sparse_qlinear_serdes(self):
        batch_size = 12
        input_channels = 4
        output_channels = 7
        model = self.SparseQuantizedModel(input_channels, output_channels)

        # For sparse kernels both the activation and weight ZP = 0
        X_scale = 0.2
        X_zp = 0
        W_scale = 1e-2
        W_zp = 0

        with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
            X_fp32 = torch.randn(batch_size,
                                 input_channels,
                                 dtype=torch.float32)
            float_bias = torch.randn(output_channels, dtype=torch.float32)

            X_q = torch.quantize_per_tensor(X_fp32,
                                            scale=X_scale,
                                            zero_point=X_zp,
                                            dtype=torch.quint8)
            X_fp32 = X_q.dequantize()

            W_fp32 = torch.randn(output_channels,
                                 input_channels,
                                 dtype=torch.float32)
            mask = torch.randint(0, 2, W_fp32.shape)
            W_fp32 *= mask
            W_q = torch.quantize_per_tensor(W_fp32, W_scale, W_zp, torch.qint8)

            model.linear.weight = nn.Parameter(W_q.dequantize())
            model.linear.sparse_params = {'sparse_block_shape': (1, 4)}
            model.eval()

            # Note: At the moment, for sparse kernels,
            # fbgemm supports only statically quantized sparse linear and
            # qnnpack supports only dynamically quantized sparse linear.
            # Hence we have two different tests:
            # fbgemm tests the static flow, qnnpack tests the dynamic flow.
            # These should be unified later on and the tests fixed accordingly.
            if qengine_is_fbgemm():
                model.qconfig = tq.get_default_qconfig('fbgemm')
                qmodel = copy.deepcopy(model)
                sqmodel = copy.deepcopy(model)

                tq.prepare(qmodel, inplace=True)
                tq.prepare(sqmodel, inplace=True)

                with torch.no_grad():
                    qmodel(X_fp32)
                    sqmodel(X_fp32)

                # Make sure the quantization parameters are computed the same way
                qparams = qmodel.linear.qconfig.weight().calculate_qparams()
                sqparams = sqmodel.linear.qconfig.weight().calculate_qparams()
                self.assertEqual(qparams, sqparams)

                # Make sure the mapping to sparse kernels does not affect
                # the non-sparse model
                sparse_mapping = tq.get_default_static_quant_module_mappings()
                sparse_mapping[nn.Linear] = ao_nn_sq.Linear
                tq.convert(sqmodel, inplace=True, mapping=sparse_mapping)
                tq.convert(qmodel, inplace=True)

                assert isinstance(sqmodel.linear,
                                  ao_nn_sq.Linear), "Convert failed"
                assert isinstance(qmodel.linear,
                                  nn.quantized.Linear), "Mapping failed"

                scripted_sqmodel = torch.jit.script(sqmodel)
                scripted_sqmodel.eval()
                buffer = io.BytesIO()
                torch.jit.save(scripted_sqmodel, buffer)
                buffer.seek(0)
                sqmodel = torch.jit.load(buffer)

                # Make sure numerics are right
                Y_ref = qmodel(X_q)
                Y_hat = sqmodel(X_q)
                self.assertEqual(Y_ref.dequantize(), Y_hat.dequantize())

            elif qengine_is_qnnpack():
                qconfig = {nn.Linear: tq.qconfig.default_dynamic_qconfig}
                dqmodel = copy.deepcopy(model)
                sdqmodel = copy.deepcopy(model)

                tq.propagate_qconfig_(dqmodel, qconfig)
                tq.propagate_qconfig_(sdqmodel, qconfig)

                # Make sure the quantization parameters are computed the same way
                qparams = dqmodel.linear.qconfig.weight().calculate_qparams()
                sqparams = sdqmodel.linear.qconfig.weight().calculate_qparams()
                self.assertEqual(qparams, sqparams)

                # Make sure the mapping to sparse kernels does not affect
                # the non-sparse model
                sparse_mapping = copy.deepcopy(
                    tq.get_default_dynamic_quant_module_mappings())
                sparse_mapping[nn.Linear] = ao_nn_sq.dynamic.Linear
                with LinearBlockSparsePattern(1, 4):
                    tq.convert(sdqmodel, inplace=True, mapping=sparse_mapping)
                tq.convert(
                    dqmodel,
                    mapping=tq.get_default_dynamic_quant_module_mappings(),
                    inplace=True)

                assert isinstance(sdqmodel.linear,
                                  ao_nn_sq.dynamic.Linear), "Convert failed"
                assert isinstance(
                    dqmodel.linear,
                    nn.quantized.dynamic.Linear), "Mapping failed"

                scripted_sdqmodel = torch.jit.script(sdqmodel)
                scripted_sdqmodel.eval()
                buffer = io.BytesIO()
                torch.jit.save(scripted_sdqmodel, buffer)
                buffer.seek(0)
                sdqmodel = torch.jit.load(buffer)

                # Make sure numerics are right
                Y_ref = dqmodel(X_fp32)
                Y_hat = sdqmodel(X_fp32)
                self.assertEqual(Y_ref, Y_hat)
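
The examples above rely on a handful of aliases and helpers (tq, ao_nn_sq, the Linear used in Example #3, LinearBlockSparsePattern, the qengine_is_* checks). A plausible import block, inferred from usage rather than copied from the original file:

import copy
import io
import logging
import tempfile
from itertools import product

import numpy as np
import torch
import torch.ao.nn.sparse.quantized as ao_nn_sq
import torch.ao.quantization as tq
import torch.nn as nn
from torch.ao.nn.sparse.quantized import Linear
from torch.ao.nn.sparse.quantized.utils import LinearBlockSparsePattern
from torch.testing._internal.common_quantized import (
    override_cpu_allocator_for_qnnpack,
    override_quantized_engine,
    qengine_is_fbgemm,
    qengine_is_onednn,
    qengine_is_qnnpack,
)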