    def test_dnnlowp_dequantize(self, size, gc, dc):
        min_ = -10.0
        max_ = 20.0
        X = (np.random.rand(size) * (max_ - min_) + min_).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_type_list = ["Dequantize", "Int8Dequantize"]
        engine = "DNNLOWP"

        outputs.append(Output(X, op_type="", engine=""))

        for op_type in op_type_list:
            net = core.Net("test_net")

            quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                           engine=engine,
                                           device_option=gc)
            net.Proto().op.extend([quantize])

            dequantize = core.CreateOperator(op_type, ["X_q"], ["Y"],
                                             engine=engine,
                                             device_option=gc)
            net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
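
    # Illustrative sketch, not part of the original tests: the affine
    # quantization scheme the Quantize/Dequantize ops above implement. The
    # helper name is hypothetical; it relies on the module-level numpy
    # import (np) used throughout this file.
    @staticmethod
    def _affine_round_trip_sketch(X, min_, max_):
        scale = (max_ - min_) / 255.0              # one step of the uint8 grid
        zero_point = int(np.round(-min_ / scale))  # where real 0.0 lands
        X_q = np.clip(np.round(X / scale) + zero_point, 0, 255).astype(np.uint8)
        X_dq = scale * (X_q.astype(np.float32) - zero_point)
        # for X within [min_, max_], dequantization recovers X up to half a step
        assert np.all(np.abs(X_dq - X) <= scale / 2 + 1e-5)
        return X_q, X_dq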

    def test_dnnlowp_gather(self, dim1, dim2, is_empty, in_quantized,
                            out_quantized, gc, dc):
        if is_empty:
            dim2 = 0
        # FIXME: DNNLOWP Gather doesn't support a quantized input with a
        # dequantized output, so force the output to stay quantized.
        if in_quantized:
            out_quantized = True

        data = (np.random.rand(dim1) * 2 - 1).astype(np.float32)
        index = np.floor(np.random.rand(dim2) * dim1).astype(np.int32)

        Output = collections.namedtuple("Output", ["out", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("Gather", ""),
            ("Gather", "DNNLOWP"),
            ("Int8Gather", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize:
                quantize_data = core.CreateOperator("Quantize", ["data"],
                                                    ["data_q"],
                                                    engine=engine,
                                                    device_option=gc)
                net.Proto().op.extend([quantize_data])

            gather = core.CreateOperator(
                op_type,
                ["data_q" if do_quantize else "data", "index"],
                ["out_q" if do_dequantize else "out"],
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([gather])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["out_q"],
                                                 ["out"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("data").feed(data, device_option=gc)
            self.ws.create_blob("index").feed(index, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(out=self.ws.blobs["out"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs, ref=data)

    def test_dnnlowp_relu(self, size, gc, dc):
        min_ = -10.
        max_ = 10.
        scale = (max_ - min_) / 255
        zero_point = int(np.round(-min_ / scale))
        X = (np.random.rand(size) * (max_ - min_) + min_).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("Relu", ""),
            ("Relu", "DNNLOWP"),
            ("Int8Relu", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            if engine == "DNNLOWP":
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    engine=engine,
                    device_option=gc,
                    Y_scale=scale,
                    Y_zero_point=zero_point,
                )
                net.Proto().op.extend([quantize])

            relu = core.CreateOperator(
                op_type,
                ["X_q" if engine == "DNNLOWP" else "X"],
                ["Y_q" if engine == "DNNLOWP" else "Y"],
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([relu])

            if engine == "DNNLOWP":
                dequantize = core.CreateOperator(
                    "Dequantize",
                    ["Y_q"],
                    ["Y"],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        # Y = max(0, X) so the only error is quantization of inputs
        check_quantized_results_close(outputs, ref=X)
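
    # Sketch of why the comment above holds (hypothetical helper, assuming
    # the zero_point arithmetic used in this test): real 0.0 is exactly
    # representable at zero_point, so ReLU in the quantized domain is just a
    # clamp at zero_point and adds no error beyond input quantization.
    @staticmethod
    def _quantized_relu_sketch(X_q, zero_point):
        # equivalent to dequantize -> max(0, .) -> requantize at same params
        return np.maximum(X_q, np.uint8(zero_point))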
    def test_dnnlowp_elementwise_sum_int(self, N, M, gc, dc):
        # All inputs have scale 1, so exactly represented after quantization
        inputs = M * [None]
        X_names = M * [None]
        X_q_names = M * [None]

        for i in range(M):
            X = np.random.randint(-128, 127, N, np.int8).astype(np.float32)
            X[0] = -128
            X[-1] = 127
            inputs[i] = X
            X_names[i] = chr(ord("A") + i)
            X_q_names[i] = X_names[i] + "_q"

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [("Sum", ""), ("Sum", "DNNLOWP"),
                          ("Int8Sum", "DNNLOWP")]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            if engine == "DNNLOWP":
                for i in range(M):
                    quantize = core.CreateOperator(
                        "Quantize",
                        X_names[i],
                        X_q_names[i],
                        engine=engine,
                        device_option=gc,
                    )
                    net.Proto().op.extend([quantize])

            sum_ = core.CreateOperator(
                op_type,
                X_q_names if engine == "DNNLOWP" else X_names,
                ["Y_q" if engine == "DNNLOWP" else "Y"],
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([sum_])

            if engine == "DNNLOWP":
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            for i in range(M):
                self.ws.create_blob(X_names[i]).feed(inputs[i], device_option=gc)

            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
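
    # Sketch (assumed arithmetic, hypothetical helper): why "scale 1" inputs
    # are exact. For an integer-valued tensor spanning exactly 256 values,
    # the derived scale is 1.0, so round(x / scale) loses nothing and the
    # quantized Sum can match the fp32 Sum bit for bit.
    @staticmethod
    def _scale_one_is_exact_sketch(X):
        min_, max_ = float(X.min()), float(X.max())  # e.g. -128 and 127
        scale = (max_ - min_) / 255.0                # 1.0 for that range
        zero_point = int(np.round(-min_ / scale))
        X_q = np.round(X / scale) + zero_point       # lands exactly on integers
        X_dq = scale * (X_q - zero_point)
        assert scale == 1.0 and np.array_equal(X_dq, X)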

    def test_dnnlowp_elementwise_add_broadcast_axis(self, gc, dc):
        for bdim, axis in [
            ((3, 4), 1),  # broadcasting intermediate dimensions
            ((2, ), 0),  # broadcasting the first dimension
            ((1, 4, 1), 1),  # broadcasting with single elem dims at both ends
        ]:

            min_ = -100
            max_ = min_ + 255
            A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_)
            A = A.astype(np.float32)
            B = np.round(np.random.rand(*bdim) * 255 / 2 - 64).astype(
                np.float32)

            A.flat[0] = min_
            A.flat[1] = max_
            B.flat[0] = -64
            B.flat[1] = 127. / 2

            Output = collections.namedtuple("Output",
                                            ["Y", "op_type", "engine"])
            outputs = []

            op_engine_list = [
                ("Add", ""),
                ("Add", "DNNLOWP"),
                ("Int8Add", "DNNLOWP"),
            ]

            for op_type, engine in op_engine_list:
                net = core.Net("test_net")

                add = core.CreateOperator(
                    op_type,
                    ['A', 'B'],
                    ['Y'],
                    engine=engine,
                    device_option=gc,
                    broadcast=1,
                    axis=axis,
                    dequantize_output=1,
                )
                net.Proto().op.extend([add])

                self.ws.create_blob('A').feed(A, device_option=gc)
                self.ws.create_blob('B').feed(B, device_option=gc)
                self.ws.run(net)
                outputs.append(
                    Output(Y=self.ws.blobs["Y"].fetch(),
                           op_type=op_type,
                           engine=engine))

            check_quantized_results_close(outputs)
    def test_dnnlowp_elementwise_mul_broadcast_axis(self, gc, dc):
        for bdim, axis in [
            ((3, 4), 1),  # broadcasting intermediate dimensions
            ((2, ), 0),  # broadcasting the first dimension
            ((1, 4, 1), 1),  # broadcasting with single elem dims at both ends
        ]:

            min_ = -100
            max_ = min_ + 255
            A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_)
            A = A.astype(np.float32)

            B = np.round(np.random.rand(*bdim) * 255 - 128).astype(np.float32)

            A.flat[0] = min_
            A.flat[1] = max_
            B.flat[0] = -128
            B.flat[1] = 127

            Output = collections.namedtuple("Output", ["Y", "engine"])
            outputs = []

            engine_list = ["", "DNNLOWP"]
            for engine in engine_list:
                net = core.Net("test_net")

                mul = core.CreateOperator(
                    "Mul",
                    ["A", "B"],
                    ["Y"],
                    engine=engine,
                    device_option=gc,
                    broadcast=1,
                    axis=axis,
                    dequantize_output=1,
                )
                net.Proto().op.extend([mul])

                self.ws.create_blob("A").feed(A, device_option=gc)
                self.ws.create_blob("B").feed(B, device_option=gc)
                self.ws.run(net)
                outputs.append(
                    Output(Y=self.ws.blobs["Y"].fetch(), engine=engine))

            check_quantized_results_close(outputs)
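
    # Sketch of the legacy Caffe2 broadcast exercised above (assumed
    # semantics, hypothetical helper): with broadcast=1 and an explicit axis,
    # B is aligned to A starting at `axis` and padded with size-1 dims on
    # both sides, after which plain numpy broadcasting finishes the job.
    @staticmethod
    def _broadcast_with_axis_sketch(A, B, axis):
        new_shape = (1,) * axis + B.shape + (1,) * (A.ndim - axis - B.ndim)
        return A * B.reshape(new_shape)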

    def test_dnnlowp_elementwise_add_broadcast(self, gc, dc):
        # Set broadcast and no axis, i.e. broadcasting last dimensions.
        min_ = -100
        max_ = min_ + 255
        A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_)
        A = A.astype(np.float32)
        A[0, 0, 0, 0] = min_
        A[0, 0, 0, 1] = max_

        B = np.round(np.random.rand(4, 5) * 255 / 2 - 64).astype(np.float32)
        B[0, 0] = -64
        B[0, 1] = 127. / 2

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("Add", ""),
            ("Add", "DNNLOWP"),
            ("Int8Add", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            add = core.CreateOperator(
                op_type,
                ['A', 'B'],
                ['Y'],
                engine=engine,
                device_option=gc,
                broadcast=1,
                dequantize_output=1,
            )
            net.Proto().op.extend([add])

            self.ws.create_blob('A').feed(A, device_option=gc)
            self.ws.create_blob('B').feed(B, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)

    def test_dnnlowp_elementwise_mul_broadcast(self, gc, dc):
        # Set broadcast and no axis, i.e. broadcasting last dimensions.
        min_ = -100
        max_ = min_ + 255
        A = np.round(np.random.rand(2, 3, 4, 5) * (max_ - min_) + min_)
        A = A.astype(np.float32)
        A[0, 0, 0, 0] = min_
        A[0, 0, 0, 1] = max_

        B = np.round(np.random.rand(4, 5) * 255 - 128).astype(np.float32)
        B[0, 0] = -128
        B[0, 1] = 127

        Output = collections.namedtuple("Output", ["Y", "engine"])
        outputs = []

        engine_list = ['', 'DNNLOWP']
        for engine in engine_list:
            net = core.Net("test_net")

            mul = core.CreateOperator(
                "Mul",
                ['A', 'B'],
                ['Y'],
                engine=engine,
                device_option=gc,
                broadcast=1,
                dequantize_output=1,
            )
            net.Proto().op.extend([mul])

            self.ws.create_blob('A').feed(A, device_option=gc)
            self.ws.create_blob('B').feed(B, device_option=gc)
            self.ws.run(net)
            outputs.append(Output(
                Y=self.ws.blobs["Y"].fetch(), engine=engine))

        check_quantized_results_close(outputs)

    def test_groupwise_dnnlowp_conv_relu_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            True,  # group-wise
        )

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("ConvRelu", "DNNLOWP"),
            ("ConvRelu", "DNNLOWP_16"),
            ("Int8ConvRelu", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            if "DNNLOWP" in engine:
                quantize = core.CreateOperator(
                    "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([quantize])

                conv = core.CreateOperator(
                    op_type,
                    ["X_q", "W", "b"],
                    ["Y_q"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    order=order,
                    engine=engine,
                    group=group,
                    quantize_groupwise=1,
                    device_option=gc,
                )
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
                net.Proto().op.extend([conv])

                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])
            else:
                conv = core.CreateOperator(
                    op_type,
                    ["X", "W", "b"],
                    ["Y"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    order=order,
                    engine=engine,
                    group=group,
                    device_option=gc,
                )
                net.Proto().op.extend([conv])

                relu = core.CreateOperator(
                    "Relu", ["Y"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([relu])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs)
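
    # Sketch (assumed behavior, hypothetical helper): what quantize_groupwise=1
    # means for the weights. Each group of output channels gets its own scale
    # derived from that group's min/max rather than one tensor-wide scale,
    # which is why static output quantization parameters are required above.
    @staticmethod
    def _groupwise_weight_scales_sketch(W, group):
        # W is (output_channels, ...) with output_channels divisible by group
        return [(Wg.max() - Wg.min()) / 255.0
                for Wg in np.split(W, group, axis=0)]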
    def test_groupwise_dnnlowp_conv_acc16_outlier(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        prepack_weight,
        nbits_in_non_outlier,
        share_col_buffer,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        X_min = -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        if batch_size != 0:
            X[0, 0, 0, 1] = X_max

        W_min = -100
        W_max = W_min + 255
        W = (np.random.rand(output_channels, kernel, kernel,
                            input_channels_per_group) * 4 - 2 + W_min + 128)
        W = np.round(W).astype(np.float32)
        W[..., 1] = W_min + 128  # "zeros"
        for g in range(group):
            W[g * output_channels_per_group, 0, 0, 0] = W_min
            W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
            W[g * output_channels_per_group:
              (g + 1) * output_channels_per_group] += g

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            do_prepack_weight = "DNNLOWP" in engine and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine="DNNLOWP",
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            if do_prepack_weight:
                X_min = 0 if X.size == 0 else X.min()
                X_max = 0 if X.size == 0 else X.max()
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X_min, X_max)
                inputs = ["W"]
                if do_dequantize:
                    inputs += ["b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    nbits_in_non_outlier=nbits_in_non_outlier,
                    engine=engine,
                    group=group,
                    quantize_groupwise=1,
                    in_scale=x_q_param.scale,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed" if do_prepack_weight else "W",
                    "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                nbits_in_non_outlier=nbits_in_non_outlier,
                shared_buffer=(1 if share_col_buffer else 0),
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            if do_dequantize or do_prepack_weight:
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine="DNNLOWP",
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(self, init_net, net, X, W, b, op_type, engine,
                           order, gc, outputs)

        check_quantized_results_close(outputs)
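
    # Sketch of the outlier split that nbits_in_non_outlier controls (assumed
    # decomposition, hypothetical helper): quantized weights that fit in the
    # low-bit range take the fast 16-bit-accumulation path, while the
    # remaining "outliers" are peeled into a sparse part handled with 32-bit
    # accumulation; the two paths sum back to the full convolution.
    @staticmethod
    def _split_outliers_sketch(W_q, nbits_in_non_outlier):
        bound = 1 << (nbits_in_non_outlier - 1)  # assumes nbits >= 1
        dense = np.where((W_q >= -bound) & (W_q < bound), W_q, 0)
        return dense, W_q - dense                # sparse outlier remainder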

    def _test_dnnlowp_nd_int(
        self,
        stride,
        pad,
        kernels,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        ndim = len(kernels)

        X, W, b = generate_convnd_inputs(
            (stride, ) * ndim,
            (pad, ) * ndim,
            kernels,
            (dilation, ) * ndim,
            (size, ) * ndim,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [("Conv", ""), ("Conv", "DNNLOWP_16"),
                          ("Int8Conv", "DNNLOWP")]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC"

            if fall_back_to_NCHW:
                X_nchw = nhwc2nchw(X)
                W_nchw = nhwc2nchw(W)

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            # If the output scale/zero_point aren't set, DNNLOWP computes them
            # from the reference fp32 op, which isn't possible once the
            # weights are pre-quantized. Make sure at least one output has
            # been collected first so it can supply the output scale/zp.
            do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q")
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max())
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                strides=[stride] * ndim,
                kernels=kernels,
                dilations=[dilation] * ndim,
                pads=[pad] * (ndim * 2),
                order="NCHW" if fall_back_to_NCHW else order,
                dequantize_output=not do_dequantize,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X_nchw if fall_back_to_NCHW else X,
                                          device_option=gc)
            self.ws.create_blob("W").feed(W_nchw if fall_back_to_NCHW else W,
                                          device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            if fall_back_to_NCHW:
                Y = nchw2nhwc(Y)
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs)
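
    # Sketch (assumed mechanics, hypothetical helper): what passing the fp32
    # reference output outputs[0][0] to add_quantization_param_args amounts
    # to: deriving a static output scale/zero_point from the observed output
    # range, because with pre-quantized weights the op can no longer rescale
    # its output dynamically per batch.
    @staticmethod
    def _static_output_qparams_sketch(Y_ref):
        Y_min = min(0.0, float(Y_ref.min()))    # quantized range must cover 0
        Y_max = max(0.0, float(Y_ref.max()))
        scale = (Y_max - Y_min) / 255.0 or 1.0  # guard the all-zero case
        return scale, int(np.round(-Y_min / scale))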

    def test_dnnlowp_conv_acc16_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        # X and W have scale 1, so they are exactly representable after
        # quantization. This is ensured by having at least one 0 and one 255
        # in unsigned 8-bit tensors, and at least one -128 and one 127 in
        # signed 8-bit tensors.
        # Since fbgemm_acc16 accumulates into 16 bits, we use small numbers
        # except for those 0, 255, -128, and 127 to avoid overflow in this
        # test. We also make sure 255, -128, and 127 are never multiplied
        # together, by putting them in different input channels and zeroing
        # the corresponding input channel of the other matrix. For example,
        # 255 goes into input channel 1 of X, so the corresponding input
        # channel of W is all zeros.
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (
            np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128  # "zeros"

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (
                "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
            )

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X.min(), X.max(), preserve_activation_sparsity
                )
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
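
    # Sketch of the overflow the comments above guard against (assumed
    # instruction semantics): vpmaddubsw multiplies adjacent uint8/int8 pairs
    # and sums each pair into a saturating int16, so two large neighboring
    # products can clip before they ever reach the 32-bit accumulator.
    @staticmethod
    def _vpmaddubsw_pair_sketch(a0, b0, a1, b1):
        # a* are uint8 activations, b* are int8 weights
        full = int(a0) * int(b0) + int(a1) * int(b1)
        return max(-32768, min(32767, full))  # (255, 127, 255, 127) clips 64770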
    def test_dnnlowp_depthwise_3x3x3_conv(
        self,
        stride,
        size,
        group,
        batch_size,
        prepack_weight,
        fuse_relu,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        pad = 1
        kernel = 3
        dilation = 1
        input_channels_per_group = 1
        output_channels_per_group = 1
        order = "NHWC"

        X, W, b = generate_convnd_inputs(
            (stride, ) * 3,
            (pad, ) * 3,
            (kernel, ) * 3,
            (dilation, ) * 3,
            (size, ) * 3,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op = "ConvRelu" if fuse_relu else "Conv"
        op_engine_list = [(op, ""), (op, "DNNLOWP"), ("Int8" + op, "DNNLOWP")]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            # TODO: no fall back to NCHW
            fall_back_to_NCHW = "DNNLOWP" not in engine

            if fall_back_to_NCHW:
                X_nchw = nhwc2nchw(X)
                W_nchw = nhwc2nchw(W)
            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_prepack_weight:
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X.min(), X.max(), preserve_activation_sparsity)
                inputs = ["W"]
                if do_dequantize:
                    inputs += ["b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X", "W", "b"],
                ["Y_q" if do_dequantize else "Y"],
                strides=[stride] * 3,
                kernels=[kernel] * 3,
                dilations=[dilation] * 3,
                pads=[pad] * (3 * 2),
                order="NCHW" if fall_back_to_NCHW else order,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_prepack_weight:
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X_nchw if fall_back_to_NCHW else X,
                                          device_option=gc)
            self.ws.create_blob("W").feed(W_nchw if fall_back_to_NCHW else W,
                                          device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(init_net)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            if fall_back_to_NCHW:
                Y = nchw2nhwc(Y)
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
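
    # Sketch (assumed layout convention, hypothetical helper): the NCHW
    # fall-back above is a pure axis permutation; channels move between the
    # last and the second position, e.g. (N, T, H, W, C) <-> (N, C, T, H, W)
    # for the 3-d conv in this test.
    @staticmethod
    def _nhwc2nchw_sketch(X):
        return np.transpose(X, (0, X.ndim - 1) + tuple(range(1, X.ndim - 1)))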
    def test_dnnlowp_batch_matmul_int_constant_B(
        self, m, n, k, C_1, C_2, A_quantized, B_quantized, out_quantized, gc, dc
    ):
        batch_dims = tuple(np.random.randint(3, size=max(C_1, C_2)))
        batch_dims_A = batch_dims[-C_1:]
        batch_dims_B = batch_dims[-C_2:]
        A = np.zeros(batch_dims_A + (m, k)).astype(np.float32)
        B = np.zeros(batch_dims_B + (n, k)).astype(np.float32)

        if np.prod(batch_dims) > 0:
            for index in np.ndindex(batch_dims_A):
                # When both input and output are float, each input of the batch has
                # scale 1 but with different offset, so input-wise quantization
                # shouldn't have any input quantization error
                # A_min = -77 if (A_quantized or out_quantized) else -77 + i
                A_min = -77
                A_max = A_min + 255
                A[index] = np.round(np.random.rand(m, k) * 255 + A_min)
                # input channels 0 and 1 are all A_min to avoid overflow from vpmaddubsw
                # when multiplied with B_min and B_max
                A[index][:, 0] = A_min
                A[index][0, 1] = A_max

            i = 0
            for index in np.ndindex(batch_dims_B):
                # When the weight is quantized lazily, each input of the batch
                # has scale 1 but a different offset, so input-wise
                # quantization shouldn't introduce any input quantization
                # error.
                B_min = -100 if B_quantized else -100 + i
                # B_min = -100
                B_max = B_min + 255
                B[index] = np.round(np.random.rand(n, k) * 255 + B_min)
                B[index][0, 0] = B_min
                B[index][1, 0] = B_max

                if C_1 > C_2:
                    # A has more dims
                    for outer_index in np.ndindex(batch_dims_A[: C_1 - C_2]):
                        avoid_vpmaddubsw_overflow_fc(
                            m,
                            k,
                            n,
                            A[outer_index] if C_2 == 0 else A[outer_index + index],
                            A_min,
                            A_max,
                            B[index],
                            B_min,
                            B_max,
                        )
                else:
                    avoid_vpmaddubsw_overflow_fc(
                        m, k, n, A[index[-C_1:]], A_min, A_max, B[index], B_min, B_max
                    )
                i += 1

        for trans_a, trans_b in product([0, 1], [0, 1]):
            Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
            outputs = []

            op_engine_list = [
                ("BatchMatMul", ""),
                ("BatchMatMul", "DNNLOWP"),
                ("Int8BatchMatMul", "DNNLOWP"),
            ]

            for op_type, engine in op_engine_list:
                net = core.Net("test_net")

                do_quantize_A = "DNNLOWP" in engine and A_quantized
                do_quantize_B = "DNNLOWP" in engine and B_quantized
                do_dequantize = "DNNLOWP" in engine and out_quantized

                if do_quantize_A:
                    quantize_A = core.CreateOperator(
                        "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                    )
                    net.Proto().op.extend([quantize_A])

                if do_quantize_B:
                    int8_given_tensor_fill, B_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                        B if trans_b else B.swapaxes(-1, -2), "B_q"
                    )
                    net.Proto().op.extend([int8_given_tensor_fill])

                batch_matmul = core.CreateOperator(
                    op_type,
                    ["A_q" if do_quantize_A else "A", "B_q" if do_quantize_B else "B"],
                    ["Y_q" if do_dequantize else "Y"],
                    trans_a=trans_a,
                    trans_b=trans_b,
                    broadcast=True,
                    constant_B=True,
                    dequantize_output=not do_dequantize,
                    engine=engine,
                    device_option=gc,
                )
                if do_quantize_B:
                    # When quantized weight is provided, we can't rescale the
                    # output dynamically by looking at the range of output of each
                    # batch, so here we provide the range of output observed from
                    # fp32 reference implementation
                    dnnlowp_utils.add_quantization_param_args(
                        batch_matmul, outputs[0][0]
                    )
                net.Proto().op.extend([batch_matmul])

                if do_dequantize:
                    dequantize = core.CreateOperator(
                        "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                    )
                    net.Proto().op.extend([dequantize])

                self.ws.create_blob("A").feed(
                    A.swapaxes(-1, -2) if trans_a else A, device_option=gc
                )
                self.ws.create_blob("B").feed(
                    B if trans_b else B.swapaxes(-1, -2), device_option=gc
                )
                self.ws.run(net)
                outputs.append(
                    Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
                )

            if np.prod(batch_dims) > 0:
                check_quantized_results_close(outputs)
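
    # Sketch (assumed behavior, hypothetical helper): with constant_B=True,
    # the weight-like input B can be quantized once ahead of time from its
    # global range, which is what create_int8_given_tensor_fill prepares in
    # the test above.
    @staticmethod
    def _prequantize_constant_sketch(B):
        scale = (float(B.max()) - float(B.min())) / 255.0 or 1.0
        zero_point = int(np.round(-float(B.min()) / scale))
        return np.clip(np.round(B / scale) + zero_point, 0, 255).astype(np.uint8)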

    def test_dnnlowp_elementwise_mul_int(self, N,
                                         in_quantized, out_quantized, in_place,
                                         gc, dc):
        # FIXME: DNNLOWP Mul doesn't support an in-place operation together
        # with dequantize_output=1
        if in_place[0] or in_place[1]:
            in_quantized = True
            out_quantized = True

        # All inputs have scale 1, so exactly represented after quantization
        min_ = -100
        max_ = min_ + 255
        A = np.round(np.random.rand(N) * (max_ - min_) + min_)
        A = A.astype(np.float32)
        A[0] = min_
        A[1] = max_

        B = np.round(np.random.rand(N) * 255 - 128).astype(np.float32)
        B[0] = -128
        B[1] = 127

        Output = collections.namedtuple("Output", ["Y", "engine"])
        outputs = []

        engine_list = ['', 'DNNLOWP']
        for engine in engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize:
                quantize_A = core.CreateOperator(
                    "Quantize",
                    ['A'], ['A_q'],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize_A])

                quantize_B = core.CreateOperator(
                    "Quantize",
                    ['B'], ['B_q'],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize_B])

            out = 'Y'
            if in_place[0]:
                out = 'A'
            elif in_place[1]:
                out = 'B'

            mul = core.CreateOperator(
                "Mul",
                ['A_q', 'B_q'] if do_quantize else ['A', 'B'],
                [(out + '_q') if do_dequantize else out],
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([mul])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize",
                    [out + '_q'],
                    [out],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob('A').feed(A, device_option=gc)
            self.ws.create_blob('B').feed(B, device_option=gc)
            self.ws.run(net)
            outputs.append(Output(Y=self.ws.blobs[out].fetch(), engine=engine))

        check_quantized_results_close(outputs)
    def test_dnnlowp_spatial_bn_int(
        self,
        size,
        input_channels,
        output_channels,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        gc,
        dc,
    ):
        X_min = -77
        X_max = X_min + 255
        X = np.round(
            np.random.rand(batch_size, size, size, input_channels) *
            (X_max - X_min) + X_min).astype(np.float32)
        X[0, 0, 0, 0] = X_min
        X[0, 0, 0, 1] = X_max

        epsilon = np.abs(np.random.rand())
        scale = np.random.rand(input_channels).astype(np.float32)
        bias = np.random.rand(input_channels).astype(np.float32)
        mean = np.random.rand(input_channels).astype(np.float32)
        var = np.random.rand(input_channels).astype(np.float32)

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("SpatialBN", ""),
            ("SpatialBN", "DNNLOWP"),
            ("Int8SpatialBN", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine)
                net.Proto().op.extend([quantize])

            bn = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X", "scale", "bias", "mean",
                    "var"
                ],
                ["Y_q" if do_dequantize else "Y"],
                is_test=True,
                epsilon=epsilon,
                order=order,
                engine=engine,
                dequantize_output=not do_dequantize,
            )
            net.Proto().op.extend([bn])
            if "DNNLOWP" in engine:
                dnnlowp_utils.add_quantization_param_args(bn, outputs[0][0])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("scale").feed(scale, device_option=gc)
            self.ws.create_blob("bias").feed(bias, device_option=gc)
            self.ws.create_blob("mean").feed(mean, device_option=gc)
            self.ws.create_blob("var").feed(var, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
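
    # Sketch (standard inference-mode batch-norm algebra, hypothetical
    # helper): the SpatialBN above collapses into one per-channel affine map,
    #     Y = scale * (X - mean) / sqrt(var + epsilon) + bias
    #       = alpha * X + beta,
    # which is why it maps cleanly onto a quantized implementation.
    @staticmethod
    def _fold_bn_sketch(scale, bias, mean, var, epsilon):
        alpha = scale / np.sqrt(var + epsilon)
        beta = bias - alpha * mean
        return alpha, beta  # applied along the channel dimension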

    def test_dnnlowp_fully_connected_int(
        self,
        input_channels,
        output_channels,
        batch_size,
        in_quantized,
        out_quantized,
        weight_quantized,
        prepack_weight,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        fuse_relu,
        output_packed_bias,
        gc,
        dc,
    ):
        # X and W have scale 1, so exactly represented after quantization
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.round(
            np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
        )
        X = X.astype(np.float32)
        # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
        # when multiplied with W_min and W_max
        X[:, 0] = X_min
        if batch_size != 0:
            X[0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = np.round(
            np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
        )
        W = W.astype(np.float32)
        W[0, 0] = W_min
        W[1, 0] = W_max

        # Make sure we won't have overflows from vpmaddubsw instruction used in
        # fbgemm
        avoid_vpmaddubsw_overflow_fc(
            batch_size,
            input_channels,
            output_channels,
            X,
            X_min,
            X_max,
            W,
            W_min,
            W_max,
        )

        b = np.random.randn(output_channels).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [("FC", "")]
        if fuse_relu:
            op_engine_list += [("Int8FCRelu", "DNNLOWP")]
        else:
            op_engine_list += [
                ("FC", "DNNLOWP"),
                ("FC", "DNNLOWP_16"),
                ("Int8FC", "DNNLOWP"),
            ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (
                engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
            )
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X_min, X_max, preserve_activation_sparsity
            )
            w_q_param = None
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8FCPackWeight",
                    inputs,
                    ["W_packed", "B_q32"] if do_dequantize and output_packed_bias else ["W_packed"],
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            fc = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed"
                    if do_prepack_weight
                    else ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                dequantize_output=not do_dequantize,
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    fc, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([fc])
            if fuse_relu and "DNNLOWP" not in engine:
                net.Relu(["Y"], "Y")

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                )
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(
                self, init_net, net, X, W, b, op_type, engine, None, gc, outputs
            )

            if output_packed_bias and do_prepack_weight and do_dequantize:
                bias_int32 = self.ws.blobs["B_q32"].fetch()
                if do_quantize_weight:
                    np.testing.assert_equal(bias_int32[0], np.round(b / (x_q_param.scale * w_q_param.scale)))
                np.testing.assert_equal(bias_int32[0].dtype, np.int32)

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
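
    # Sketch of the B_q32 check above (assumed convention): the packed bias
    # is stored as int32 with scale x_scale * w_scale and zero offset,
    # matching the int32 accumulator the FC produces before requantization.
    @staticmethod
    def _quantize_bias_sketch(b, x_scale, w_scale):
        return np.round(b / (x_scale * w_scale)).astype(np.int32)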
    def test_dnnlowp_batch_matmul_int(self, m, n, k, batch_size, gc, dc):
        # A and B have scale 1, so exactly represented after quantization
        A_min = -77
        A_max = A_min + 255
        A = np.round(np.random.rand(batch_size, m, k) * 255 + A_min)
        A = A.astype(np.float32)
        # input channels 0 and 1 are all A_min to avoid overflow from vpmaddubsw
        # when multiplied with B_min and B_max
        A[0, :, 0] = A_min
        A[0, 0, 1] = A_max

        B_min = -100
        B_max = B_min + 255
        B = np.round(np.random.rand(batch_size, n, k) * 255 + B_min)
        B = B.astype(np.float32)
        B[0, 0, 0] = B_min
        B[0, 1, 0] = B_max

        for i in range(batch_size):
            avoid_vpmaddubsw_overflow_fc(
                m, k, n, A[i], A_min, A_max, B[i], B_min, B_max
            )

        for trans_a, trans_b in product([0, 1], [0, 1]):
            Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
            outputs = []

            op_engine_list = [
                ("BatchMatMul", ""),
                ("BatchMatMul", "DNNLOWP"),
                ("BatchMatMul", "DNNLOWP_16"),
                ("Int8BatchMatMul", "DNNLOWP"),
            ]

            for op_type, engine in op_engine_list:
                net = core.Net("test_net")

                if "DNNLOWP" in engine:
                    quantize_A = core.CreateOperator(
                        "Quantize", ["A"], ["A_q"], engine=engine, device_option=gc
                    )
                    net.Proto().op.extend([quantize_A])

                    quantize_B = core.CreateOperator(
                        "Quantize", ["B"], ["B_q"], engine=engine, device_option=gc
                    )
                    net.Proto().op.extend([quantize_B])

                batch_matmul = core.CreateOperator(
                    op_type,
                    [
                        "A_q" if "DNNLOWP" in engine else "A",
                        "B_q" if "DNNLOWP" in engine else "B",
                    ],
                    ["Y_q" if "DNNLOWP" in engine else "Y"],
                    trans_a=trans_a,
                    trans_b=trans_b,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([batch_matmul])

                if "DNNLOWP" in engine:
                    dequantize = core.CreateOperator(
                        "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
                    )
                    net.Proto().op.extend([dequantize])

                self.ws.create_blob("A").feed(
                    np.transpose(A, (0, 2, 1)) if trans_a else A, device_option=gc
                )
                self.ws.create_blob("B").feed(
                    B if trans_b else np.transpose(B, (0, 2, 1)), device_option=gc
                )
                self.ws.run(net)
                outputs.append(
                    Output(Y=self.ws.blobs["Y"].fetch(), op_type=op_type, engine=engine)
                )

            check_quantized_results_close(outputs)
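
# Hedged sketch of the fp32 reference semantics checked above: with the
# untransposed layouts A: (batch, m, k) and B: (batch, n, k), every engine
# should compute Y[i] = A[i] @ B[i].T regardless of trans_a / trans_b. The
# toy shapes below are illustrative only.
A_toy = np.random.rand(2, 3, 4).astype(np.float32)
B_toy = np.random.rand(2, 5, 4).astype(np.float32)
Y_ref = np.matmul(A_toy, np.transpose(B_toy, (0, 2, 1)))  # shape (2, 3, 5)
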
Example #19
    def test_dnnlowp_depthwise_3x3_conv(
        self,
        stride,
        size,
        group,
        batch_size,
        prepack_weight,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        relu,
        gc,
        dc,
    ):
        pad = 1
        kernel = 3
        dilation = 1
        input_channels_per_group = 1
        output_channels_per_group = 1
        order = "NHWC"

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        if relu:
            op_engine_list = [
                ("Conv", ""),
                ("ConvRelu", "DNNLOWP"),
                ("Int8ConvRelu", "DNNLOWP"),
            ]
        else:
            op_engine_list = [
                ("Conv", ""),
                ("Conv", "DNNLOWP"),
                ("Int8Conv", "DNNLOWP"),
            ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_prepack_weight:
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X.min(), X.max(), preserve_activation_sparsity)
                inputs = ["W"]
                if do_dequantize:
                    inputs += ["b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X", "W", "b"],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_prepack_weight:
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])
            elif relu:
                relu_op = core.CreateOperator("Relu", ["Y"], ["Y"],
                                              engine=engine,
                                              device_option=gc)
                net.Proto().op.extend([relu_op])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(init_net)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
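
# A tiny NHWC depthwise-conv reference for the stride-1 case, to make explicit
# what the grouped Conv above computes when every group holds one input and
# one output channel. Shapes are assumptions: X (N, H, W, C), W (C, 3, 3, 1),
# b (C,); this is a sketch, not the operator's implementation.
def depthwise3x3_nhwc_stride1(X, W, b):
    N, H, Wid, C = X.shape
    Xp = np.pad(X, ((0, 0), (1, 1), (1, 1), (0, 0)))  # pad = 1
    Y = np.zeros_like(X)
    for di in range(3):
        for dj in range(3):
            # each output channel sees only its own input channel
            Y += Xp[:, di:di + H, dj:dj + Wid, :] * W[:, di, dj, 0]
    return Y + b
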
    def test_rowwise_dnnlowp_fully_connected_int(
        self,
        input_channels,
        output_channels,
        batch_size,
        in_quantized,
        out_quantized,
        gc,
        dc,
    ):
        print("@given M ", batch_size, " K ", input_channels, " N ",
              output_channels)
        print("@given in_quantized ", in_quantized, " out_quantized ",
              out_quantized)

        # X has scale 1, so exactly represented after quantization
        X_min = -77
        X_max = X_min + 255
        X = np.round(
            np.random.rand(batch_size, input_channels) * (X_max - X_min) +
            X_min)
        X = X.astype(np.float32)
        # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
        # when multiplied with W_min and W_max
        X[:, 0:2] = X_min
        X[0, 2] = X_max

        # Each row of W has scale 1 but with different offset, so row-wise
        # quantization shouldn't have any input quantization error.
        W = np.zeros((output_channels, input_channels))
        W = W.astype(np.float32)
        for i in range(output_channels):
            W_min = -100 + i
            W_max = W_min + 255
            W[i, :] = np.round(
                np.random.rand(input_channels) * (W_max - W_min) + W_min)
            W[i, 0] = W_min
            W[i, 1] = W_max

            # Make sure we won't have overflows from vpmaddubsw instruction used in
            # fbgemm
            avoid_vpmaddubsw_overflow_fc(
                batch_size,
                input_channels,
                1,
                X,
                X_min,
                X_max,
                W[i:i + 1, ],
                W_min,
                W_max,
            )

        b = np.random.randn(output_channels).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("FC", ""),
            ("FC", "DNNLOWP_ROWWISE"),
            ("FC", "DNNLOWP_ROWWISE_16"),
            ("Int8FC", "DNNLOWP_ROWWISE"),
            ("Int8FCRowWise", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            fc = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X", "W", "b"],
                ["Y_q" if do_dequantize else "Y"],
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([fc])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
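
# What "row-wise" means in the DNNLOWP_ROWWISE engines above: each output
# channel (row of W) gets its own quantization parameters, which is why the
# per-row offsets added to W_min cost no extra quantization error. The min/max
# rule below is an assumed stand-in for the real chooser.
def rowwise_qparams(W):
    row_min, row_max = W.min(axis=1), W.max(axis=1)
    scales = np.maximum((row_max - row_min) / 255.0, 1e-8)
    zero_points = np.round(-row_min / scales).astype(np.int32)
    return scales, zero_points
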
    def test_dnnlowp_conv_acc16_outlier(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        prepack_weight,
        nbits_in_non_outlier,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        if nbits_in_non_outlier == 0:
            X, W, b = generate_conv_inputs(
                stride,
                pad,
                kernel,
                dilation,
                size,
                group,
                input_channels_per_group,
                output_channels_per_group,
                batch_size,
                order,
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
            )
        else:
            X_min = 0 if preserve_activation_sparsity else -77
            X_max = X_min + 255
            X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
            X = np.round(X).astype(np.float32)
            X[..., 0] = X_min
            X[0, 0, 0, 1] = X_max

            if preserve_weight_sparsity:
                W_min = -128
                W_max = 100
            else:
                W_min = -100
                W_max = W_min + 255
            W = (
                np.random.rand(
                    output_channels, kernel, kernel, input_channels_per_group
                )
                * 4
                - 2
                + W_min
                + 128
            )
            W = np.round(W).astype(np.float32)
            W[0, 0, 0, 0] = W_min
            W[1, 0, 0, 0] = W_max
            W[..., 1] = W_min + 128

            # No input quantization error in bias
            b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = "DNNLOWP" in engine and weight_quantized
            do_prepack_weight = "DNNLOWP" in engine and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity
            )
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    nbits_in_non_outlier=nbits_in_non_outlier,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed"
                    if do_prepack_weight
                    else ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                nbits_in_non_outlier=nbits_in_non_outlier,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(init_net)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
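
# Hedged sketch of the outlier handling that nbits_in_non_outlier selects:
# weight codes that fit in the narrow nbits range stay on the fast 16-bit
# accumulation path, and the residual for the rare large weights is kept
# sparse and handled with 32-bit accumulation. The arithmetic below is
# illustrative only, not fbgemm's implementation.
def split_outliers(W_q, nbits_in_non_outlier):
    assert nbits_in_non_outlier > 0  # 0 means everything is an outlier
    limit = 1 << (nbits_in_non_outlier - 1)  # e.g. 4 bits -> codes in [-8, 7]
    dense = np.clip(W_q, -limit, limit - 1)
    sparse = W_q - dense                     # nonzero only at outlier positions
    return dense, sparse
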
Example #22
    def test_dnnlowp_group_norm(self, N, G, K, H, W, order, in_quantized,
                                out_quantized, weight_quantized, gc, dc):
        C = G * K

        X = np.random.rand(N, C, H, W).astype(np.float32) * 5.0 - 1.0
        if order == "NHWC":
            X = np.transpose(X, [0, 2, 3, 1])
        gamma = np.random.rand(C).astype(np.float32) * 2.0 - 1.0
        beta = np.random.randn(C).astype(np.float32) - 0.5

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [("GroupNorm", ""), ("GroupNorm", "DNNLOWP"),
                          ("Int8GroupNorm", "DNNLOWP")]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (engine == "DNNLOWP" and weight_quantized
                                  and len(outputs) > 0)

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, gamma_q_param = (
                    dnnlowp_utils.create_int8_given_tensor_fill(
                        gamma, "gamma_q"))
                net.Proto().op.extend([int8_given_tensor_fill])

                X_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max())
                int8_bias_tensor_fill = (
                    dnnlowp_utils.create_int8_bias_tensor_fill(
                        beta, "beta_q", X_q_param, gamma_q_param))
                net.Proto().op.extend([int8_bias_tensor_fill])

            group_norm = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "gamma_q" if do_quantize_weight else "gamma",
                    "beta_q" if do_quantize_weight else "beta"
                ],
                ["Y_q" if do_dequantize else "Y"],
                dequantize_output=0 if do_dequantize else 1,
                group=G,
                order=order,
                is_test=True,
                engine=engine,
                device_option=gc,
            )

            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    group_norm, outputs[0][0])

            net.Proto().op.extend([group_norm])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize",
                    ["Y_q"],
                    ["Y"],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("gamma").feed(gamma, device_option=gc)
            self.ws.create_blob("beta").feed(beta, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs, atol_scale=2.0)
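
# A small NCHW GroupNorm reference matching the fp32 baseline above
# (C = G * K; the epsilon value is an assumption).
def group_norm_nchw(X, gamma, beta, G, eps=1e-5):
    N, C, H, W = X.shape
    Xg = X.reshape(N, G, C // G, H, W)
    mu = Xg.mean(axis=(2, 3, 4), keepdims=True)
    var = Xg.var(axis=(2, 3, 4), keepdims=True)
    Xn = ((Xg - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)
    return Xn * gamma.reshape(1, C, 1, 1) + beta.reshape(1, C, 1, 1)
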
Example #23
    def test_dnnlowp_fully_connected_int(
        self,
        input_channels,
        output_channels,
        batch_size,
        in_quantized,
        out_quantized,
        weight_quantized,
        gc,
        dc,
    ):
        # X and W have scale 1, so exactly represented after quantization
        X_min = -77
        X_max = X_min + 255
        X = np.round(
            np.random.rand(batch_size, input_channels) * (X_max - X_min) +
            X_min)
        X = X.astype(np.float32)
        # input channel 0 is all X_min and one entry is X_max to avoid overflow
        # from vpmaddubsw when multiplied with W_min and W_max
        X[:, 0] = X_min
        X[0, 1] = X_max

        W_min = -100
        W_max = W_min + 255
        W = np.round(
            np.random.rand(output_channels, input_channels) * (W_max - W_min) +
            W_min)
        W = W.astype(np.float32)
        W[0, 0] = W_min
        W[1, 0] = W_max

        # Make sure we won't have overflows from vpmaddubsw instruction used in
        # fbgemm
        avoid_vpmaddubsw_overflow_fc(
            batch_size,
            input_channels,
            output_channels,
            X,
            X_min,
            X_max,
            W,
            W_min,
            W_max,
        )
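
        # Why the call above matters, in code: vpmaddubsw multiplies uint8
        # activation codes by int8 weight codes and sums *adjacent pairs* into
        # a saturating int16, so every pair sum must stay within
        # [-32768, 32767]. Scale-1 codes are assumed, matching the setup above.
        if batch_size > 0 and input_channels % 2 == 0:
            X_codes = (X - X_min).astype(np.int32)        # uint8 range [0, 255]
            W_codes = (W - W_min - 128).astype(np.int32)  # int8 range [-128, 127]
            pair_sums = (
                X_codes[:, None, 0::2] * W_codes[None, :, 0::2]
                + X_codes[:, None, 1::2] * W_codes[None, :, 1::2]
            )
            assert pair_sums.max() <= 32767 and pair_sums.min() >= -32768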

        b = np.random.randn(output_channels).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("FC", ""),
            ("FC", "DNNLOWP"),
            ("FC", "DNNLOWP_16"),
            ("Int8FC", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (engine == "DNNLOWP" and weight_quantized
                                  and len(outputs) > 0)

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q")
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max())
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            fc = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
            )
            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0])
            net.Proto().op.extend([fc])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
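
# Hedged sketch of the integer FC that the DNNLOWP engines reduce to once all
# three operands are quantized: accumulate in int32 with the bias folded in as
# b_q = round(b / (x_scale * w_scale)), then rescale. The signature below is
# illustrative, not the operator's API.
def int8_fc_reference(X_q, x_zp, x_scale, W_q, w_zp, w_scale, b):
    b_q = np.round(b / (x_scale * w_scale)).astype(np.int32)
    acc = (X_q.astype(np.int32) - x_zp) @ (W_q.astype(np.int32) - w_zp).T + b_q
    return acc * (x_scale * w_scale)  # fp32; requantize with output params for Y_q
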
Example #24
    def test_groupwise_dnnlowp_conv_acc16_outlier(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        nbits_in_non_outlier,
        share_col_buffer,
        gc,
        dc,
    ):
        if group > 1:
            dilation = 1
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        if nbits_in_non_outlier == 0:
            X, W, b = generate_conv_inputs(
                stride,
                pad,
                kernel,
                dilation,
                size,
                group,
                input_channels_per_group,
                output_channels_per_group,
                batch_size,
                order,
                True,  # group-wise
            )
        else:
            X_min = -77
            X_max = X_min + 255
            X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
            X = np.round(X).astype(np.float32)
            X[..., 0] = X_min
            X[0, 0, 0, 1] = X_max

            W_min = -100
            W_max = W_min + 255
            W = (
                np.random.rand(
                    output_channels, kernel, kernel, input_channels_per_group
                )
                * 4
                - 2
                + W_min
                + 128
            )
            W = np.round(W).astype(np.float32)
            W[..., 1] = W_min + 128  # "zeros"
            for g in range(group):
                W[g * output_channels_per_group, 0, 0, 0] = W_min
                W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
                W[
                    g * output_channels_per_group : (g + 1) * output_channels_per_group,
                ] += g

            if order == "NCHW":
                X = nhwc2nchw(X)
                W = nhwc2nchw(W)

            # No input quantization error in bias
            b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize", ["X"], ["X_q"], engine="DNNLOWP", device_option=gc
                )
                net.Proto().op.extend([quantize])

            conv = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X", "W", "b"],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                nbits_in_non_outlier=nbits_in_non_outlier,
                shared_buffer=(1 if share_col_buffer else 0),
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            if do_dequantize:
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs)
    def test_dnnlowp_conv_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Conv", "DNNLOWP_16"),
            ("Int8Conv", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            # If the output scale/zero_point aren't set, DNNLOWP computes them
            # from the reference fp32 op, which isn't possible once the weights
            # are quantized. Make sure at least one fp32 output has been
            # collected first so the output scale/zero_point can be derived.
            do_quantize_weight = (engine == "DNNLOWP" and weight_quantized
                                  and len(outputs) > 0)

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity)
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max())
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
Example #26
    def test_dnnlowp_average_pool(
        self,
        ndim,
        stride,
        pad,
        kernel,
        size,
        input_channels,
        batch_size,
        order,
        in_quantized,
        gc,
        dc,
    ):
        kernel = 2  # Only kernel size 2 is supported
        assume(kernel <= size)
        assume(pad < kernel)

        C = input_channels
        N = batch_size

        strides = (stride, ) * ndim
        pads = (pad, ) * (ndim * 2)
        kernels = (kernel, ) * ndim
        sizes = (size, ) * ndim

        # X has scale 1, so no input quantization error
        min_ = -100
        max_ = min_ + 255
        if order == "NCHW":
            X = np.round(
                np.random.rand(*((N, C) + sizes)) * (max_ - min_) + min_)
            X = X.astype(np.float32)
            X[(0, ) * (ndim + 2)] = min_
            X[(0, ) * (ndim + 1) + (1, )] = max_
        elif order == "NHWC":
            X = np.round(
                np.random.rand(*((N, ) + sizes + (C, ))) * (max_ - min_) +
                min_)
            X = X.astype(np.float32)
            X[(0, ) * (ndim + 2)] = min_
            X[(0, 1) + (0, ) * ndim] = max_

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("AveragePool", ""),
            ("AveragePool", "DNNLOWP"),
            ("Int8AveragePool", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            avg_pool = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X"],
                ["Y_q" if engine == "DNNLOWP" else "Y"],
                strides=strides,
                kernels=kernels,
                pads=pads,
                order=order,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([avg_pool])

            if engine == "DNNLOWP":
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
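
# A hedged note on the Int8AveragePool path above: the mean of uint8 codes is
# generally not itself a code, so the op accumulates the window sum and then
# re-quantizes. A toy 2x2 window (values are assumptions):
codes = np.array([[10, 13], [12, 11]], dtype=np.int32)
avg_code = int(np.round(codes.sum() / codes.size))  # 11.5 rounds to 12
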
Example #27
    def test_groupwise_dnnlowp_conv_acc16_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        # X and W have scale 1, so they are exactly represented after
        # quantization. We ensure this by including at least one 0 and one 255
        # in the unsigned 8-bit tensors, and at least one -128 and one 127 in
        # the signed 8-bit tensors.
        # fbgemm_acc16 accumulates into 16 bits, so to avoid overflow this test
        # uses small numbers everywhere except those 0, 255, -128, and 127.
        # We also make sure 255, -128, and 127 are never multiplied together by
        # putting them in different input channels and zeroing the
        # corresponding input channel of the other matrix.
        # For example, 255 goes in input channel 1 of X, so the corresponding
        # input channel of W is all zeros.
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        if batch_size != 0:
            X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (np.random.rand(output_channels, kernel, kernel,
                            input_channels_per_group) * 4 - 2 + W_min + 128)
        W = np.round(W).astype(np.float32)
        W[..., 1] = W_min + 128  # "zeros"
        for g in range(group):
            W[g * output_channels_per_group, 0, 0, 0] = W_min
            W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
            if not preserve_weight_sparsity:
                W[g * output_channels_per_group:(g + 1) *
                  output_channels_per_group, ] += g

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            conv = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X", "W", "b"],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            if do_dequantize:
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine="DNNLOWP",
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order,
                           gc, outputs)

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
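
# Group-wise quantization (quantize_groupwise=1 above) sits between the
# tensor-wide and row-wise schemes: each group of output channels shares one
# set of weight qparams. A minimal sketch under an assumed min/max rule, with
# W laid out as (output_channels, kh, kw, input_channels_per_group):
def groupwise_weight_scales(W, group):
    per_group = np.split(W, group, axis=0)  # output_channels divisible by group
    return [(g.max() - g.min()) / 255.0 for g in per_group]
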
Example #28
    def test_dnnlowp_max_pool(
        self,
        stride,
        pad,
        kernel,
        size,
        input_channels,
        batch_size,
        order,
        in_quantized,
        gc,
        dc,
    ):
        assume(kernel <= size)
        assume(pad < kernel)

        C = input_channels
        N = batch_size
        H = W = size

        min_ = -10
        max_ = 20
        if order == "NCHW":
            X = np.round(np.random.rand(N, C, H, W) * (max_ - min_) + min_)
        elif order == "NHWC":
            X = np.round(np.random.rand(N, H, W, C) * (max_ - min_) + min_)
        X = X.astype(np.float32)
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("MaxPool", ""),
            ("MaxPool", "DNNLOWP"),
            ("Int8MaxPool", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            max_pool = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X"],
                ["Y_q" if engine == "DNNLOWP" else "Y"],
                stride=stride,
                kernel=kernel,
                pad=pad,
                order=order,
                engine=engine,
                device_option=gc,
            )
            net.Proto().op.extend([max_pool])

            if engine == "DNNLOWP":
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.run(net)
            outputs.append(
                Output(Y=self.ws.blobs["Y"].fetch(),
                       op_type=op_type,
                       engine=engine))

        # Y_i = max(X_j) so the only error is in quantization of inputs
        check_quantized_results_close(outputs, ref=X)
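
# The closing comment above, in code form: q(x) = clip(round(x / s) + z, 0, 255)
# is monotone, so max pooling commutes with quantization and contributes no
# error of its own. The toy scale/zero_point values are assumptions.
s, z = 0.1, 5
q = lambda v: np.clip(np.round(v / s) + z, 0, 255)
x = np.array([-1.0, 2.5, 0.3], dtype=np.float32)
assert q(x.max()) == q(x).max()
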
    def test_groupwise_dnnlowp_conv_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        prepack_weight,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume((not prepack_weight) or order == "NHWC")

        X, W, b = generate_conv_inputs(
            stride,
            pad,
            kernel,
            dilation,
            size,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
            groupwise_quantization=True,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
        )

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Conv", "DNNLOWP_16"),
            ("Int8Conv", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_prepack_weight:
                x_q_param = hardcode_scale_zp.choose_quantization_params(
                    X.min(), X.max()
                )
                inputs = ["W"]
                if do_dequantize:
                    inputs += ["b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    quantize_groupwise=1,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed" if do_prepack_weight else "W",
                    "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            if do_dequantize or do_prepack_weight:
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize",
                    ["Y_q"],
                    ["Y"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(init_net)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
    def test_dnnlowp_concat_int(self, dim1, dim2, in_quantized, out_quantized,
                                gc, dc):

        # X has scale 1, so exactly represented after quantization
        min_ = -100
        max_ = min_ + 255
        X = np.round(np.random.rand(dim1, dim2) * (max_ - min_) + min_)
        X = X.astype(np.float32)
        X[0, 0] = min_
        X[0, 1] = max_

        # Y has scale 1/2, so exactly represented after quantization
        Y = np.round(np.random.rand(dim1, dim2) * 255 / 2 - 64)
        Y = Y.astype(np.float32)
        Y[0, 0] = -64
        Y[0, 1] = 127. / 2

        Output = collections.namedtuple("Output", ["Z", "op_type", "engine"])
        outputs = []

        op_engine_list = [
            ("Concat", ""),
            ("Concat", "DNNLOWP"),
            ("Int8Concat", "DNNLOWP"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized

            if do_quantize:
                quantize_x = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    engine=engine,
                    device_option=gc,
                )
                quantize_y = core.CreateOperator(
                    "Quantize",
                    ["Y"],
                    ["Y_q"],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([quantize_x, quantize_y])

            concat = core.CreateOperator(
                op_type,
                ["X_q", "Y_q"] if do_quantize else ["X", "Y"],
                ["Z_q" if do_dequantize else "Z", "split"],
                dequantize_output=not do_dequantize,
                engine=engine,
                device_option=gc,
                axis=0,
            )
            net.Proto().op.extend([concat])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize",
                    ["Z_q"],
                    ["Z"],
                    engine=engine,
                    device_option=gc,
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("Y").feed(Y, device_option=gc)
            self.ws.create_blob("split")
            self.ws.run(net)
            outputs.append(
                Output(Z=self.ws.blobs["Z"].fetch(),
                       op_type=op_type,
                       engine=engine))

        check_quantized_results_close(outputs)
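
# Hedged sketch of the requantization Int8Concat must perform: X and Y above
# deliberately carry different scales (1 and 1/2), so their codes have to be
# mapped onto one shared output (scale, zero_point) before concatenation.
# The helper below is illustrative, not the operator's implementation.
def requantize(q, in_scale, in_zp, out_scale, out_zp):
    real = (q.astype(np.float32) - in_zp) * in_scale
    return np.clip(np.round(real / out_scale) + out_zp, 0, 255)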