Example #1
 def test_convolution_separate_stride_pad_layout(
     self,
     op_type,
     stride_h,
     stride_w,
     pad_t,
     pad_l,
     pad_b,
     pad_r,
     kernel,
     size,
     input_channels,
     output_channels,
     batch_size,
     engine,
     use_bias,
     gc,
     dc,
 ):
     X = (np.random.rand(batch_size, size, size, input_channels).astype(
         np.float32) - 0.5)
     w = (np.random.rand(output_channels, kernel, kernel,
                         input_channels).astype(np.float32) - 0.5)
     b = np.random.rand(output_channels).astype(np.float32) - 0.5
     outputs = {}
     for order in ["NCHW", "NHWC"]:
         op = core.CreateOperator(
             op_type,
             ["X", "w", "b"] if use_bias else ["X", "w"],
             ["Y"],
             stride_h=stride_h,
             stride_w=stride_w,
             kernel=kernel,
             pad_t=pad_t,
             pad_l=pad_l,
             pad_b=pad_b,
             pad_r=pad_r,
             order=order,
             engine=engine,
             device_option=gc,
         )
         if order == "NCHW":
             X_f = utils.NHWC2NCHW(X)
             w_f = utils.NHWC2NCHW(w)
         else:
             X_f = X
             w_f = w
         self.ws.create_blob("X").feed(X_f, device_option=gc)
         self.ws.create_blob("w").feed(w_f, device_option=gc)
         self.ws.create_blob("b").feed(b, device_option=gc)
         self.ws.run(op)
         outputs[order] = self.ws.blobs["Y"].fetch()
     np.testing.assert_allclose(outputs["NCHW"],
                                utils.NHWC2NCHW(outputs["NHWC"]),
                                atol=1e-4,
                                rtol=1e-4)
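Every example in this section converts activations and weights between NHWC and NCHW with utils.NHWC2NCHW / utils.NCHW2NHWC. As a point of reference, here is a minimal NumPy sketch of what such a 4-D layout swap amounts to (an illustrative assumption; the Caffe2 helpers also handle other tensor ranks):

import numpy as np

def nhwc2nchw(x):
    # Move the channel axis from the last position to position 1 (4-D input assumed).
    return np.transpose(x, (0, 3, 1, 2))

def nchw2nhwc(x):
    # Inverse permutation: move the channel axis back to the end.
    return np.transpose(x, (0, 2, 3, 1))

x = np.random.rand(2, 5, 5, 3).astype(np.float32)  # NHWC
assert nhwc2nchw(x).shape == (2, 3, 5, 5)
assert np.array_equal(nchw2nhwc(nhwc2nchw(x)), x)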
Example #2
    def test_conv_separate_stride_pad_gradients(self, stride_h, stride_w,
                                                pad_h, pad_w, kernel, size,
                                                input_channels, output_channels,
                                                batch_size, order, engine,
                                                shared_buffer, use_bias,
                                                deformable_group, gc, dc):
        op = core.CreateOperator(
            "DeformConv",
            ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"],
            ["Y"],
            stride_h=stride_h,
            stride_w=stride_w,
            pad_t=pad_h,
            pad_l=pad_w,
            pad_b=pad_h,
            pad_r=pad_w,
            kernel=kernel,
            order=order,
            engine=engine,
            shared_buffer=int(shared_buffer),
            deformable_group=deformable_group,
        )
        X = np.random.rand(
            batch_size, size, size, input_channels).astype(np.float32) - 0.5
        output_size = _conv_2d_output_size(size, kernel, pad_h, pad_w, 1,
                                           stride_h, stride_w)
        o = _conv_2d_random_offsets(batch_size, kernel, output_size,
                                    deformable_group)
        w = np.random.rand(
            output_channels, kernel, kernel, input_channels).astype(np.float32)\
            - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, o, w, b] if use_bias else [X, o, w]

        # Error handling path.
        if size + pad_h < kernel or size + pad_w < kernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return
        if input_channels % deformable_group != 0:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return
        if output_channels % deformable_group != 0:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
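The DeformConv gradient tests above and below rely on _conv_2d_output_size and _conv_2d_random_offsets helpers that are not shown in this section. A rough sketch of the output-size computation, assuming the helper follows the standard convolution arithmetic (a hypothetical reimplementation, not the exact Caffe2 code):

def conv_2d_output_size(size, kernel, pad_h, pad_w, dilation, stride_h, stride_w):
    # Effective extent of a dilated kernel.
    dkernel = dilation * (kernel - 1) + 1
    output_h = (size + 2 * pad_h - dkernel) // stride_h + 1
    output_w = (size + 2 * pad_w - dkernel) // stride_w + 1
    return (output_h, output_w)

# e.g. a 5x5 input, 3x3 kernel, stride 1, no padding, no dilation -> 3x3 output
assert conv_2d_output_size(5, 3, 0, 0, 1, 1, 1) == (3, 3)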
Example #3
    def test_conv_gradients(self, stride, pad, kernel, dilation, size,
                            input_channels, output_channels, batch_size, order,
                            engine, use_bias, deformable_group, gc, dc):
        dkernel = dilation * (kernel - 1) + 1

        if gc.device_type == caffe2_pb2.CUDA and engine == 'CUDNN':
            assume(_cudnn_supports(dilation=(dilation > 1),
                                   nhwc=(order == 'NHWC')))

        assume(engine != "MKLDNN" or use_bias is True)

        op = core.CreateOperator(
            "DeformConv",
            ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"],
            ["Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            order=order,
            engine=engine,
            deformable_group=deformable_group,
        )
        X = np.random.rand(
            batch_size, size, size, input_channels).astype(np.float32) - 0.5
        output_size = _conv_2d_output_size(size, kernel, pad, pad,
                                           dilation, stride, stride)
        o = _conv_2d_random_offsets(batch_size, kernel, output_size, deformable_group)
        w = np.random.rand(
            output_channels, kernel, kernel, input_channels).astype(np.float32)\
            - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, o, w, b] if use_bias else [X, o, w]
        # Error handling path.
        if size + pad + pad < dkernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return
        if input_channels % deformable_group != 0:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return
        if output_channels % deformable_group != 0:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
Example #4
 def test_convolution_transpose_separate_stride_pad_adj_layout(
         self, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r, kernel,
         adj_h, adj_w, size, input_channels, output_channels, batch_size,
         engine, use_bias, gc, dc):
     assume(adj_h < stride_h)
     assume(adj_w < stride_w)
     X = np.random.rand(batch_size, size, size, input_channels).astype(
         np.float32) - 0.5
     w = np.random.rand(
         input_channels, kernel, kernel, output_channels)\
         .astype(np.float32) - 0.5
     b = np.random.rand(output_channels).astype(np.float32) - 0.5
     outputs = {}
     for order in ["NCHW", "NHWC"]:
         op = core.CreateOperator(
             "ConvTranspose",
             ["X", "w", "b"] if use_bias else ["X", "w"],
             ["Y"],
             stride_h=stride_h,
             stride_w=stride_w,
             kernel=kernel,
             pad_t=pad_t,
             pad_l=pad_l,
             pad_b=pad_b,
             pad_r=pad_r,
             adj_h=adj_h,
             adj_w=adj_w,
             order=order,
             engine=engine,
             device_option=gc,
         )
         if order == "NCHW":
             X_f = utils.NHWC2NCHW(X)
             w_f = utils.NHWC2NCHW(w)
         else:
             X_f = X
             w_f = w
         self.assertDeviceChecks(dc, op,
                                 [X_f, w_f, b] if use_bias else [X_f, w_f],
                                 [0])
         self.ws.create_blob("X").feed(X_f, device_option=gc)
         self.ws.create_blob("w").feed(w_f, device_option=gc)
         self.ws.create_blob("b").feed(b, device_option=gc)
         self.ws.run(op)
         outputs[order] = self.ws.blobs["Y"].fetch()
     output_h = (size - 1) * stride_h + kernel + adj_h - pad_t - pad_b
     output_w = (size - 1) * stride_w + kernel + adj_w - pad_l - pad_r
     self.assertEqual(outputs["NCHW"].shape,
                      (batch_size, output_channels, output_h, output_w))
     np.testing.assert_allclose(outputs["NCHW"],
                                utils.NHWC2NCHW(outputs["NHWC"]),
                                atol=1e-4,
                                rtol=1e-4)
Example #5
 def test_convolution_transpose_layout(self, stride, pad, kernel, adj, size,
                                       input_channels, output_channels,
                                       batch_size, engine, shared_buffer,
                                       use_bias, gc, dc):
     assume(adj < stride)
     X = np.random.rand(batch_size, size, size, input_channels).astype(
         np.float32) - 0.5
     w = np.random.rand(
         input_channels, kernel, kernel, output_channels)\
         .astype(np.float32) - 0.5
     b = np.random.rand(output_channels).astype(np.float32) - 0.5
     outputs = {}
     for order in ["NCHW", "NHWC"]:
         if hiputl.run_in_hip(gc, dc) and order == "NHWC":
              # MIOpen doesn't work with NHWC, so fall back to the
              # default HIP implementation.
             tmp_engine = ""
         else:
             tmp_engine = engine
         op = core.CreateOperator(
             "ConvTranspose",
             ["X", "w", "b"] if use_bias else ["X", "w"],
             ["Y"],
             strides=[stride] * 2,
             kernels=[kernel] * 2,
             pads=[pad] * 4,
             adjs=[adj] * 2,
             order=order,
             engine=tmp_engine,
             shared_buffer=int(shared_buffer),
             device_option=gc,
         )
         if order == "NCHW":
             X_f = utils.NHWC2NCHW(X)
             w_f = utils.NHWC2NCHW(w)
         else:
             X_f = X
             w_f = w
         self.assertDeviceChecks(dc, op,
                                 [X_f, w_f, b] if use_bias else [X_f, w_f],
                                 [0])
         self.ws.create_blob("X").feed(X_f, device_option=gc)
         self.ws.create_blob("w").feed(w_f, device_option=gc)
         self.ws.create_blob("b").feed(b, device_option=gc)
         self.ws.run(op)
         outputs[order] = self.ws.blobs["Y"].fetch()
     output_size = (size - 1) * stride + kernel + adj - 2 * pad
     self.assertEqual(
         outputs["NCHW"].shape,
         (batch_size, output_channels, output_size, output_size))
     np.testing.assert_allclose(outputs["NCHW"],
                                utils.NHWC2NCHW(outputs["NHWC"]),
                                atol=1e-4,
                                rtol=1e-4)
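The shape assertions in the two ConvTranspose layout tests above follow the usual transposed-convolution output-size arithmetic. A small worked check with illustrative values:

def conv_transpose_output_size(size, stride, kernel, adj, pad):
    # Inverse of the convolution output-size relation, plus the adjustment term.
    return (size - 1) * stride + kernel + adj - 2 * pad

# e.g. size=4, stride=2, kernel=3, adj=1, pad=1 -> (4 - 1) * 2 + 3 + 1 - 2 = 8
assert conv_transpose_output_size(4, 2, 3, 1, 1) == 8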
Example #6
    def test_group_convolution(self, stride, pad, kernel, size, group,
                               input_channels_per_group,
                               output_channels_per_group, batch_size, order,
                               engine, use_bias, gc, dc):
        assume(size >= kernel)

        if hiputl.run_in_hip(gc, dc):
            if order == "NHWC":
                assume(group == 1 and engine != "CUDNN")
        else:
            # TODO: Group conv in NHWC not implemented for GPU yet.
            assume(group == 1 or order == "NCHW"
                   or gc.device_type == caffe2_pb2.CPU)

            if group != 1 and order == "NHWC":
                dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]

        # Group conv not implemented with EIGEN engine.
        assume(group == 1 or engine != "EIGEN")

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        op = core.CreateOperator(
            "Conv",
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride=stride,
            kernel=kernel,
            pad=pad,
            order=order,
            engine=engine,
            group=group,
        )
        X = np.random.rand(batch_size, size, size, input_channels).astype(
            np.float32) - 0.5
        w = np.random.rand(
            output_channels, kernel, kernel,
            input_channels_per_group).astype(np.float32)\
            - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, w, b] if use_bias else [X, w]

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
Example #7
    def test_convolution_transpose_separate_stride_pad_adj_gradient(
            self, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r, kernel,
            adj_h, adj_w, size, input_channels, output_channels, batch_size,
            order, engine, use_bias, compute_dX, gc, dc):
        assume(adj_h < stride_h)
        assume(adj_w < stride_w)
        X = np.random.rand(batch_size, size, size, input_channels).astype(
            np.float32) - 0.5
        w = np.random.rand(
            input_channels, kernel, kernel, output_channels)\
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        op = core.CreateOperator(
            "ConvTranspose",
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride_h=stride_h,
            stride_w=stride_w,
            kernel=kernel,
            pad_t=pad_t,
            pad_l=pad_l,
            pad_b=pad_b,
            pad_r=pad_r,
            adj_h=adj_h,
            adj_w=adj_w,
            order=order,
            engine=engine,
            no_gradient_to_input=not compute_dX,
        )
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, w, b] if use_bias else [X, w]
        self.assertDeviceChecks(dc, op, inputs, [0])

        if use_bias and compute_dX:
            # w, b, X
            outputs_to_check = [1, 2, 0]
        elif use_bias:
            # w, b
            outputs_to_check = [1, 2]
        elif compute_dX:
            # w, X
            outputs_to_check = [1, 0]
        else:
            # w
            outputs_to_check = [1]
        for i in outputs_to_check:
            self.assertGradientChecks(gc, op, inputs, i, [0])
Example #8
    def test_instance_norm_model_helper(self, N, C, H, W, order, epsilon, seed,
                                        is_test):
        np.random.seed(seed)
        model = model_helper.ModelHelper(name="test_model")
        brew.instance_norm(model,
                           'input',
                           'output',
                           C,
                           epsilon=epsilon,
                           order=order,
                           is_test=is_test)

        input_blob = np.random.rand(N, C, H, W).astype(np.float32)
        if order == 'NHWC':
            input_blob = utils.NCHW2NHWC(input_blob)

        self.ws.create_blob('input').feed(input_blob)

        self.ws.create_net(model.param_init_net).run()
        self.ws.create_net(model.net).run()

        if is_test:
            scale = self.ws.blobs['output_s'].fetch()
            assert scale is not None
            assert scale.shape == (C, )
            bias = self.ws.blobs['output_b'].fetch()
            assert bias is not None
            assert bias.shape == (C, )

        output_blob = self.ws.blobs['output'].fetch()
        if order == 'NHWC':
            output_blob = utils.NHWC2NCHW(output_blob)

        assert output_blob.shape == (N, C, H, W)
Example #9
    def test_pooling(self, stride, pad, kernel, size,
                     input_channels, batch_size,
                     order, op_type, engine, gc, dc):
        assume(pad < kernel)
        if hiputl.run_in_hip(gc, dc) and engine == "CUDNN":
            assume(order == "NCHW" and op_type != "LpPool")

        op = core.CreateOperator(
            op_type,
            ["X"],
            ["Y"],
            stride=stride,
            kernel=kernel,
            pad=pad,
            order=order,
            engine=engine,
        )
        X = np.random.rand(
            batch_size, size, size, input_channels).astype(np.float32)
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)

        self.assertDeviceChecks(dc, op, [X], [0])
        if 'MaxPool' not in op_type:
            self.assertGradientChecks(gc, op, [X], 0, [0])
Example #10
    def test_pooling_separate_stride_pad(self, stride_h, stride_w, pad_t,
                                         pad_l, pad_b, pad_r, kernel, size,
                                         input_channels, batch_size, order,
                                         op_type, gc, dc):
        assume(np.max([pad_t, pad_l, pad_b, pad_r]) < kernel)

        op = core.CreateOperator(
            op_type,
            ["X"],
            ["Y"],
            stride_h=stride_h,
            stride_w=stride_w,
            pad_t=pad_t,
            pad_l=pad_l,
            pad_b=pad_b,
            pad_r=pad_r,
            kernel=kernel,
            order=order,
        )
        X = np.random.rand(batch_size, size, size,
                           input_channels).astype(np.float32)

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
        self.assertDeviceChecks(dc, op, [X], [0])
        if 'MaxPool' not in op_type:
            self.assertGradientChecks(gc, op, [X], 0, [0])
Example #11
    def test_pooling_3d(self, stride, pad, kernel, size, input_channels,
                        batch_size, order, op_type, engine, gc, dc):
        assume(pad < kernel)
        assume(size + pad + pad >= kernel)
        # Currently MIOpen Pooling only supports pooling with NCHW order.
        if hiputl.run_in_hip(gc, dc) and (workspace.GetHIPVersion() < 303
                                          or order == "NHWC"):
            assume(engine != "CUDNN")
        # Some cases here could be computed with global pooling, but they are
        # instead computed with the general implementation, which is slower
        # but should still be correct.
        op = core.CreateOperator(
            op_type,
            ["X"],
            ["Y"],
            strides=[stride] * 3,
            kernels=[kernel] * 3,
            pads=[pad] * 6,
            order=order,
            engine=engine,
        )
        X = np.random.rand(batch_size, size, size, size,
                           input_channels).astype(np.float32)
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)

        self.assertDeviceChecks(dc, op, [X], [0], threshold=0.001)
        if 'MaxPool' not in op_type:
            self.assertGradientChecks(gc, op, [X], 0, [0], threshold=0.001)
Example #12
    def test_convolution_transpose_gradients(self, stride, pad, kernel, adj,
                                             size, input_channels,
                                             output_channels, batch_size,
                                             order, engine, use_bias,
                                             compute_dX, gc, dc):
        assume(adj < stride)
        if hiputl.run_in_hip(gc, dc) and engine == "CUDNN":
            assume(order == "NCHW")
        X = np.random.rand(batch_size, size, size, input_channels).astype(
            np.float32) - 0.5
        w = np.random.rand(
            input_channels, kernel, kernel, output_channels)\
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        op = core.CreateOperator(
            "ConvTranspose",
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride=stride,
            kernel=kernel,
            pad=pad,
            adj=adj,
            order=order,
            engine=engine,
            no_gradient_to_input=not compute_dX,
        )
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, w, b] if use_bias else [X, w]
        self.assertDeviceChecks(dc, op, inputs, [0])

        if use_bias and compute_dX:
            # w, b, X
            outputs_to_check = [1, 2, 0]
        elif use_bias:
            # w, b
            outputs_to_check = [1, 2]
        elif compute_dX:
            # w, X
            outputs_to_check = [1, 0]
        else:
            # w
            outputs_to_check = [1]
        for i in outputs_to_check:
            self.assertGradientChecks(gc, op, inputs, i, [0])
Example #13
    def test_convolution_transpose_with_group(self, stride, pad, kernel, adj,
                                              size, input_channels,
                                              output_channels, batch_size,
                                              group, order, engine,
                                              shared_buffer, use_bias, gc, dc):
        assume(adj < stride)
        # TODO: Group conv_transpose in NHWC not implemented for GPU yet.
        assume(group == 1 or order == "NCHW"
               or gc.device_type == caffe2_pb2.CPU)
        if group != 1 and order == "NHWC":
            dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]

        if hiputl.run_in_hip(gc, dc) and order == "NHWC":
            engine = ""

        op = core.CreateOperator(
            "ConvTranspose",
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride=stride,
            kernel=kernel,
            pad=pad,
            adj=adj,
            group=group,
            order=order,
            engine=engine,
            shared_buffer=int(shared_buffer),
            device_option=gc,
        )

        input_channels *= group
        output_channels *= group

        X = np.random.rand(batch_size, size, size, input_channels).astype(
            np.float32) - 0.5
        w = np.random.rand(
            input_channels, kernel, kernel, int(output_channels / group)) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, w, b] if use_bias else [X, w]
        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])
Example #14
 def channel_shuffle_ref(X):
     if order == "NHWC":
         X = utils.NHWC2NCHW(X)
     Y_r = X.reshape(X.shape[0], groups, X.shape[1] // groups,
                     X.shape[2], X.shape[3])
     Y_trns = Y_r.transpose((0, 2, 1, 3, 4))
     Y_reshaped = Y_trns.reshape(X.shape)
     if order == "NHWC":
         Y_reshaped = utils.NCHW2NHWC(Y_reshaped)
     return Y_reshaped
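channel_shuffle_ref above closes over order and groups from the enclosing test. A self-contained sketch of the same reshape / transpose / reshape trick on an NCHW array, with illustrative values:

import numpy as np

def channel_shuffle_nchw(x, groups):
    n, c, h, w = x.shape
    assert c % groups == 0
    # Split channels into groups, swap the two channel axes, then flatten back.
    return (x.reshape(n, groups, c // groups, h, w)
            .transpose(0, 2, 1, 3, 4)
            .reshape(n, c, h, w))

x = np.arange(2 * 6, dtype=np.float32).reshape(2, 6, 1, 1)
y = channel_shuffle_nchw(x, groups=2)
assert y[0, :, 0, 0].tolist() == [0.0, 3.0, 1.0, 4.0, 2.0, 5.0]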
Example #15
 def test_leaky_relu_layout(self, gc, dc, N, C, H, W, alpha, seed):
     outputs = {}
     for order in ('NCHW', 'NHWC'):
         np.random.seed(seed)
         input_blobs = self._get_inputs(N, C, H, W, order)
         self._feed_inputs(input_blobs, device_option=gc)
         op = self._get_op(device_option=gc, alpha=alpha, order=order)
         self.ws.run(op)
         outputs[order] = self.ws.blobs['output'].fetch()
     np.testing.assert_allclose(outputs['NCHW'],
                                utils.NHWC2NCHW(outputs["NHWC"]),
                                atol=1e-4,
                                rtol=1e-4)
Example #16
    def test_pooling_with_index(self, stride, pad, kernel, size,
                                input_channels, batch_size, gc, dc):
        assume(pad < kernel)
        op = core.CreateOperator(
            "MaxPoolWithIndex",
            ["X"],
            ["Y", "Y_index"],
            stride=stride,
            kernel=kernel,
            pad=pad,
            order="NCHW",
            deterministic=1,
        )
        X = np.random.rand(batch_size, size, size,
                           input_channels).astype(np.float32)

        # transpose due to order = NCHW
        X = utils.NHWC2NCHW(X)

        self.assertDeviceChecks(dc, op, [X], [0])
Example #17
    def test_pooling_1d(self, stride, pad, kernel, size, input_channels,
                        batch_size, order, op_type, gc, dc):
        assume(pad < kernel)
        op = core.CreateOperator(
            op_type,
            ["X"],
            ["Y"],
            strides=[stride],
            kernels=[kernel],
            pads=[pad, pad],
            order=order,
            engine="",
        )
        X = np.random.rand(batch_size, size, input_channels).astype(np.float32)
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)

        self.assertDeviceChecks(dc, op, [X], [0])
        if 'MaxPool' not in op_type:
            self.assertGradientChecks(gc, op, [X], 0, [0])
Example #18
    def test_instance_norm_layout(self, gc, dc, N, C, H, W, store_mean,
                                  store_inv_stdev, epsilon, seed):
        # Force store_inv_stdev whenever store_mean is set, to match the
        # existing forward-pass implementation.
        store_inv_stdev |= store_mean

        outputs = {}
        for order in ('NCHW', 'NHWC'):
            np.random.seed(seed)
            input_blobs = self._get_inputs(N, C, H, W, order)
            self._feed_inputs(input_blobs, device_option=gc)
            op = self._get_op(device_option=gc,
                              store_mean=store_mean,
                              store_inv_stdev=store_inv_stdev,
                              epsilon=epsilon,
                              order=order)
            self.ws.run(op)
            outputs[order] = self.ws.blobs['output'].fetch()
        np.testing.assert_allclose(outputs['NCHW'],
                                   utils.NHWC2NCHW(outputs["NHWC"]),
                                   atol=1e-4,
                                   rtol=1e-4)
Example #19
    def test_leaky_relu_model_helper_helper(self, N, C, H, W, order, alpha,
                                            seed):
        np.random.seed(seed)
        arg_scope = {'order': order}
        model = model_helper.ModelHelper(name="test_model",
                                         arg_scope=arg_scope)
        model.LeakyRelu('input', 'output', alpha=alpha)

        input_blob = np.random.rand(N, C, H, W).astype(np.float32)
        if order == 'NHWC':
            input_blob = utils.NCHW2NHWC(input_blob)

        self.ws.create_blob('input').feed(input_blob)

        self.ws.create_net(model.param_init_net).run()
        self.ws.create_net(model.net).run()

        output_blob = self.ws.blobs['output'].fetch()
        if order == 'NHWC':
            output_blob = utils.NHWC2NCHW(output_blob)

        assert output_blob.shape == (N, C, H, W)
Example #20
    def test_global_pooling(self, size, input_channels, batch_size, order,
                            op_type, engine, gc, dc):
        # CuDNN 5 does not support deterministic max pooling.
        assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool")

        if hiputl.run_in_hip(gc, dc) and engine == "CUDNN":
            assume(order == "NCHW" and op_type != "LpPool")
        op = core.CreateOperator(
            op_type,
            ["X"],
            ["Y"],
            order=order,
            engine=engine,
            global_pooling=True,
        )
        X = np.random.rand(batch_size, size, size,
                           input_channels).astype(np.float32)
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)

        self.assertDeviceChecks(dc, op, [X], [0])
        if 'MaxPool' not in op_type:
            self.assertGradientChecks(gc, op, [X], 0, [0])
Example #21
    def test_global_pooling_3d(self, kernel, size, input_channels, batch_size,
                               order, op_type, engine, gc, dc):
        # Currently MIOpen Pooling only supports 2d pooling
        if hiputl.run_in_hip(gc, dc):
            assume(engine != "CUDNN")
        # pad and stride are ignored because they are inferred when
        # global_pooling is set.
        op = core.CreateOperator(
            op_type,
            ["X"],
            ["Y"],
            kernels=[kernel] * 3,
            order=order,
            global_pooling=True,
            engine=engine,
        )
        X = np.random.rand(batch_size, size, size, size,
                           input_channels).astype(np.float32)
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)

        self.assertDeviceChecks(dc, op, [X], [0], threshold=0.001)
        if 'MaxPool' not in op_type:
            self.assertGradientChecks(gc, op, [X], 0, [0], threshold=0.001)
Example #22
        def ref(input_blob, scale_blob, bias_blob):
            if order == 'NHWC':
                input_blob = utils.NHWC2NCHW(input_blob)

            mean_blob = input_blob.reshape((N, C, -1)).mean(axis=2)
            inv_stdev_blob = 1.0 / \
                np.sqrt(input_blob.reshape((N, C, -1)).var(axis=2) + epsilon)
            # _bc indicates blobs that are reshaped for broadcast
            scale_bc = scale_blob[np.newaxis, :, np.newaxis, np.newaxis]
            mean_bc = mean_blob[:, :, np.newaxis, np.newaxis]
            inv_stdev_bc = inv_stdev_blob[:, :, np.newaxis, np.newaxis]
            bias_bc = bias_blob[np.newaxis, :, np.newaxis, np.newaxis]
            normalized_blob = scale_bc * (input_blob - mean_bc) * inv_stdev_bc \
                + bias_bc

            if order == 'NHWC':
                normalized_blob = utils.NCHW2NHWC(normalized_blob)

            if not store_mean and not store_inv_stdev:
                return normalized_blob,
            elif not store_inv_stdev:
                return normalized_blob, mean_blob
            else:
                return normalized_blob, mean_blob, inv_stdev_blob
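The reference above normalizes every (n, c) slice of the input independently. A standalone sanity check of the same computation in plain NumPy, with shapes and epsilon chosen only for illustration:

import numpy as np

N, C, H, W, epsilon = 2, 3, 4, 4, 1e-5
x = np.random.rand(N, C, H, W).astype(np.float32)

mean = x.reshape(N, C, -1).mean(axis=2)[:, :, None, None]
inv_stdev = 1.0 / np.sqrt(x.reshape(N, C, -1).var(axis=2) + epsilon)
y = (x - mean) * inv_stdev[:, :, None, None]

# With unit scale and zero bias, every (n, c) slice should end up with
# approximately zero mean and unit variance.
assert np.allclose(y.reshape(N, C, -1).mean(axis=2), 0.0, atol=1e-5)
assert np.allclose(y.reshape(N, C, -1).var(axis=2), 1.0, atol=1e-2)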
Example #23
    def test_groupwise_dnnlowp_conv_acc16_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        # X and W have scale 1, so they are exactly representable after
        # quantization. This is ensured by having at least one 0 and one 255
        # for unsigned 8-bit tensors, and at least one -128 and one 127 for
        # signed 8-bit tensors.
        # Since fbgemm_acc16 accumulates into 16 bits, we use small numbers in
        # this test to avoid overflow, except for those 0, 255, -128, and 127.
        # We also make sure 255, -128, and 127 are never multiplied together,
        # by putting them in different input channels and making the
        # corresponding input channel in the other matrix all zeros.
        # For example, we put 255 in input channel 1 of X, so we make the
        # corresponding input channel in W all zeros.
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        if batch_size != 0:
            X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (np.random.rand(output_channels, kernel, kernel,
                            input_channels_per_group) * 4 - 2 + W_min + 128)
        W = np.round(W).astype(np.float32)
        W[..., 1] = W_min + 128  # "zeros"
        for g in range(group):
            W[g * output_channels_per_group, 0, 0, 0] = W_min
            W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
            if not preserve_weight_sparsity:
                W[g * output_channels_per_group:(g + 1) *
                  output_channels_per_group, ] += g

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            conv = core.CreateOperator(
                op_type,
                ["X_q" if do_quantize else "X", "W", "b"],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            if do_dequantize:
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine="DNNLOWP",
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order,
                           gc, outputs)

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)
Example #24
    def test_groupwise_dnnlowp_conv_acc16_outlier(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        prepack_weight,
        nbits_in_non_outlier,
        share_col_buffer,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        X_min = -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        if batch_size != 0:
            X[0, 0, 0, 1] = X_max

        W_min = -100
        W_max = W_min + 255
        W = (np.random.rand(output_channels, kernel, kernel,
                            input_channels_per_group) * 4 - 2 + W_min + 128)
        W = np.round(W).astype(np.float32)
        W[..., 1] = W_min + 128  # "zeros"
        for g in range(group):
            W[g * output_channels_per_group, 0, 0, 0] = W_min
            W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
            W[g * output_channels_per_group:(g + 1) *
              output_channels_per_group, ] += g

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            do_prepack_weight = "DNNLOWP" in engine and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine="DNNLOWP",
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            if do_prepack_weight:
                X_min = 0 if X.size == 0 else X.min()
                X_max = 0 if X.size == 0 else X.max()
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X_min, X_max)
                inputs = ["W"]
                if do_dequantize:
                    inputs += ["b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    nbits_in_non_outlier=nbits_in_non_outlier,
                    engine=engine,
                    group=group,
                    quantize_groupwise=1,
                    in_scale=x_q_param.scale,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed" if do_prepack_weight else "W",
                    "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                nbits_in_non_outlier=nbits_in_non_outlier,
                shared_buffer=(1 if share_col_buffer else 0),
                engine=engine,
                group=group,
                quantize_groupwise=1,
                device_option=gc,
            )
            if do_dequantize or do_prepack_weight:
                # groupwise quantization only works with static quantization
                # so we need to set quantization parameters
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine="DNNLOWP",
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(self, init_net, net, X, W, b, op_type, engine,
                           order, gc, outputs)

        check_quantized_results_close(outputs)
Example #25
 def lc_2d_nhwc(X, W, b=None):
     XT = utils.NHWC2NCHW(X)
     WT = np.transpose(W, [0, 1, 2, 5, 3, 4])
     output = lc_2d_nchw(XT, WT, b)
     return [utils.NCHW2NHWC(output[0])]
Example #26
    def _test_dnnlowp_nd_int(
        self,
        stride,
        pad,
        kernels,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        prepack_weight,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume((not prepack_weight) or order == "NHWC")
        ndim = len(kernels)

        X, W, b = generate_convnd_inputs(
            (stride, ) * ndim,
            (pad, ) * ndim,
            kernels,
            (dilation, ) * ndim,
            (size, ) * ndim,
            group,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            order,
        )

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [("Conv", ""), ("Conv", "DNNLOWP_16"),
                          ("Int8Conv", "DNNLOWP")]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            fall_back_to_NCHW = "DNNLOWP" not in engine and order == "NHWC"

            if fall_back_to_NCHW:
                X_nchw = utils.NHWC2NCHW(X)
                W_nchw = utils.NHWC2NCHW(W)

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            # If the output scale/zero point isn't set, DNNLOWP computes it
            # from the reference fp32 op, which isn't possible when we
            # quantize the input weights. Make sure at least one output is
            # collected first so the output scale/zero point can be computed.
            do_quantize_weight = engine == "DNNLOWP" and len(outputs) > 0
            do_prepack_weight = engine == "DNNLOWP" and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator("Quantize", ["X"], ["X_q"],
                                               engine=engine,
                                               device_option=gc)
                net.Proto().op.extend([quantize])

            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max())
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q")
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed" if do_prepack_weight else
                    ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                strides=[stride] * ndim,
                kernels=kernels,
                dilations=[dilation] * ndim,
                pads=[pad] * (ndim * 2),
                order="NCHW" if fall_back_to_NCHW else order,
                dequantize_output=not do_dequantize,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine=engine,
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X_nchw if fall_back_to_NCHW else X,
                                          device_option=gc)
            self.ws.create_blob("W").feed(W_nchw if fall_back_to_NCHW else W,
                                          device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(init_net)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            if fall_back_to_NCHW:
                Y = utils.NCHW2NHWC(Y)
            outputs.append(
                Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs)
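Several of the DNNLOWP examples call dnnlowp_utils.choose_quantization_params(X.min(), X.max()) to pick the activation scale and zero point. That helper is not shown here; the following is a simplified sketch of a standard affine (asymmetric) uint8 scheme, offered as an assumption about its behavior rather than the exact dnnlowp_utils code:

import collections

QParam = collections.namedtuple("QParam", ["scale", "zero_point"])

def choose_quantization_params(x_min, x_max, qmin=0, qmax=255):
    # Simplified affine quantization: map [x_min, x_max] onto [qmin, qmax].
    x_min, x_max = min(x_min, 0.0), max(x_max, 0.0)  # range must contain zero
    scale = (x_max - x_min) / float(qmax - qmin)
    if scale == 0.0:
        scale = 1.0
    zero_point = int(round(qmin - x_min / scale))
    zero_point = min(max(zero_point, qmin), qmax)
    return QParam(scale=scale, zero_point=zero_point)

# The tests above use X in [-77, 178], a range of exactly 255, which maps to
# scale 1.0 and zero_point 77 under this scheme.
assert choose_quantization_params(-77.0, 178.0) == QParam(1.0, 77)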
Example #27
    def test_dnnlowp_conv_acc16_int(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        in_quantized,
        out_quantized,
        weight_quantized,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        # X and W have scale 1, so they are exactly representable after
        # quantization. This is ensured by having at least one 0 and one 255
        # for unsigned 8-bit tensors, and at least one -128 and one 127 for
        # signed 8-bit tensors.
        # Since fbgemm_acc16 accumulates into 16 bits, we use small numbers in
        # this test to avoid overflow, except for those 0, 255, -128, and 127.
        # We also make sure 255, -128, and 127 are never multiplied together,
        # by putting them in different input channels and making the
        # corresponding input channel in the other matrix all zeros.
        # For example, we put 255 in input channel 1 of X, so we make the
        # corresponding input channel in W all zeros.
        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (
            np.random.rand(output_channels, kernel, kernel, input_channels_per_group)
            * 4
            - 2
            + W_min
            + 128
        )
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128  # "zeros"

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        # No input quantization error in bias
        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine and in_quantized
            do_dequantize = "DNNLOWP" in engine and out_quantized
            do_quantize_weight = (
                "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
            )

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity
                )
                net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                x_q_param = dnnlowp_utils.choose_quantization_params(
                    X.min(), X.max(), preserve_activation_sparsity
                )
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param
                )
                net.Proto().op.extend([int8_bias_tensor_fill])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_q" if do_quantize_weight else "W",
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                dequantize_output=not do_dequantize,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity
                )
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator(
                    "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
                )
                net.Proto().op.extend([dequantize])

            self.ws.create_blob("X").feed(X, device_option=gc)
            self.ws.create_blob("W").feed(W, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(net)
            Y = self.ws.blobs["Y"].fetch()
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))

        check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
Example #28
 def canonical(o):
     if o.order == "NHWC":
         return utils.NHWC2NCHW(o.Y)
     else:
         return o.Y
Example #29
def generate_convnd_inputs(
    strides,
    pads,
    kernels,
    dilations,
    sizes,
    group,
    input_channels_per_group,
    output_channels_per_group,
    batch_size,
    order,
    groupwise_quantization=False,
    preserve_activation_sparsity=False,
    preserve_weight_sparsity=False,
):
    dim = len(sizes)
    assume(all(len(a) == dim for a in [strides, pads, kernels, dilations]))
    assume(
        all(sizes[d] >= dilations[d] * (kernels[d] - 1) + 1
            for d in range(dim)))
    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group
    depthwise_convolution = (input_channels_per_group == 1
                             and output_channels_per_group == 1)

    assert input_channels > 1
    assert output_channels > 1

    # X and W have scale 1, so they are exactly representable after
    # quantization.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X_range = X_max - X_min
    if depthwise_convolution and groupwise_quantization:
        # For depthwise convolution, it's not enough to set input channel 0
        # to all X_min to avoid overflow from vpmaddubsw
        X_range /= 2
    X = np.round(
        np.random.rand(*((batch_size, ) + tuple(sizes) +
                         (input_channels, ))) * X_range + X_min)
    X = X.astype(np.float32)
    if (batch_size != 0 and depthwise_convolution and groupwise_quantization
            and not preserve_activation_sparsity):
        # Put X_max in a position not to be paired with any padded value.
        # Put X_min to all positions that can be paired with the X_max value.
        #
        # This is an example of a pattern for 3x3x3
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .  min
        #
        #  .   .   .   .   .
        #  .   .   .   .  min
        #  .  min max min  .
        # min  .   .   .   .
        #  .   .   .   .   .
        #
        # min  .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .
        #  .   .   .   .   .

        # Make sure we have enough dimensions.
        assert X.shape[1] >= 3
        assert all(X.shape[d + 1] >= kernels[d] + 2 for d in range(1, dim))

        # Take subtensor we want to manipulate
        X_sub = X[(0, ) * (X.ndim - dim - 1) + (slice(None), ) * dim + (0, )]

        # Put X_max in the middle of the subtensor
        X_sub[(1, ) + tuple(kernels[d] // 2 + 1
                            for d in range(1, dim))] = X_max

        # Put X_min to the positions that can be paired with X_max across
        # the slowest moving dimension
        X_sub[[[0, 2]] + [[kernels[d] + 1, 0] for d in range(1, dim)]] = X_min

        # Put X_min to other positions that can be paired with X_max
        for d1 in range(1, dim):
            X_sub[[[1]] + [[kernels[d2] // 2 + 1] for d2 in range(1, d1)] +
                  [[kernels[d1] // 2, kernels[d1] // 2 + 2]] +
                  [[kernels[d2] + 1, 0] for d2 in range(d1 + 1, dim)]] = X_min
    else:
        # input channel 0 is all X_min to avoid overflow from vpmaddubsw when
        # multiplied with W_min and W_max
        X[..., 0] = X_min
        if batch_size != 0:
            X[(0, ) * (X.ndim - 1) + (1, )] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = np.round(
        np.random.rand(*((output_channels, ) + tuple(kernels) +
                         (input_channels_per_group, ))) * (W_max - W_min) +
        W_min)
    W = W.astype(np.float32)
    if groupwise_quantization:
        for g in range(group):
            W[(g * output_channels_per_group, ) + (0, ) * (W.ndim - 1)] = W_min
            if depthwise_convolution:
                W[(g * output_channels_per_group, 1) + (0, ) *
                  (W.ndim - 2)] = W_max
            else:
                assert output_channels_per_group > 1
                W[(g * output_channels_per_group + 1, ) + (0, ) *
                  (W.ndim - 1)] = W_max

            # Make sure each group has different ranges to really see the effect
            # of group-wise quantization.
            if not preserve_weight_sparsity:
                W[g * output_channels_per_group:(g + 1) *
                  output_channels_per_group, ] += g
    else:
        W[(0, ) + (0, ) * (W.ndim - 1)] = W_min
        W[(1, ) + (0, ) * (W.ndim - 1)] = W_max

    different_range_per_group = groupwise_quantization and not preserve_weight_sparsity
    for g in range(group):
        avoid_vpmaddubsw_overflow(
            strides,
            pads,
            kernels,
            dilations,
            sizes,
            input_channels_per_group,
            output_channels_per_group,
            batch_size,
            X[...,
              g * input_channels_per_group:(g + 1) * input_channels_per_group],
            X_min,
            X_max,
            W[g * output_channels_per_group:(g + 1) *
              output_channels_per_group, ],
            W_min + (g if different_range_per_group else 0),
            W_max + (g if different_range_per_group else 0),
        )

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    b = np.random.randn(output_channels).astype(np.float32)

    return X, W, b
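The careful placement of X_min, X_max, W_min, and W_max above (and the "zero" channels in the acc16 tests) guards against saturation in the AVX2 vpmaddubsw instruction, which multiplies pairs of unsigned 8-bit and signed 8-bit values and accumulates adjacent pairs into a signed 16-bit result. A quick arithmetic check of why two adjacent extreme products must never meet:

INT16_MAX = 32767

# Two adjacent uint8 * int8 products at their extremes overflow int16, which
# is why the inputs above never let 255 line up with 127 / -128 in
# neighboring channels.
worst_case = 255 * 127 + 255 * 127
assert worst_case == 64770
assert worst_case > INT16_MAX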
Example #30
    def test_dnnlowp_conv_acc16_outlier(
        self,
        stride,
        pad,
        kernel,
        dilation,
        size,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        weight_quantized,
        prepack_weight,
        nbits_in_non_outlier,
        share_col_buffer,
        preserve_activation_sparsity,
        preserve_weight_sparsity,
        gc,
        dc,
    ):
        assume(group == 1 or dilation == 1)
        assume(size >= dilation * (kernel - 1) + 1)

        input_channels = input_channels_per_group * group
        output_channels = output_channels_per_group * group

        X_min = 0 if preserve_activation_sparsity else -77
        X_max = X_min + 255
        X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
        X = np.round(X).astype(np.float32)
        X[..., 0] = X_min
        if batch_size != 0:
            X[0, 0, 0, 1] = X_max

        if preserve_weight_sparsity:
            W_min = -128
            W_max = 100
        else:
            W_min = -100
            W_max = W_min + 255
        W = (np.random.rand(output_channels, kernel, kernel,
                            input_channels_per_group) * 4 - 2 + W_min + 128)
        W = np.round(W).astype(np.float32)
        W[0, 0, 0, 0] = W_min
        W[1, 0, 0, 0] = W_max
        W[..., 1] = W_min + 128  # "zeros"

        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            W = utils.NHWC2NCHW(W)

        b = np.round(np.random.randn(output_channels)).astype(np.float32)

        Output = collections.namedtuple("Output",
                                        ["Y", "op_type", "engine", "order"])
        outputs = []

        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP_ACC16"),
            ("Int8Conv", "DNNLOWP_ACC16"),
        ]

        for op_type, engine in op_engine_list:
            init_net = core.Net("test_init_net")
            net = core.Net("test_net")

            do_quantize = "DNNLOWP" in engine
            do_dequantize = "DNNLOWP" in engine
            do_quantize_weight = "DNNLOWP" in engine and weight_quantized
            do_prepack_weight = "DNNLOWP" in engine and prepack_weight

            if do_quantize:
                quantize = core.CreateOperator(
                    "Quantize",
                    ["X"],
                    ["X_q"],
                    preserve_activation_sparsity=preserve_activation_sparsity,
                    engine="DNNLOWP",
                    device_option=gc,
                )
                net.Proto().op.extend([quantize])

            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X_min, X_max, preserve_activation_sparsity)
            if do_quantize_weight:
                int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                    W, "W_q", preserve_weight_sparsity)
                init_net.Proto().op.extend([int8_given_tensor_fill])

                # Bias
                int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                    b, "b_q", x_q_param, w_q_param)
                init_net.Proto().op.extend([int8_bias_tensor_fill])

            if do_prepack_weight:
                inputs = ["W_q" if do_quantize_weight else "W"]
                if do_dequantize:
                    inputs += ["b_q" if do_quantize_weight else "b"]
                pack = core.CreateOperator(
                    "Int8ConvPackWeight",
                    inputs,
                    ["W_packed"],
                    group=group,
                    nbits_in_non_outlier=nbits_in_non_outlier,
                    preserve_weight_sparsity=preserve_weight_sparsity,
                    in_scale=x_q_param.scale,
                    engine=engine,
                )
                init_net.Proto().op.extend([pack])

            conv = core.CreateOperator(
                op_type,
                [
                    "X_q" if do_quantize else "X",
                    "W_packed" if do_prepack_weight else
                    ("W_q" if do_quantize_weight else "W"),
                    "b_q" if do_quantize_weight else "b",
                ],
                ["Y_q" if do_dequantize else "Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                nbits_in_non_outlier=nbits_in_non_outlier,
                shared_buffer=(1 if share_col_buffer else 0),
                preserve_activation_sparsity=preserve_activation_sparsity,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine,
                group=group,
                device_option=gc,
            )
            if do_dequantize or do_quantize_weight or do_prepack_weight:
                # When quantized weight is provided, we can't rescale the
                # output dynamically by looking at the range of output of each
                # batch, so here we provide the range of output observed from
                # fp32 reference implementation
                dnnlowp_utils.add_quantization_param_args(
                    conv, outputs[0][0], preserve_activation_sparsity)
            net.Proto().op.extend([conv])

            if do_dequantize:
                dequantize = core.CreateOperator("Dequantize", ["Y_q"], ["Y"],
                                                 engine="DNNLOWP",
                                                 device_option=gc)
                net.Proto().op.extend([dequantize])

            run_conv_or_fc(self, init_net, net, X, W, b, op_type, engine,
                           order, gc, outputs)

        check_quantized_results_close(outputs,
                                      symmetric=preserve_activation_sparsity)