def test_groupwise_dnnlowp_conv_acc16_outlier(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    prepack_weight, nbits_in_non_outlier, share_col_buffer, gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    X_min = -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    W_min = -100
    W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4
        - 2 + W_min + 128
    )
    W = np.round(W).astype(np.float32)
    W[..., 1] = W_min + 128  # "zeros"
    for g in range(group):
        W[g * output_channels_per_group, 0, 0, 0] = W_min
        W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
        W[g * output_channels_per_group : (g + 1) * output_channels_per_group,] += g

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = "DNNLOWP" in engine and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(X_min, X_max)
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                stride=stride, kernel=kernel, dilation=dilation, pad=pad,
                nbits_in_non_outlier=nbits_in_non_outlier,
                engine=engine, group=group, quantize_groupwise=1,
                in_scale=x_q_param.scale,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight else "W",
                "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            engine=engine, group=group, quantize_groupwise=1, device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs)
def test_groupwise_dnnlowp_conv_acc16_int(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    share_col_buffer, preserve_activation_sparsity, preserve_weight_sparsity,
    gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly representable after quantization.
    # This is ensured by having at least one 0 and one 255 for unsigned 8-bit
    # tensors, and at least one -128 and one 127 for signed 8-bit tensors.
    # Since fbgemm_acc16 accumulates into 16 bits, we use small numbers except
    # for those 0, 255, -128, and 127 to avoid overflow in this test
    # (see the overflow sketch after this test).
    # We also make sure 255, -128, or 127 are never multiplied together by
    # putting them in different input channels and making the corresponding
    # input channel of the other matrix all zeros. For example, we put 255 in
    # input channel 1 of X, so we make the corresponding input channel of W
    # all zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4
        - 2 + W_min + 128
    )
    W = np.round(W).astype(np.float32)
    W[..., 1] = W_min + 128  # "zeros"
    for g in range(group):
        W[g * output_channels_per_group, 0, 0, 0] = W_min
        W[g * output_channels_per_group + 1, 0, 0, 0] = W_max
        if not preserve_weight_sparsity:
            W[g * output_channels_per_group : (g + 1) * output_channels_per_group,] += g

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP", device_option=gc,
            )
            net.Proto().op.extend([quantize])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, group=group, quantize_groupwise=1, device_option=gc,
        )
        if do_dequantize:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order, gc, outputs)

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
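# The ACC16 tests above and below keep quantized magnitudes tiny because those
# engines accumulate uint8 * int8 products in 16-bit registers. A minimal
# standalone sketch (not part of the original tests; the arithmetic is
# illustrative, not taken from fbgemm) of the bound that motivates this:
def _acc16_overflow_sketch():
    int16_max = 2 ** 15 - 1  # 32767
    # One pair of extreme products already exceeds the 16-bit accumulator,
    # which is why 255 / -128 / 127 are never multiplied together in the tests.
    extreme_pair = 255 * 127 + 255 * 127
    assert extreme_pair > int16_max
    # With values kept within ~4 quantization levels (as generated above), many
    # product pairs fit before the accumulator has to be spilled to 32 bits.
    small_pair = 4 * 4 + 4 * 4
    return int16_max // small_pair  # rough number of safely accumulated pairs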
def test_dnnlowp_conv_acc16_int(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    weight_quantized, share_col_buffer, preserve_activation_sparsity,
    preserve_weight_sparsity, gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    # X and W have scale 1, so they are exactly representable after quantization.
    # This is ensured by having at least one 0 and one 255 for unsigned 8-bit
    # tensors, and at least one -128 and one 127 for signed 8-bit tensors.
    # Since fbgemm_acc16 accumulates into 16 bits, we use small numbers except
    # for those 0, 255, -128, and 127 to avoid overflow in this test.
    # We also make sure 255, -128, or 127 are never multiplied together by
    # putting them in different input channels and making the corresponding
    # input channel of the other matrix all zeros. For example, we put 255 in
    # input channel 1 of X, so we make the corresponding input channel of W
    # all zeros.
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4
        - 2 + W_min + 128
    )
    W = np.round(W).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_quantize_weight = (
            "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
        )

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP", device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X_min, X_max, preserve_activation_sparsity
            )
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            net.Proto().op.extend([int8_bias_tensor_fill])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_q" if do_quantize_weight else "W",
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, group=group, device_option=gc,
        )
        if do_dequantize or do_quantize_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32 reference
            # implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order, gc, outputs)

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_fully_connected_int(
    self, input_channels, output_channels, batch_size, in_quantized, out_quantized,
    weight_quantized, prepack_weight, preserve_activation_sparsity,
    preserve_weight_sparsity, fuse_relu, output_packed_bias, gc, dc,
):
    # X and W have scale 1, so they are exactly representable after quantization
    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max
    X[:, 0] = X_min
    if batch_size != 0:
        X[0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * (W_max - W_min) + W_min
    )
    W = W.astype(np.float32)
    W[0, 0] = W_min
    W[1, 0] = W_max

    # Make sure we won't have overflows from the vpmaddubsw instruction used in
    # fbgemm
    avoid_vpmaddubsw_overflow_fc(
        batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max,
    )

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [("FC", "")]
    if fuse_relu:
        op_engine_list += [("Int8FCRelu", "DNNLOWP")]
    else:
        op_engine_list += [
            ("FC", "DNNLOWP"),
            ("FC", "DNNLOWP_16"),
            ("Int8FC", "DNNLOWP"),
        ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine, device_option=gc,
            )
            net.Proto().op.extend([quantize])

        X_min = 0 if X.size == 0 else X.min()
        X_max = 0 if X.size == 0 else X.max()
        x_q_param = dnnlowp_utils.choose_quantization_params(
            X_min, X_max, preserve_activation_sparsity
        )
        w_q_param = None
        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8FCPackWeight",
                inputs,
                ["W_packed", "B_q32"]
                if do_dequantize and output_packed_bias
                else ["W_packed"],
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed"
                if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, device_option=gc,
        )
        if do_quantize_weight or do_prepack_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32 reference
            # implementation.
            dnnlowp_utils.add_quantization_param_args(
                fc, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([fc])

        if fuse_relu and "DNNLOWP" not in engine:
            net.Relu(["Y"], "Y")

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, None, gc, outputs
        )

        if output_packed_bias and do_prepack_weight and do_dequantize:
            bias_int32 = self.ws.blobs["B_q32"].fetch()
            if do_quantize_weight:
                # The packed int32 bias should be b quantized with scale
                # x_scale * w_scale (see the sketch after this test).
                np.testing.assert_equal(
                    bias_int32[0],
                    np.round(b / (x_q_param.scale * w_q_param.scale)),
                )
            np.testing.assert_equal(bias_int32[0].dtype, np.int32)

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
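# A minimal standalone sketch (not part of the original tests) of the bias
# quantization relationship checked above: with activation scale x_scale and
# weight scale w_scale, the packed int32 bias is round(b / (x_scale * w_scale)).
# The scales below are made-up example values; np is this module's numpy import.
def _bias_quantization_sketch():
    b = np.array([0.5, -1.25, 3.0], dtype=np.float32)
    x_scale, w_scale = 0.1, 0.05  # hypothetical quantization scales
    b_q32 = np.round(b / (x_scale * w_scale)).astype(np.int32)
    return b_q32  # array([100, -250, 600], dtype=int32)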
def test_dnnlowp_fully_connected_acc16_int(
    self, input_channels, output_channels, batch_size, in_quantized, out_quantized,
    gc, dc,
):
    # X and W have scale 1, so they are exactly representable after quantization.
    # This is ensured by having at least one 0 and one 255 for unsigned 8-bit
    # tensors, and at least one -128 and one 127 for signed 8-bit tensors.
    # Since fbgemm_acc16 accumulates into 16 bits, we use small numbers except
    # for those 0, 255, -128, and 127 to avoid overflow in this test.
    # We also make sure 255, -128, or 127 are never multiplied together by
    # putting them in different input channels and making the corresponding
    # input channel of the other matrix all zeros. For example, we put 255 in
    # input channel 1 of X, so we make the corresponding input channel of W
    # all zeros.
    X_min = -77
    X_max = X_min + 255
    X = np.round(np.random.rand(batch_size, input_channels) * 4 + X_min)
    X = X.astype(np.float32)
    X[:, 0] = X_min
    X[0, 1] = X_max

    W_min = -100
    W_max = W_min + 255
    W = np.round(
        np.random.rand(output_channels, input_channels) * 4 - 2 + W_min + 128
    )
    W = W.astype(np.float32)
    W[0, 0] = W_min
    W[1, 0] = W_max
    W[:, 1] = W_min + 128

    # No input quantization error in bias
    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP_ACC16"),
        ("Int8FC", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([quantize])

        fc = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=(0 if do_dequantize else 1),
            engine=engine, device_option=gc,
        )
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(self, None, net, X, W, b, op_type, engine, None, gc, outputs)

    check_quantized_results_close(outputs)
def test_dnnlowp_conv_acc16_outlier(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    weight_quantized, prepack_weight, nbits_in_non_outlier, share_col_buffer,
    preserve_activation_sparsity, preserve_weight_sparsity, gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume(size >= dilation * (kernel - 1) + 1)

    input_channels = input_channels_per_group * group
    output_channels = output_channels_per_group * group

    X_min = 0 if preserve_activation_sparsity else -77
    X_max = X_min + 255
    X = np.random.rand(batch_size, size, size, input_channels) * 4 + X_min
    X = np.round(X).astype(np.float32)
    X[..., 0] = X_min
    if batch_size != 0:
        X[0, 0, 0, 1] = X_max

    if preserve_weight_sparsity:
        W_min = -128
        W_max = 100
    else:
        W_min = -100
        W_max = W_min + 255
    W = (
        np.random.rand(output_channels, kernel, kernel, input_channels_per_group) * 4
        - 2 + W_min + 128
    )
    W = np.round(W).astype(np.float32)
    W[0, 0, 0, 0] = W_min
    W[1, 0, 0, 0] = W_max
    W[..., 1] = W_min + 128  # "zeros"

    if order == "NCHW":
        X = utils.NHWC2NCHW(X)
        W = utils.NHWC2NCHW(W)

    b = np.round(np.random.randn(output_channels)).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP_ACC16"),
        ("Int8Conv", "DNNLOWP_ACC16"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_quantize_weight = "DNNLOWP" in engine and weight_quantized
        do_prepack_weight = "DNNLOWP" in engine and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine="DNNLOWP", device_option=gc,
            )
            net.Proto().op.extend([quantize])

        X_min = 0 if X.size == 0 else X.min()
        X_max = 0 if X.size == 0 else X.max()
        x_q_param = dnnlowp_utils.choose_quantization_params(
            X_min, X_max, preserve_activation_sparsity
        )

        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                nbits_in_non_outlier=nbits_in_non_outlier,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed"
                if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
            nbits_in_non_outlier=nbits_in_non_outlier,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, group=group, device_option=gc,
        )
        if do_dequantize or do_quantize_weight or do_prepack_weight:
            # When quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of output of each batch, so
            # here we provide the range of output observed from the fp32
            # reference implementation
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP", device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_groupwise_dnnlowp_conv_int(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    prepack_weight, preserve_activation_sparsity, preserve_weight_sparsity, gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume((not prepack_weight) or order == "NHWC")

    X, W, b = generate_conv_inputs(
        stride, pad, kernel, dilation, size, group,
        input_channels_per_group, output_channels_per_group, batch_size, order,
        groupwise_quantization=True,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine, device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = hardcode_scale_zp.choose_quantization_params(X_min, X_max)
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                stride=stride, kernel=kernel, dilation=dilation, pad=pad,
                preserve_weight_sparsity=preserve_weight_sparsity,
                engine=engine, group=group, quantize_groupwise=1,
                in_scale=x_q_param.scale,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight else "W",
                "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, group=group, quantize_groupwise=1, device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine, device_option=gc,
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_dnnlowp_depthwise_3x3_conv(
    self, stride, size, group, batch_size, prepack_weight, share_col_buffer,
    preserve_activation_sparsity, preserve_weight_sparsity, quantize_groupwise,
    relu, gc, dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_conv_inputs(
        stride, pad, kernel, dilation, size, group,
        input_channels_per_group, output_channels_per_group, batch_size, order,
        groupwise_quantization=quantize_groupwise,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    if relu:
        op_engine_list = [
            ("Conv", ""),
            ("ConvRelu", "DNNLOWP"),
            ("Int8ConvRelu", "DNNLOWP"),
        ]
    else:
        op_engine_list = [
            ("Conv", ""),
            ("Conv", "DNNLOWP"),
            ("Int8Conv", "DNNLOWP"),
        ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine, device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            X_min = 0 if X.size == 0 else X.min()
            X_max = 0 if X.size == 0 else X.max()
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X_min, X_max, preserve_activation_sparsity
            )
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                quantize_groupwise=quantize_groupwise,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, group=group, quantize_groupwise=quantize_groupwise,
            device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])
        elif relu:
            relu_op = core.CreateOperator(
                "Relu", ["Y"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([relu_op])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
def test_groupwise_dnnlowp_conv_relu_int(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order, gc, dc,
):
    assume(group == 1 or dilation == 1)

    X, W, b = generate_conv_inputs(
        stride, pad, kernel, dilation, size, group,
        input_channels_per_group, output_channels_per_group, batch_size, order,
        True,  # group-wise
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("ConvRelu", "DNNLOWP"),
        ("ConvRelu", "DNNLOWP_16"),
        ("Int8ConvRelu", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        if "DNNLOWP" in engine:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

            conv = core.CreateOperator(
                op_type,
                ["X_q", "W", "b"],
                ["Y_q"],
                stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
                engine=engine, group=group, quantize_groupwise=1, device_option=gc,
            )
            # groupwise quantization only works with static quantization
            # so we need to set quantization parameters
            dnnlowp_utils.add_quantization_param_args(conv, outputs[0][0])
            net.Proto().op.extend([conv])

            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])
        else:
            conv = core.CreateOperator(
                op_type,
                ["X", "W", "b"],
                ["Y"],
                stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
                engine=engine, group=group, device_option=gc,
            )
            net.Proto().op.extend([conv])

            relu = core.CreateOperator(
                "Relu", ["Y"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([relu])

        run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order, gc, outputs)

    check_quantized_results_close(outputs)
def test_dnnlowp_conv_int(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    weight_quantized, prepack_weight, share_col_buffer,
    preserve_activation_sparsity, preserve_weight_sparsity, gc, dc,
):
    assume(group == 1 or dilation == 1)
    assume((not prepack_weight) or order == "NHWC")

    X, W, b = generate_conv_inputs(
        stride, pad, kernel, dilation, size, group,
        input_channels_per_group, output_channels_per_group, batch_size, order,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("Conv", "DNNLOWP"),
        ("Conv", "DNNLOWP_16"),
        ("Int8Conv", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        # If the output scale/zero point aren't set, DNNLOWP computes them from
        # the reference fp32 op, which isn't possible when we quantize the
        # input weights. Make sure at least one output has been collected first
        # so the output scale/zero point can be computed (see the
        # parameter-choice sketch after this test).
        do_quantize_weight = (
            engine == "DNNLOWP" and weight_quantized and len(outputs) > 0
        )
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine, device_option=gc,
            )
            net.Proto().op.extend([quantize])

        x_q_param = dnnlowp_utils.choose_quantization_params(
            X.min(), X.max(), preserve_activation_sparsity
        )
        if do_quantize_weight:
            int8_given_tensor_fill, w_q_param = dnnlowp_utils.create_int8_given_tensor_fill(
                W, "W_q", preserve_weight_sparsity
            )
            init_net.Proto().op.extend([int8_given_tensor_fill])

            # Bias
            int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                b, "b_q", x_q_param, w_q_param
            )
            init_net.Proto().op.extend([int8_bias_tensor_fill])

        if do_prepack_weight:
            inputs = ["W_q" if do_quantize_weight else "W"]
            if do_dequantize:
                inputs += ["b_q" if do_quantize_weight else "b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed"
                if do_prepack_weight
                else ("W_q" if do_quantize_weight else "W"),
                "b_q" if do_quantize_weight else "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, group=group, device_option=gc,
        )
        if do_quantize_weight or do_prepack_weight:
            # When a quantized weight is provided, we can't rescale the output
            # dynamically by looking at the range of each batch's output, so
            # here we provide the output range observed from the fp32 reference
            # implementation.
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)
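# These tests rely on choose_quantization_params to derive an activation scale
# and zero point from an observed range. A minimal standalone sketch (not part
# of the original tests) of the standard asymmetric uint8 formulation; this is
# a generic textbook version and is only assumed to approximate dnnlowp's
# behavior, not to reproduce it exactly.
def _choose_uint8_params_sketch(x_min, x_max):
    # Keep 0.0 inside the range so the real value zero is exactly representable.
    x_min = min(x_min, 0.0)
    x_max = max(x_max, 0.0)
    scale = (x_max - x_min) / 255.0
    if scale == 0.0:
        scale = 1.0  # constant tensor; any nonzero scale works
    zero_point = int(round(-x_min / scale))
    return scale, zero_point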
def test_dnnlowp_conv_relu_int(
    self, stride, pad, kernel, dilation, size, group,
    input_channels_per_group, output_channels_per_group, batch_size, order,
    share_col_buffer, gc, dc,
):
    assume(group == 1 or dilation == 1)

    X, W, b = generate_conv_inputs(
        stride, pad, kernel, dilation, size, group,
        input_channels_per_group, output_channels_per_group, batch_size, order,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op_engine_list = [
        ("Conv", ""),
        ("ConvRelu", "DNNLOWP"),
        ("ConvRelu", "DNNLOWP_16"),
        ("Int8ConvRelu", "DNNLOWP"),
    ]

    for op_type, engine in op_engine_list:
        net = core.Net("test_net")

        if "DNNLOWP" in engine:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

            conv = core.CreateOperator(
                op_type,
                ["X_q", "W", "b"],
                ["Y_q"],
                stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
                engine=engine, shared_buffer=(1 if share_col_buffer else 0),
                group=group, device_option=gc,
            )
            net.Proto().op.extend([conv])

            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])
        else:
            conv = core.CreateOperator(
                op_type,
                ["X", "W", "b"],
                ["Y"],
                stride=stride, kernel=kernel, dilation=dilation, pad=pad, order=order,
                shared_buffer=(1 if share_col_buffer else 0),
                engine=engine, group=group, device_option=gc,
            )
            net.Proto().op.extend([conv])

            relu = core.CreateOperator(
                "Relu", ["Y"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([relu])

        run_conv_or_fc(self, None, net, X, W, b, op_type, engine, order, gc, outputs)

    check_quantized_results_close(outputs)
def test_rowwise_dnnlowp_fully_connected_int(
    self, input_channels, output_channels, batch_size, in_quantized, out_quantized,
    prepack_weight, gc, dc,
):
    # X has scale 1, so it is exactly representable after quantization
    X_min = -77
    X_max = X_min + 255
    X = np.round(
        np.random.rand(batch_size, input_channels) * (X_max - X_min) + X_min
    )
    X = X.astype(np.float32)
    # input channels 0 and 1 are all X_min to avoid overflow from vpmaddubsw
    # when multiplied with W_min and W_max
    X[:, 0:2] = X_min
    if batch_size != 0:
        X[0, 2] = X_max

    # Each row of W has scale 1 but a different offset, so row-wise quantization
    # shouldn't have any input quantization error (see the sketch after this test).
    W = np.zeros((output_channels, input_channels))
    W = W.astype(np.float32)
    for i in range(output_channels):
        W_min = -100 + i
        W_max = W_min + 255
        W[i, :] = np.round(np.random.rand(input_channels) * (W_max - W_min) + W_min)
        W[i, 0] = W_min
        W[i, 1] = W_max

        # Make sure we won't have overflows from the vpmaddubsw instruction used
        # in fbgemm
        avoid_vpmaddubsw_overflow_fc(
            batch_size, input_channels, 1, X, X_min, X_max,
            W[i : i + 1,], W_min, W_max,
        )

        if i % 2 == 0:
            W[i, :] = (W[i, :] - W_min) * 2 + W_min

    b = np.random.randn(output_channels).astype(np.float32)

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])
    outputs = []

    op_engine_list = [
        ("FC", ""),
        ("FC", "DNNLOWP_ROWWISE"),
        ("FC", "DNNLOWP_ROWWISE_16"),
        ("Int8FC", "DNNLOWP_ROWWISE"),
    ]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine and in_quantized
        do_dequantize = "DNNLOWP" in engine and out_quantized
        do_prepack_weight = engine == "DNNLOWP_ROWWISE" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([quantize])

        X_min = 0 if X.size == 0 else X.min()
        X_max = 0 if X.size == 0 else X.max()
        x_q_param = dnnlowp_utils.choose_quantization_params(X_min, X_max)

        if do_prepack_weight:
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8FCPackWeight",
                inputs,
                ["W_packed"],
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        fc = core.CreateOperator(
            op_type,
            [
                "X_q" if do_quantize else "X",
                "W_packed" if do_prepack_weight else "W",
                "b",
            ],
            ["Y_q" if do_dequantize else "Y"],
            dequantize_output=not do_dequantize,
            engine=engine, device_option=gc,
        )
        if do_prepack_weight:
            # When a pre-packed quantized weight is provided, we can't rescale
            # the output dynamically by looking at the range of each batch's
            # output, so here we provide the output range observed from the
            # fp32 reference implementation.
            dnnlowp_utils.add_quantization_param_args(fc, outputs[0][0])
        net.Proto().op.extend([fc])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, None, gc, outputs
        )

    check_quantized_results_close(outputs)
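# A minimal standalone sketch (not part of the original tests) of why the per-row
# construction above is quantization-exact: when a row's (max - min) spans exactly
# 255 integer steps, the row-wise scale is 1 and every integer entry lands exactly
# on a quantization grid point. The quantize/dequantize below is the generic
# asymmetric scheme, not dnnlowp's exact implementation; np is this module's
# numpy import.
def _rowwise_exactness_sketch():
    w_row = np.arange(-100.0, 156.0, dtype=np.float32)  # integers, max - min == 255
    scale = (w_row.max() - w_row.min()) / 255.0  # == 1.0
    zero_point = -w_row.min() / scale
    w_q = np.round(w_row / scale + zero_point)  # quantize to [0, 255]
    w_dq = (w_q - zero_point) * scale  # dequantize
    assert np.array_equal(w_dq, w_row)  # no quantization error for this row
    return scale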
def test_dnnlowp_depthwise_3x3x3_conv(
    self, stride_0, stride_1, stride_2, size, group, batch_size, prepack_weight,
    fuse_relu, share_col_buffer, preserve_activation_sparsity,
    preserve_weight_sparsity, quantize_groupwise, gc, dc,
):
    pad = 1
    kernel = 3
    dilation = 1
    input_channels_per_group = 1
    output_channels_per_group = 1
    order = "NHWC"

    X, W, b = generate_convnd_inputs(
        (stride_0, stride_1, stride_2),
        (pad,) * 3,
        (kernel,) * 3,
        (dilation,) * 3,
        (size,) * 3,
        group,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        order,
        groupwise_quantization=quantize_groupwise,
        preserve_activation_sparsity=preserve_activation_sparsity,
        preserve_weight_sparsity=preserve_weight_sparsity,
    )

    Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    outputs = []

    op = "ConvRelu" if fuse_relu else "Conv"
    op_engine_list = [(op, ""), (op, "DNNLOWP"), ("Int8" + op, "DNNLOWP")]

    for op_type, engine in op_engine_list:
        init_net = core.Net("test_init_net")
        net = core.Net("test_net")

        do_quantize = "DNNLOWP" in engine
        do_dequantize = "DNNLOWP" in engine
        do_prepack_weight = engine == "DNNLOWP" and prepack_weight

        if do_quantize:
            quantize = core.CreateOperator(
                "Quantize", ["X"], ["X_q"],
                preserve_activation_sparsity=preserve_activation_sparsity,
                engine=engine, device_option=gc,
            )
            net.Proto().op.extend([quantize])

        if do_prepack_weight:
            x_q_param = dnnlowp_utils.choose_quantization_params(
                X.min(), X.max(), preserve_activation_sparsity
            )
            inputs = ["W"]
            if do_dequantize:
                inputs += ["b"]
            pack = core.CreateOperator(
                "Int8ConvPackWeight",
                inputs,
                ["W_packed"],
                group=group,
                quantize_groupwise=quantize_groupwise,
                preserve_weight_sparsity=preserve_weight_sparsity,
                in_scale=x_q_param.scale,
                engine=engine,
            )
            init_net.Proto().op.extend([pack])

        conv = core.CreateOperator(
            op_type,
            ["X_q" if do_quantize else "X", "W", "b"],
            ["Y_q" if do_dequantize else "Y"],
            strides=[stride_0, stride_1, stride_2],
            kernels=[kernel] * 3,
            dilations=[dilation] * 3,
            pads=[pad] * (3 * 2),
            order=order,
            shared_buffer=(1 if share_col_buffer else 0),
            preserve_activation_sparsity=preserve_activation_sparsity,
            preserve_weight_sparsity=preserve_weight_sparsity,
            engine=engine, group=group, quantize_groupwise=quantize_groupwise,
            device_option=gc,
        )
        if do_dequantize or do_prepack_weight:
            dnnlowp_utils.add_quantization_param_args(
                conv, outputs[0][0], preserve_activation_sparsity
            )
        net.Proto().op.extend([conv])

        if do_dequantize:
            dequantize = core.CreateOperator(
                "Dequantize", ["Y_q"], ["Y"], engine=engine, device_option=gc
            )
            net.Proto().op.extend([dequantize])

        run_conv_or_fc(
            self, init_net, net, X, W, b, op_type, engine, order, gc, outputs
        )

    check_quantized_results_close(outputs, symmetric=preserve_activation_sparsity)