def test_fake_quant_quant_per_channel_bias(self):
    """QuantConv3d with bias and axis-0 weight quantization matches a manual fake-quant + conv3d."""
    kernel_size = 3
    layer = quant_conv.QuantConv3d(
        _NUM_IN_CHANNELS,
        _NUM_OUT_CHANNELS,
        kernel_size,
        bias=True,
        quant_desc_weight=QuantDescriptor(axis=(0)))

    test_input = torch.randn(8, _NUM_IN_CHANNELS, 8, 8, 8)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    # Reference weight amax: one max-abs value per output channel, reshaped so it
    # broadcasts against the 5-D weight tensor.
    weights = layer.weight.clone()
    abs_per_channel = torch.abs(weights).view(_NUM_OUT_CHANNELS, -1)
    channel_max = torch.max(abs_per_channel, dim=1, keepdim=True)[0]
    weight_amax = channel_max.view(_NUM_OUT_CHANNELS, 1, 1, 1, 1)
    quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

    expected = F.conv3d(quant_input, quant_weight, bias=layer.bias)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_cuda_ext(self):
    """cuda_ext.fake_tensor_quant must agree exactly with tensor_quant.fake_tensor_quant."""
    x_np = np.random.rand(1023).astype('float32')
    x_torch = torch.Tensor(x_np).cuda()

    for num_bits in [3, 4, 5, 7, 8, 11]:
        for unsigned in [True, False]:
            ext_out = cuda_ext.fake_tensor_quant(
                x_torch, torch.max(torch.abs(x_torch)), num_bits, unsigned)
            ref_out = tensor_quant.fake_tensor_quant(
                x_torch, torch.max(torch.abs(x_torch)), num_bits, unsigned)
            test_utils.compare(ext_out, ref_out, rtol=0, atol=0)

    # Test fp16: same comparison, remaining quantization arguments left at their defaults
    x_np_fp16 = np.random.rand(1023).astype('float16')
    x_torch_fp16 = torch.Tensor(x_np_fp16).cuda().half()
    ext_out_fp16 = cuda_ext.fake_tensor_quant(
        x_torch_fp16, torch.max(torch.abs(x_torch_fp16)))
    ref_out_fp16 = tensor_quant.fake_tensor_quant(
        x_torch_fp16, torch.max(torch.abs(x_torch_fp16)))
    test_utils.compare(ext_out_fp16, ref_out_fp16, rtol=0, atol=0)
def test_fake_quant_per_channel_bias(self):
    """QuantConv2d with bias and the per-channel weight descriptor matches manual fake-quant + conv2d."""
    kernel_size = 3
    layer = quant_conv.QuantConv2d(
        _NUM_IN_CHANNELS,
        _NUM_OUT_CHANNELS,
        kernel_size,
        bias=True,
        quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_CONV2D_WEIGHT_PER_CHANNEL)

    test_input = torch.randn(16, _NUM_IN_CHANNELS, 16, 16)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    # One max-abs value per output channel, reshaped to broadcast over the 4-D weight.
    weights = layer.weight.clone()
    abs_per_channel = torch.abs(weights).view(_NUM_OUT_CHANNELS, -1)
    channel_max = torch.max(abs_per_channel, dim=1, keepdim=True)[0]
    weight_amax = channel_max.view(_NUM_OUT_CHANNELS, 1, 1, 1)
    quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

    expected = F.conv2d(quant_input, quant_weight, bias=layer.bias)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_fake_quant_per_channel_bias(self):
    """QuantConvTranspose3d with bias and per-channel weights matches manual fake-quant + conv_transpose3d."""
    kernel_size = 3
    layer = quant_conv.QuantConvTranspose3d(
        _NUM_IN_CHANNELS,
        _NUM_OUT_CHANNELS,
        kernel_size,
        bias=True,
        quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_CONVTRANSPOSE3D_WEIGHT_PER_CHANNEL)

    test_input = torch.randn(2, _NUM_IN_CHANNELS, 2, 2, 2)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    # Reduce over every axis except 1 to get the reference weight amax.
    weights = layer.weight.clone()
    weight_amax = quant_utils.reduce_amax(weights, axis=(0, 2, 3, 4))
    quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

    expected = F.conv_transpose3d(quant_input, quant_weight, bias=layer.bias)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_fake_quant_per_tensor(self):
    """QuantConv1d with a default QuantDescriptor for weights matches per-tensor fake-quant + conv1d."""
    kernel_size = 3
    layer = quant_conv.QuantConv1d(
        _NUM_IN_CHANNELS,
        _NUM_OUT_CHANNELS,
        kernel_size,
        bias=False,
        quant_desc_weight=QuantDescriptor())

    test_input = torch.randn(16, _NUM_IN_CHANNELS, 16)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    # A single amax for the whole weight tensor.
    weights = layer.weight.clone()
    weight_amax = torch.max(torch.abs(weights))
    quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

    expected = F.conv1d(quant_input, quant_weight)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def _compute_amax_mse(self, stride, start_bin):
    """Return the amax that minimizes the MSE of the collected histogram.

    Every ``stride``-th bin center starting at ``start_bin`` is tried as a candidate
    amax: the bin centers are fake-quantized with that amax, and the squared error
    against the unquantized centers, weighted by the bin counts, is averaged. The
    candidate with the smallest error is returned.

    Args:
        stride: Step between candidate bin indices.
        start_bin: Index of the first candidate bin.

    Returns:
        The chosen bin center as a 0-dim float tensor, or None when the calibrator
        has no histogram data.
    """
    # Both arrays are dereferenced below, so bail out if either one is missing
    # (not only when both are).
    if self._calib_bin_edges is None or self._calib_hist is None:
        return None

    counts = torch.from_numpy(self._calib_hist[:]).float()
    edges = torch.from_numpy(self._calib_bin_edges[:]).float()
    centers = (edges[1:] + edges[:-1]) / 2

    mses = []
    arguments = []
    for i in range(start_bin, len(centers), stride):
        amax = centers[i]
        quant_centers = fake_tensor_quant(centers, amax, self._num_bits, self._unsigned)
        mse = ((quant_centers - centers)**2 * counts).mean()
        mses.append(mse)
        arguments.append(i)

    # Lazy formatting: the list is only rendered when debug logging is enabled.
    logging.debug("mses=%s", mses)

    # arguments[k] is the bin index that produced mses[k].
    argmin = np.argmin(mses)
    calib_amax = centers[arguments[argmin]]
    return calib_amax
def test_per_channel_scale(self):
    """fake_tensor_quant performs per channel quantization."""
    x_np = np.random.rand(15, 15, 64, 128).astype('float32')
    x_torch = torch.Tensor(x_np).cuda()

    # Pytorch filter layout seems to be KCRS, reduce max to shape [K, 1, 1, 1] to test per channel scale
    # Shrink max a little, so that clip behavior is tested
    amax_x_np = 0.9 * np.max(np.abs(x_np), axis=(1, 2, 3), keepdims=True)

    # torch.max with a dim argument reduces a single axis and returns a
    # (values, indices) pair, so the reduction is chained one axis at a time.
    # abs() is applied first so the torch amax mirrors the numpy reference above.
    amax_x_torch = torch.abs(x_torch)
    for dim in (1, 2, 3):
        amax_x_torch = torch.max(amax_x_torch, dim=dim, keepdim=True)[0]
    amax_x_torch = 0.9 * amax_x_torch

    quant_x_np = test_utils.quant_np(x_np, amax_x_np, fake=True)
    quant_x_torch = tensor_quant.fake_tensor_quant(x_torch, amax_x_torch)

    # Pytorch numerics is not the same as numpy, results will be off a little,
    # so an approximate comparison is used instead of exact equality.
    np.testing.assert_array_almost_equal(quant_x_torch.cpu().numpy(), quant_x_np, decimal=2)

    if verbose:
        mismatches = np.where(
            np.abs(quant_x_torch.cpu().numpy() - quant_x_np) >= 1e-5)
        print("Mismatches:")
        print(" Original: ", x_np[mismatches])
        print(" numpy: ", quant_x_np[mismatches])
        print(" Pytorch: ", quant_x_torch.cpu().numpy()[mismatches])
def test_overflow_fp16(self):
    """Half-precision input with a tiny amax (1e-4) must not quantize to inf or nan."""
    x_torch = torch.randn(1023).cuda().half()
    tiny_amax = torch.tensor(1e-4).cuda().half()
    quant_x_torch = tensor_quant.fake_tensor_quant(x_torch, tiny_amax, 8, False)
    assert not torch.isinf(quant_x_torch).any() and not torch.isnan(quant_x_torch).any()
def quantize_by_range_fused(x_tuple, num_bits):
    """Quantize multiple torch tensors by combined range to num_bits with symmetric zero-mean quantizer.

    Args:
        x_tuple: Iterable of torch tensors. It is materialized once, so a one-shot
            iterator or generator is handled the same as a tuple or list.
        num_bits: Bit width forwarded to ``tensor_quant.fake_tensor_quant``.

    Returns:
        Tuple of fake-quantized tensors in input order, all quantized with the same
        aggregate amax. An empty input yields an empty tuple.
    """
    # The tensors are traversed twice (amax pass, then quantization pass); materialize
    # them so an iterator is not exhausted by the first pass.
    tensors = tuple(x_tuple)
    if not tensors:
        return ()

    # compute aggregate amax across all tensors
    amax = max(x.abs().max() for x in tensors)

    # quantize each tensor with the aggregate amax
    return tuple(
        tensor_quant.fake_tensor_quant(x, amax, num_bits) for x in tensors)
def test_simple_run(self):
    """A default TensorQuantizer produces the same output as fake_tensor_quant with max-abs amax."""
    x = torch.randn(3, 7).cuda()

    # Functional reference using the tensor-wide max-abs as amax.
    fn_quant_x = tensor_quant.fake_tensor_quant(x, torch.max(torch.abs(x)))

    # Module under test, built with all defaults.
    module_quant_x = tensor_quantizer.TensorQuantizer()(x)

    np.testing.assert_array_equal(fn_quant_x.cpu().numpy(),
                                  module_quant_x.cpu().numpy())
def test_clip_gradient(self):
    """Gradients are non-zero exactly for the elements that lie within [-amax, amax]."""
    x = torch.randn(3, 7, requires_grad=True).cuda()
    # x is the result of .cuda(), so its grad must be retained explicitly.
    x.retain_grad()

    # Halve the max-abs so that some elements fall outside the range.
    amax = x.abs().max() / 2
    inside_lower = -amax <= x
    inside_upper = x <= amax
    x_in_range = inside_lower * inside_upper

    quant_x = tensor_quant.fake_tensor_quant(x, amax, 8)
    loss = torch.sum((quant_x - 0.5)**2)
    loss.backward()

    has_grad = x.grad.cpu().numpy() != 0
    np.testing.assert_array_equal(has_grad, x_in_range.cpu().numpy())
def test_per_tensor_scale(self):
    """fake_tensor_quant matches numpy quantization.

    Both sides use the tensor-wide max-abs as amax; results are compared with
    numpy's approximate equality.
    """
    x_np = np.random.rand(13).astype('float32')
    x_torch = torch.Tensor(x_np).cuda()

    quant_x_np = test_utils.quant_np(x_np, np.max(np.abs(x_np)), fake=True)
    quant_x_torch = tensor_quant.fake_tensor_quant(
        x_torch, torch.max(torch.abs(x_torch)))

    np.testing.assert_array_almost_equal(quant_x_torch.cpu().numpy(), quant_x_np)
def test_unsigned(self):
    """Unsigned 8-bit fake_tensor_quant matches the numpy reference run with num_bits=9."""
    x_np = np.random.rand(1023).astype('float32')
    x_torch = torch.Tensor(x_np).cuda()

    # NOTE(review): the reference uses 9 bits while the torch call uses 8 bits with
    # unsigned=True, presumably because both give the same positive range. Confirm
    # against test_utils.quant_np.
    np_amax = np.max(np.abs(x_np))
    quant_x_np = test_utils.quant_np(x_np, np_amax, num_bits=9, fake=True)

    torch_amax = torch.max(torch.abs(x_torch))
    quant_x_torch = tensor_quant.fake_tensor_quant(x_torch, torch_amax, 8, True)

    np.testing.assert_array_almost_equal(quant_x_torch.cpu().numpy(), quant_x_np)
def test_fake_quant_per_channel(self):
    """QuantConvTranspose1d with axis-1 weight quantization matches manual fake-quant + conv_transpose1d."""
    kernel_size = 3
    layer = quant_conv.QuantConvTranspose1d(
        _NUM_IN_CHANNELS,
        _NUM_OUT_CHANNELS,
        kernel_size,
        bias=False,
        quant_desc_weight=QuantDescriptor(axis=(1)))

    test_input = torch.randn(16, _NUM_IN_CHANNELS, 16)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    # Reduce over every axis except 1 to get the reference weight amax.
    weights = layer.weight.clone()
    weight_amax = quant_utils.reduce_amax(weights, axis=(0, 2))
    quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

    expected = F.conv_transpose1d(quant_input, quant_weight)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_fake_quant_per_tensor(self):
    """Quantize input and weight per tensor; QuantLinear must match manual fake-quant + linear."""
    size_in = 255
    size_out = 257
    layer = quant_linear.QuantLinear(
        size_in,
        size_out,
        bias=False,
        quant_desc_weight=tensor_quant.QuantDescriptor())

    test_input = torch.randn(32, size_in)
    weights = layer.weight.clone()

    # Both operands use a single tensor-wide max-abs as amax.
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)
    weight_amax = torch.max(torch.abs(weights))
    quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

    expected = F.linear(quant_input, quant_weight)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_backward(self):
    """fake_tensor_quant passes the output gradient through unchanged to its input (straight through estimator)."""
    x = torch.randn(3, 7, requires_grad=True).cuda()
    labels = torch.randint(6, (3, )).type(torch.LongTensor).cuda()

    amax = torch.max(torch.abs(x))
    quant_x = tensor_quant.fake_tensor_quant(x, amax, 7)

    # Neither tensor is a leaf, so ask autograd to keep their gradients.
    x.retain_grad()
    quant_x.retain_grad()

    criterion = torch.nn.CrossEntropyLoss().cuda()
    criterion(quant_x, labels).backward()

    np.testing.assert_array_equal(quant_x.grad.cpu().numpy(),
                                  x.grad.cpu().numpy())
def test_input_fake_quant(self):
    """QuantAdaptiveAvgPool3d matches adaptive_avg_pool3d applied to a fake-quantized input."""
    pool = quant_pooling.QuantAdaptiveAvgPool3d(output_size=3)

    test_input = torch.randn(5, 5, 5, 5, dtype=torch.double)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    expected = F.adaptive_avg_pool3d(quant_input, 3)
    actual = pool(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_input_fake_quant(self):
    """QuantMaxPool2d (kernel 3, stride 1) matches max_pool2d applied to a fake-quantized input."""
    pool = quant_pooling.QuantMaxPool2d(kernel_size=3, stride=1)

    test_input = torch.randn(1, 5, 5, 5, dtype=torch.double)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    expected = F.max_pool2d(quant_input, 3, 1, 0, 1, False, False)
    actual = pool(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_cuda_ext_with_axis(self):
    """cuda_ext.fake_tensor_quant_with_axis matches fake_tensor_quant with a broadcast amax."""
    x_np = np.random.rand(3, 4, 5, 6).astype('float32')
    x_torch = torch.Tensor(x_np).cuda()

    # amax along axis 1
    amax_torch = torch.tensor([0.8, 0.9, 0.7, 0.6], device="cuda")
    # Same values reshaped so they broadcast along axis 1 in the reference call.
    amax_broadcast = amax_torch.view(1, -1, 1, 1)

    for num_bits in [3, 4, 5, 7, 8, 11]:
        for unsigned in [True, False]:
            cuda_ext_out = cuda_ext.fake_tensor_quant_with_axis(
                x_torch, amax_torch, 1, num_bits, unsigned)
            pytorch_out = tensor_quant.fake_tensor_quant(
                x_torch, amax_broadcast, num_bits, unsigned)
            test_utils.compare(cuda_ext_out, pytorch_out, rtol=0, atol=0)
def test_fake_quant_per_channel(self):
    """Input quantized per tensor, weight per row; QuantLinear must match manual fake-quant + linear."""
    size_in = 255
    size_out = 257
    layer = quant_linear.QuantLinear(
        size_in,
        size_out,
        bias=False,
        quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW)

    test_input = torch.randn(32, size_in)
    weights = layer.weight.clone()

    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    # Row-wise max-abs, kept 2-D so it broadcasts against the weight matrix.
    row_amax = torch.max(torch.abs(weights), dim=1, keepdim=True)[0]
    quant_weight = tensor_quant.fake_tensor_quant(weights, row_amax)

    expected = F.linear(quant_input, quant_weight)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_full_range(self):
    """fake_tensor_quant with the fifth argument False matches quant_np with narrow_range=False."""
    x_np = np.random.rand(1023).astype('float32')
    x_torch = torch.Tensor(x_np).cuda()

    # Numpy reference: 9 bits, narrow_range disabled.
    amax = np.max(np.abs(x_np))
    quant_x_np = test_utils.quant_np(x_np,
                                     amax,
                                     num_bits=9,
                                     fake=True,
                                     narrow_range=False)

    # Torch side: 8 bits, unsigned=True, and False for the fifth positional argument.
    torch_amax = torch.max(torch.abs(x_torch))
    quant_x_torch = tensor_quant.fake_tensor_quant(x_torch, torch_amax, 8, True, False)

    np.testing.assert_array_almost_equal(quant_x_torch.cpu().numpy(), quant_x_np)
def test_fake_quant_input(self):
    """With the weight quantizer disabled, QuantConv1d equals conv1d on a fake-quantized input."""
    kernel_size = 3
    layer = quant_conv.QuantConv1d(
        _NUM_IN_CHANNELS, _NUM_OUT_CHANNELS, kernel_size, bias=False)
    # Only the input path should be quantized in this test.
    layer.weight_quantizer.disable()

    test_input = torch.randn(20, _NUM_IN_CHANNELS, 50)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    expected = F.conv1d(quant_input, layer.weight)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_test_input_fake_per_tensor(self):
    """With the weight quantizer disabled, QuantLinear equals linear on a fake-quantized input."""
    size_in = 255
    size_out = 257
    layer = quant_linear.QuantLinear(size_in, size_out, bias=False)
    # Only the input path should be quantized in this test.
    layer.weight_quantizer.disable()

    test_input = torch.randn(32, size_in)
    weights = layer.weight.clone()

    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    expected = F.linear(quant_input, weights)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def test_fake_quant_per_tensor(self):
    """QuantInstanceNorm1d (affine) equals F.instance_norm applied to a fake-quantized input."""
    layer = quant_instancenorm.QuantInstanceNorm1d(
        NUM_CHANNELS, affine=True, quant_desc_input=QuantDescriptor())

    test_input = torch.randn(8, NUM_CHANNELS, 128)
    input_amax = torch.max(torch.abs(test_input))
    quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax)

    # Module output first, then the functional reference fed with the module's own
    # statistics and affine parameters.
    actual = layer(test_input)
    expected = F.instance_norm(quant_input,
                               layer.running_mean,
                               layer.running_var,
                               layer.weight,
                               layer.bias)

    np.testing.assert_array_equal(actual.detach().cpu().numpy(),
                                  expected.detach().cpu().numpy())
def test_input_variable_bits(self):
    """QuantMaxPool2d honours the bit width set through the class-level default input descriptor."""
    # Repeat checking the output for variable number of bits to QuantDescriptor
    # NOTE(review): the class-level default is overwritten and not restored here;
    # confirm that other tests do not depend on it.
    for bits in [2, 4, 6]:
        descriptor = tensor_quant.QuantDescriptor(num_bits=bits)
        quant_pooling.QuantMaxPool2d.set_default_quant_desc_input(descriptor)
        pool = quant_pooling.QuantMaxPool2d(kernel_size=3, stride=1)

        test_input = torch.randn(1, 5, 5, 5, dtype=torch.double)
        input_amax = torch.max(torch.abs(test_input))
        quant_input = tensor_quant.fake_tensor_quant(test_input, input_amax, bits)

        expected = F.max_pool2d(quant_input, 3, 1, 0, 1, False, False)
        actual = pool(test_input)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
def test_weight_fake_per_tensor(self):
    """With the input quantizer disabled, QuantLinear equals linear with a per-tensor fake-quantized weight."""
    with torch.cuda.device(0):
        size = 256
        layer = quant_linear.QuantLinear(
            size,
            size,
            bias=False,
            quant_desc_weight=tensor_quant.QuantDescriptor(axis=None))
        # Only the weight path should be quantized in this test.
        layer.input_quantizer.disable()

        test_input = torch.randn(size, size)

        weights = layer.weight.clone()
        weight_amax = torch.max(torch.abs(weights))
        quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

        expected = F.linear(test_input, quant_weight)
        actual = layer(test_input)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
def test_weight_fake_quant_per_channel(self):
    """With the input quantizer disabled, QuantConvTranspose2d equals conv_transpose2d with per-channel fake-quantized weights."""
    kernel_size = 3
    layer = quant_conv.QuantConvTranspose2d(
        _NUM_IN_CHANNELS,
        _NUM_OUT_CHANNELS,
        kernel_size,
        bias=False,
        quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_CONVTRANSPOSE2D_WEIGHT_PER_CHANNEL)
    # Only the weight path should be quantized in this test.
    layer.input_quantizer.disable()

    test_input = torch.randn(16, _NUM_IN_CHANNELS, 256, 256)

    # Reduce over every axis except 1 to get the reference weight amax.
    weights = layer.weight.clone()
    weight_amax = quant_utils.reduce_amax(weights, axis=(0, 2, 3))
    quant_weight = tensor_quant.fake_tensor_quant(weights, weight_amax)

    expected = F.conv_transpose2d(test_input, quant_weight)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())
def _quant_forward(self, inputs):
    """Quantized forward pass.

    Picks amax (from the learned clip bounds or from ``_get_amax``) and then applies
    either fake quantization or real quantization to ``inputs``. In the real
    quantization case the returned scale is stored on ``self._scale``.
    """
    if not self._learn_amax:
        amax = self._get_amax(inputs)
    else:
        # Clip first, then derive amax from the larger of the two clip bounds.
        inputs = self.clip(inputs)
        amax = torch.max(-self.clip.clip_value_min, self.clip.clip_value_max).detach()

    if not self._fake_quant:
        # NOTE(review): unlike the fake-quant call below, self._narrow_range is not
        # forwarded here; confirm whether that is intentional.
        outputs, self._scale = tensor_quant(inputs, amax, self._num_bits, self._unsigned)
        return outputs

    if TensorQuantizer.use_fb_fake_quant:
        return self._fb_fake_quant(inputs, amax)

    return fake_tensor_quant(inputs, amax, self._num_bits, self._unsigned, self._narrow_range)
def copy_state_and_quantize_fused(dst, src, num_bits):
    """Load src's state into dst, fake-quantizing every 'weight' entry with one shared amax.

    Entries whose key contains 'weight' are quantized to ``num_bits`` using the largest
    absolute value found across all such entries; every other entry is cloned as is.
    """
    source_state = src.state_dict()

    # Pass 1: aggregate amax over all weight tensors.
    amax = 0
    for name in source_state:
        if 'weight' not in name:
            continue
        amax = max(amax, source_state[name].abs().max())

    # Pass 2: build the destination state, quantizing weights with the aggregate amax.
    quantized_state = dict()
    for name, tensor in source_state.items():
        if 'weight' in name:
            quantized_state[name] = tensor_quant.fake_tensor_quant(tensor, amax, num_bits)
        else:
            quantized_state[name] = tensor.clone()

    dst.load_state_dict(quantized_state)
def test_weight_fake_per_channel(self):
    """With the input quantizer disabled, QuantLinear equals linear with a per-row fake-quantized weight."""
    size_in = 255
    size_out = 257
    layer = quant_linear.QuantLinear(
        size_in,
        size_out,
        bias=False,
        quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW)
    # Only the weight path should be quantized in this test.
    layer.input_quantizer.disable()

    test_input = torch.randn(32, size_in)

    # Reduce along axis 1, keeping dims so the amax broadcasts over the weight matrix.
    weights = layer.weight.clone()
    row_amax = quant_utils.reduce_amax(weights, axis=1, keepdims=True)
    quant_weight = tensor_quant.fake_tensor_quant(weights, row_amax)

    expected = F.linear(test_input, quant_weight)
    actual = layer(test_input)
    np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                  actual.detach().cpu().numpy())