def test_forward_compression(self):
    dtype = torch.float
    if torch.cuda.is_available():
        print("cuda is available")
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    N, C, H, W = 128, 16, 32, 32
    K, HH, WW = 16, 3, 3
    x = torch.randn(N, C, H, W, dtype=dtype, device=device)
    y = torch.randn(K, C, HH, WW, dtype=dtype, device=device)

    start = time.time()
    torch.nn.functional.conv2d(input=x, weight=y, stride=1)
    convStandardTime = time.time() - start
    print("convStandard time: ", convStandardTime)

    conv = Conv2dfftFunction()
    start = time.time()
    conv.forward(ctx=None, input=x, filter=y, stride=1,
                 args=Arguments(stride_type=StrideType.STANDARD,
                                preserved_energy=100))
    convFFTtime = time.time() - start
    print("convFFT time: ", convFFTtime)

    speedup = convFFTtime / convStandardTime
    print(f"Pytorch speedup is: {speedup} X")
def __init__(self, in_channels=None, out_channels=None, kernel_size=None,
             stride=1, padding=0, dilation=None, groups=None, bias=False,
             weight_value=None, bias_value=None, is_manual=tensor([0]),
             args=Arguments(), out_size=None):
    super(Conv2dfft, self).__init__(
        in_channels=in_channels, out_channels=out_channels,
        kernel_size=kernel_size, stride=stride, padding=padding,
        dilation=dilation, groups=groups, bias=bias,
        weight_value=weight_value, bias_value=bias_value,
        is_manual=is_manual, args=args, out_size=out_size)
def test_forward_correctness(self):
    """
    exec: CUDA
    convStandard time:  0.004092693328857422
    convFFT time:  0.19140386581420898
    Pytorch speedup is: 46.76721426074799 X

    convStandard time:  0.0039272308349609375
    convFFT time:  0.03870105743408203
    Pytorch speedup is: 9.854541039339486 X

    exec: SGEMM
    convStandard time:  0.004120588302612305
    convFFT time:  0.029807090759277344
    Pytorch speedup is: 7.233697853381936 X
    """
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    N, C, H, W = 128, 16, 32, 32
    K, HH, WW = 16, 3, 3
    x = torch.randn(N, C, H, W, dtype=dtype, device=device)
    y = torch.randn(K, C, HH, WW, dtype=dtype, device=device)
    repetitions = 1

    start = time.time()
    for repeat in range(repetitions):
        convStandard = torch.nn.functional.conv2d(input=x, weight=y,
                                                  stride=1)
    convStandardTime = time.time() - start
    print("convStandard time: ", convStandardTime)

    conv = Conv2dfftFunction()
    start = time.time()
    for repeat in range(repetitions):
        convFFT = conv.forward(
            ctx=None, input=x, filter=y, stride=1,
            args=Arguments(stride_type=StrideType.STANDARD,
                           conv_exec_type=self.conv_exec_type))
    convFFTtime = time.time() - start
    print("convFFT time: ", convFFTtime)

    speedup = convFFTtime / convStandardTime
    print(f"Pytorch speedup is: {speedup} X")

    np.testing.assert_array_almost_equal(
        x=convStandard.cpu().detach().numpy(),
        y=convFFT.cpu().detach().numpy(), decimal=3,
        err_msg="The expected array x and computed y are not almost equal.")
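# A minimal, self-contained sketch (not part of the library) of the FFT
# cross-correlation identity that the correctness test above exercises:
# zero-pad input and filter to (H + HH - 1, W + WW - 1), multiply the input
# spectrum by the conjugated filter spectrum, invert, and crop to the
# "valid" output. The helper name fft_conv2d_valid is ours, for
# illustration only.
import torch
import torch.nn.functional as F


def fft_conv2d_valid(x, w):
    """Stride-1, no-padding conv2d (cross-correlation) via rfft2."""
    N, C, H, W = x.shape
    K, _, HH, WW = w.shape
    # Pad to the full linear-correlation size to avoid circular wrap-around.
    fft_h, fft_w = H + HH - 1, W + WW - 1
    X = torch.fft.rfft2(x, s=(fft_h, fft_w))  # (N, C, fft_h, fft_w//2+1)
    Y = torch.fft.rfft2(w, s=(fft_h, fft_w))  # (K, C, fft_h, fft_w//2+1)
    # Conjugation turns convolution into correlation; sum over channels C.
    S = (X.unsqueeze(1) * Y.conj().unsqueeze(0)).sum(dim=2)  # (N, K, ., .)
    out = torch.fft.irfft2(S, s=(fft_h, fft_w))
    return out[..., :H - HH + 1, :W - WW + 1]


x = torch.randn(2, 3, 8, 8)
w = torch.randn(4, 3, 3, 3)
assert torch.allclose(fft_conv2d_valid(x, w), F.conv2d(x, w), atol=1e-4)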
def test_AutogradForwardWithCompression(self):
    # A single input 2D map.
    x = tensor([[[[1.0, 2.0, 3.0], [3.0, 4.0, 1.0], [1., 2., 1.]]]])
    # A single filter.
    y = tensor([[[[1.0, 2.0], [3.0, 2.0]]]])
    b = tensor([0.0])
    conv = Conv2dfftAutograd(weight_value=y, bias=b,
                             args=Arguments(index_back=1,
                                            next_power2=False,
                                            preserve_energy=100))
    result = conv.forward(input=x)
    # The exact (uncompressed) conv2d result is [[22., 22.], [18., 14.]];
    # discarding one frequency coefficient (index_back=1) perturbs it only
    # slightly.
    # expect = np.array([[[[21.5, 22.0], [17.5, 13.]]]])
    expect = np.array([[[[21.75, 21.75], [18.75, 13.75]]]])
    np.testing.assert_array_almost_equal(
        x=expect, y=result.detach().numpy(),
        err_msg="The expected array x and computed y are not almost equal")
def test_FunctionForwardCompression(self):
    # A single 2D input map.
    x = tensor([[[[1.0, 2.0, 3.0], [3.0, 4.0, 1.0], [1., 2., 1.]]]],
               device=self.device, dtype=self.dtype)
    # A single filter.
    y = tensor([[[[1.0, 2.0], [3.0, 2.0]]]], device=self.device,
               dtype=self.dtype)
    b = tensor([0.0], device=self.device, dtype=self.dtype)
    conv = Conv2dfftFunction()
    result = conv.forward(ctx=None, input=x, filter=y, bias=b,
                          args=Arguments(index_back=1,
                                         preserve_energy=100))
    # The exact (uncompressed) conv2d result is [[22., 22.], [18., 14.]];
    # see the autograd variant of this test above.
    # expect = np.array([[[[21.5, 22.0], [17.5, 13.]]]])
    expect = np.array([[[[21.75, 21.75], [18.75, 13.75]]]])
    np.testing.assert_array_almost_equal(
        x=expect, y=get_numpy(result),
        err_msg="The expected array x and computed y are not almost equal.")
def test_FunctionForwardCompressionConvFFTIndexBackCifar10LeNet1stLayer(
        self):
    start = time.time()
    x = cifar10_image
    print("shape of the input image: ", x.size())
    y = cifar10_lenet_filter
    print("shape of the filter: ", y.size())
    b = torch.tensor([0.0])
    # Get the expected result from the standard PyTorch convolution.
    expected_result_tensor = F.conv2d(input=x, weight=y, bias=b)
    N, C, H, W = x.size()
    K, C, HH, WW = y.size()
    out_size = H - HH + 1
    fft_size = H + out_size - 1
    half_fft_size = fft_size // 2 + 1
    fft_numel = half_fft_size * fft_size * C
    # for compress_rate in range(1, fft_numel, 10):
    for index_back in range(1, 2):
        print("index back: ", index_back)
        conv = Conv2dfft(weight_value=y, bias_value=b,
                         args=Arguments(
                             index_back=index_back,
                             preserve_energy=100,
                             is_debug=True,
                             next_power2=False,
                             compress_type=CompressType.STANDARD))
        result = conv.forward(input=x)
        # print("actual result: ", result)
        result = result.float()
        abs_error = torch.sum(
            torch.abs(result - expected_result_tensor)).item()
        print("abs error: ", abs_error)
        expected_total = torch.sum(
            torch.abs(expected_result_tensor) + torch.abs(result))
        relative_error = 100.0 * abs_error / expected_total
        print("relative error: ", relative_error)
        # relative_error = torch.mean(
        #     torch.abs(result) / torch.abs(expected_result_tensor) * 100)
        print(f"absolute divergence for index back,{index_back},"
              f"absolute error,{abs_error},"
              f"relative error (%),{relative_error}")
    print("elapsed: ", time.time() - start)
def test_FunctionForwardCompressionConvFFTPreserveEnergyCifar10LeNet1stLayer(
        self):
    print("\n")
    x = cifar10_image
    print("shape of the input image: ", x.size())
    y = cifar10_lenet_filter
    print("shape of the filter: ", y.size())
    b = torch.tensor([0.0])
    # Get the expected result from the standard PyTorch convolution.
    # print("expected_result_numpy: ", expected_result_numpy)
    preserved_energies = [100., 99., 98.5, 98., 97., 96., 95., 94., 93.,
                          92., 91., 90., 89., 87., 85., 80., 70., 60.,
                          50., 40., 10., 5., 1.]
    # preserved_energies = [1.0]
    # compress_rates = [1, 2, 4, 8, 16, 32, 64, 128, 256]
    expected_result_tensor = F.conv2d(input=x, weight=y, bias=b)

    for preserve_energy in preserved_energies:
        conv = Conv2dfft(weight_value=y, bias_value=b,
                         args=Arguments(
                             preserve_energy=preserve_energy,
                             index_back=0,
                             is_debug=True,
                             next_power2=True,
                             compress_type=CompressType.STANDARD))
        result = conv.forward(input=x)
        # print("actual result: ", result)
        result = result.float()
        abs_error = torch.sum(
            torch.abs(result - expected_result_tensor)).item()
        expected_total = torch.sum(torch.abs(expected_result_tensor))
        relative_error = abs_error / expected_total * 100.0
        # relative_error = torch.mean(
        #     torch.abs(result) / torch.abs(expected_result_tensor) * 100)
        print(f"absolute divergence for preserved energy,{preserve_energy}"
              f",absolute error,{abs_error},"
              f"relative error (%),{relative_error}")
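# The preserve_energy sweep above asks the layer to keep only as many FFT
# coefficients as are needed to retain a given fraction of the spectrum's
# squared magnitude. A hedged sketch of that criterion follows; the
# library's actual CompressType.STANDARD implementation lives in
# conv2D_fft and may differ in details such as per-channel treatment.
import torch


def coeffs_needed_for_energy(spectrum, preserve_energy):
    """How many largest-magnitude coefficients retain the given % energy."""
    energies = (spectrum.abs() ** 2).flatten().sort(descending=True).values
    cumulative = torch.cumsum(energies, dim=0)
    threshold = preserve_energy / 100.0 * energies.sum()
    # Index of the first prefix whose cumulative energy reaches the target.
    needed = int(torch.searchsorted(cumulative, threshold).item()) + 1
    return min(needed, energies.numel())


spectrum = torch.fft.rfft2(torch.randn(16, 16))
print(coeffs_needed_for_energy(spectrum, preserve_energy=90.0))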
    return model


class Args(object):
    pass


if __name__ == "__main__":
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    args = Arguments()
    args.in_channels = 3
    # args.conv_type = "FFT2D"
    args.conv_type = ConvType.STANDARD2D
    args.compress_rate = None
    args.preserve_energy = None
    args.is_debug = False
    args.next_power2 = True
    args.compress_type = CompressType.STANDARD
    args.tensor_type = TensorType.FLOAT32
    args.num_classes = 10
    args.min_batch_size = 16
    args.test_batch_size = 16
    batch_size = 16
    inputs = torch.randn(batch_size,
def test_FunctionBackwardWithPooling(self):
    x = np.array([[[[1., 2., 3., 4., 5.],
                    [6., 7., 8., 1., 2.],
                    [2., 3., 1., 0., 1.],
                    [1., 2., 3., -1., -2.],
                    [0., 1., 3., 1., 2.]]]])
    y = np.array([[[[2., 1.], [-1.0, 2.0]]]])
    b = np.array([0.0])

    # Full result.
    conv_param = {'pad': 0, 'stride': 1}
    full_expected_result, cache = conv_forward_naive(x=x, w=y, b=b,
                                                     conv_param=conv_param)
    print()
    print("full expected result: ", full_expected_result)

    # get the expected results from numpy correlate
    # expected_result = np.array([[[[10.103396, 12.630585, 11.697527],
    #                               [12.558281, 13.923859, 11.561422],
    #                               [11.473415, 11.409614, 8.187342]]]])
    # expected_result = np.array([[[[11.2787, 14.2694, 12.6907],
    #                               [14.0552, 15.6585, 12.3298],
    #                               [12.0275, 11.8809, 7.7573]]]])
    expected_result = np.array([[[[12.2992, 13.6678, 10.92],
                                  [15.9293, 16.679, 11.7282],
                                  [13.3441, 13.755, 8.7778]]]])
    conv = Conv2dfftFunction()

    x_torch = torch.tensor(data=x, requires_grad=True)
    y_torch = torch.tensor(data=y, requires_grad=True)
    b_torch = torch.tensor(data=b, requires_grad=True)

    out_size = 3
    result_torch = conv.forward(ctx=None, input=x_torch, filter=y_torch,
                                bias=b_torch, out_size=out_size,
                                args=Arguments())
    result = result_torch.detach().numpy()
    np.testing.assert_array_almost_equal(
        x=np.array(expected_result), y=result, decimal=4,
        err_msg="Manual: Expected x is different from computed y.")

    # Prevent any interference/overlap of variables in manual and auto
    # differentiation.
    x_torch_auto = torch.tensor(data=x, requires_grad=True)
    y_torch_auto = torch.tensor(data=y, requires_grad=True)
    b_torch_auto = torch.tensor(data=b, requires_grad=True)
    convAuto = Conv2dfftAutograd(weight_value=y_torch_auto,
                                 bias=b_torch_auto, out_size=out_size)
    resultAuto = convAuto.forward(input=x_torch_auto)
    np.testing.assert_array_almost_equal(
        x=np.array(expected_result), y=resultAuto.cpu().detach().numpy(),
        decimal=4,
        err_msg="Auto: Expected x is different from computed y.")

    dout_np = np.array([[[[0.1, -0.2, 0.3],
                          [-0.1, 0.1, 0.2],
                          [-0.2, 1.1, -1.2]]]])
    dout = tensor(dout_np)

    resultAuto.backward(dout)
    print("x auto grad: ", x_torch_auto.grad)
    print("y auto grad: ", y_torch_auto.grad)
    print("b auto grad: ", b_torch_auto.grad)

    # Get the expected result from the backward pass.
    expected_dx, expected_dw, expected_db = \
        conv_backward_naive(dout.numpy(), cache)

    result_torch.backward(dout)

    # approximate_expected_dx = np.array(
    #     [[[[0.0306, 0.1016, 0.1293, 0.0976, 0.0249],
    #        [0.0815, 0.1438, 0.1321, 0.0534, -0.0463],
    #        [0.1171, 0.1399, 0.0813, -0.0245, -0.1154],
    #        [0.1164, 0.0923, 0.0066, -0.0904, -0.1420],
    #        [0.0799, 0.0287, -0.0482, -0.1058, -0.1104]]]])
    # approximate_expected_dx = np.array(
    #     [[[[0.0004, 0.1056, 0.1608, 0.1246, 0.0241],
    #        [0.0604, 0.1825, 0.1858, 0.0676, -0.0829],
    #        [0.1250, 0.1951, 0.1164, -0.0518, -0.1829],
    #        [0.1456, 0.1338, 0.0051, -0.1437, -0.2005],
    #        [0.1066, 0.0448, -0.0645, -0.1389, -0.1225]]]])
    approximate_expected_dx = np.array(
        [[[[-0.0148, 0.0503, 0.1306, 0.1655, 0.1288],
           [0.1054, 0.1526, 0.1158, 0.0227, -0.0567],
           [0.1963, 0.2130, 0.0595, -0.1488, -0.2549],
           [0.1895, 0.1861, 0.0040, -0.2197, -0.3165],
           [0.0901, 0.0920, -0.0089, -0.1367, -0.1952]]]])
    print("manual torch grad: ", x_torch.grad)

    # Are the gradients correct?
    np.testing.assert_array_almost_equal(
        x=approximate_expected_dx, y=x_torch.grad, decimal=4,
        err_msg="Expected x is different from computed y.")
    self._check_delta2D(actual_result=x_torch.grad,
                        accurate_expected_result=expected_dx, delta=5.4)

    print("Expected fully correct dw: ", expected_dw)
    print("actual result for dw from y_torch.grad: ", y_torch.grad)
    # approximate_expected_dw = np.array([[[[0.844089, 1.41447],
    #                                       [1.221608, 1.32085]]]])
    # approximate_expected_dw = np.array([[[[1.1816, 1.8317],
    #                                       [1.5589, 1.4568]]]])
    approximate_expected_dw = np.array([[[[1.2042, 2.0410],
                                          [1.6021, 1.6371]]]])
    np.testing.assert_array_almost_equal(
        x=approximate_expected_dw, y=y_torch.grad, decimal=4,
        err_msg="Expected x is different from computed y.")
    self._check_delta2D(actual_result=y_torch.grad,
                        accurate_expected_result=expected_dw, delta=4.0)

    np.testing.assert_array_almost_equal(b_torch.grad, expected_db)
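# The hand-coded expected gradients above come from conv_backward_naive.
# An alternative, hedged sanity check is torch.autograd.gradcheck, which
# compares analytic gradients against finite differences in double
# precision. Shown here on the built-in conv2d for illustration;
# Conv2dfftFunction could be substituted where it supports float64 inputs.
import torch

x = torch.randn(1, 1, 5, 5, dtype=torch.double, requires_grad=True)
w = torch.randn(1, 1, 2, 2, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(
    lambda a, b: torch.nn.functional.conv2d(a, b), (x, w),
    eps=1e-6, atol=1e-4)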
'''VGG11/13/16/19 in Pytorch.'''
import torch.nn as nn

from cnns.nnlib.pytorch_layers.conv_picker import Conv
from cnns.nnlib.utils.arguments import Arguments
from cnns.nnlib.utils.general_utils import ConvType
from cnns.nnlib.utils.general_utils import CompressType
import torch

args = Arguments()
args.conv_type = ConvType.FFT2D
args.compress_rate = 80.0
args.dtype = torch.float
args.preserve_energy = None
args.next_power2 = True
args.is_debug = False
args.compress_type = CompressType.STANDARD


def conv3x3(in_planes, out_planes, compress_rate=args.compress_rate,
            stride=1, padding=1, args=args):
    """3x3 convolution with padding"""
    # return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
    #                  padding=1, bias=False)
    args.compress_rate = compress_rate
    return Conv(kernel_sizes=[3], in_channels=in_planes,
                out_channels=[out_planes], strides=[stride],
                padding=[padding], args=args, is_bias=False).get_conv()


cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512,
              'M'],
def __init__(self, in_channels=None, out_channels=None, kernel_size=None,
             stride=1, padding=0, dilation=None, groups=None, bias=False,
             weight_value=None, bias_value=None, is_manual=tensor([0]),
             args=Arguments(), out_size=None):
    """
    2D convolution using DCT.

    :param in_channels: (int) number of channels in the input series.
    :param out_channels: (int) number of channels produced by the
        convolution (equal to the number of filters in the given conv
        layer).
    :param kernel_size: (int) size of the convolving kernel (the width
        and height of the filter).
    :param stride: the stride for the convolution (the pattern for
        omitted values).
    :param padding: the padding added to the (top and bottom) and to the
        (left and right) of the input signal.
    :param dilation: (int) spacing between kernel elements. Default: 1.
    :param groups: (int) number of blocked connections from input
        channels to output channels. Default: 1.
    :param bias: (bool) add bias or not.
    :param compress_rate: (taken from args) how many frequency
        coefficients should be discarded.
    :param preserve_energy: (taken from args) how much energy should be
        preserved in the input image.
    :param out_size: the expected output size of the operation. When
        compression is used and out_size is smaller than the size of the
        input to the convolution, then max pooling can be omitted and the
        compression in this layer can serve as frequency-based (spectral)
        pooling.
    :param weight_value: the initial filter, i.e. filter weights of shape
        (F, C, HH, WW), where F - number of filters, C - number of
        channels, HH - height of the filter, WW - width of the filter.
    :param bias_value: the initial value of the bias, of shape (F,).
    :param use_next_power2: should we extend the size of the input for
        the FFT convolution to the next power of 2.
    :param is_manual: to check if the backward computation of convolution
        was computed manually.

    Regarding the stride parameter (the number of pixels between adjacent
    receptive fields in the horizontal and vertical directions): we can
    generate the full output and then remove the redundant elements
    according to the stride parameter. The more relevant method is to
    apply spectral pooling as a means of achieving the strided
    convolution.
    """
    super(ConvDCT, self).__init__()
    self.args = args

    if dilation is not None and dilation > 1:
        raise NotImplementedError("dilation > 1 is not supported.")
    if groups is not None and groups > 1:
        raise NotImplementedError("groups > 1 is not supported.")

    self.is_weight_value = None  # Was the filter value provided?
    if weight_value is None:
        self.is_weight_value = False
        if out_channels is None or in_channels is None or \
                kernel_size is None:
            raise ValueError(
                "Either specify weight_value or provide all the "
                "required parameters (out_channels, in_channels and "
                "kernel_size) to generate the filter.")
        self.kernel_height, self.kernel_width = get_pair(kernel_size)
        if args.dtype is torch.float:
            weight = torch.randn(out_channels, in_channels,
                                 self.kernel_height, self.kernel_width,
                                 dtype=args.dtype)
        elif args.dtype is torch.half:
            weight = torch.randn(out_channels, in_channels,
                                 self.kernel_height,
                                 self.kernel_width).to(torch.half)
        else:
            raise Exception(f"Unknown dtype in args: {args.dtype}")
        self.weight = Parameter(weight)
    else:
        self.is_weight_value = True
        self.weight = weight_value
        out_channels = weight_value.shape[0]
        in_channels = weight_value.shape[1]
        self.kernel_height = weight_value.shape[2]
        self.kernel_width = weight_value.shape[3]

    self.is_bias_value = None  # Was the bias value provided?
    if bias_value is None:
        self.is_bias_value = False
        if bias is True:
            self.bias = Parameter(
                torch.randn(out_channels, dtype=args.dtype))
        else:
            self.register_parameter('bias', None)
            self.bias = None
    else:
        self.is_bias_value = True
        self.bias = bias_value

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding = padding
    self.pad_H, self.pad_W = get_pair(value=padding, val_1_default=0,
                                      val2_default=0, name="padding")
    if self.pad_H != self.pad_W:
        raise Exception(
            "We only support symmetric padding in the frequency domain.")

    self.stride_type = args.stride_type
    self.stride_H, self.stride_W = get_pair(value=stride,
                                            val_1_default=None,
                                            val2_default=None,
                                            name="stride")
    if self.stride_H != self.stride_W:
        raise Exception(
            "We only support symmetric striding in the frequency domain.")

    self.is_manual = is_manual
    self.conv_index = ConvDCT.conv_index_counter
    ConvDCT.conv_index_counter += 1
    self.out_size = out_size
    self.out_size_H, self.out_size_W = get_pair(value=out_size,
                                                val_1_default=None,
                                                val2_default=None,
                                                name="out_size")
    if self.out_size_H != self.out_size_W:
        raise Exception(
            "We only support symmetric output sizes in the frequency "
            "domain.")

    if args is None:
        self.compress_rate = None
        self.preserve_energy = None
        self.is_debug = False
        self.next_power2 = False
        self.compress_type = CompressType.STANDARD
    else:
        self.compress_rate = args.compress_rate
        self.preserve_energy = args.preserve_energy
        self.next_power2 = args.next_power2
        self.is_debug = args.is_debug
        self.compress_type = args.compress_type

    self.reset_parameters()
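# The docstring above points to spectral pooling as the preferred way to
# realize out_size and strided convolution. A hedged, minimal sketch of
# the idea follows; the helper name spectral_pool is ours, and the
# library's own frequency-domain pooling is implemented inside the conv
# layers.
import torch


def spectral_pool(x, out_size):
    """Downsample by keeping the central low-frequency window of the 2D
    spectrum and inverting the transform."""
    H, W = x.shape[-2:]
    X = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
    top, left = (H - out_size) // 2, (W - out_size) // 2
    X = X[..., top:top + out_size, left:left + out_size]
    X = torch.fft.ifftshift(X, dim=(-2, -1))
    # Rescale so the mean intensity matches the input; truncation can
    # break conjugate symmetry slightly, hence the .real.
    return torch.fft.ifft2(X).real * (out_size * out_size) / (H * W)


print(spectral_pool(torch.randn(1, 3, 8, 8), out_size=4).shape)
# torch.Size([1, 3, 4, 4])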
def test_forward_pass_resnet18(self):
    """
    total time for (ConvType.STANDARD2D-ConvExecType.SERIAL): 6.813918352127075
    total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.35197567939758
    total time for (ConvType.FFT2D-ConvExecType.SGEMM): 55.51149845123291

    total time for (ConvType.STANDARD2D-ConvExecType.SERIAL): 6.736859083175659
    total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.84979581832886
    total time for (ConvType.FFT2D-ConvExecType.SGEMM): 56.26755166053772

    global init time:  0.24471688270568848
    global pad time:  4.250756025314331
    (r)fft time:  8.754997730255127
    conjugate time:  3.734828233718872
    correlation time:  25.324009656906128
    restore time (de-compress/concat output):  0.021800994873046875
    i(r)fft time:  8.525353193283081
    total time for (ConvType.FFT2D-ConvExecType.SGEMM): 56.27733850479126
    GPU mem: 2903

    global init time:  0.2371835708618164
    global pad time:  4.492943286895752
    (r)fft time:  9.08437442779541
    conjugate time:  3.8394811153411865
    correlation time:  25.043412446975708
    restore time (de-compress/concat output):  0.021334409713745117
    i(r)fft time:  5.491833925247192
    total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.804604053497314
    GPU mem: 2679
    """
    if not torch.cuda.is_available():
        print("CUDA device is not available.")
        return
    device = torch.device("cuda")
    print("\ndevice used: ", str(device))
    C = 3
    # dtype = torch.float
    # random mini batch imitating cifar-10
    # N, H, W = 128, 32, 32
    # inputs = torch.randn(N, C, H, W, dtype=dtype, device=device,
    #                      requires_grad=True)
    args = Arguments()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    args.sample_count_limit = 10000
    args.min_batch_size = 32
    args.dataset_name = "cifar10"
    args.test_batch_size = args.min_batch_size
    args.network_type = NetworkType.ResNet18

    from cnns.nnlib.datasets.cifar import get_cifar
    train_loader, test_loader, _, _ = get_cifar(
        args=args, dataset_name=args.dataset_name)

    repetition = 1
    args.in_channels = C
    args.compress_rate = None
    args.preserve_energy = 100
    args.is_debug = True
    args.next_power2 = True
    args.compress_type = CompressType.STANDARD
    args.tensor_type = TensorType.FLOAT32
    args.num_classes = 10
    args.test_batch_size = args.min_batch_size
    args.dtype = torch.float32

    conv_exec_types = [
        # (ConvType.STANDARD2D, ConvExecType.SERIAL),
        # (ConvType.FFT2D, ConvExecType.CUDA),
        (ConvType.FFT2D, ConvExecType.SGEMM),
        # (ConvType.FFT2D, ConvExecType.CUDA_SHARED_LOG),
        # (ConvType.FFT2D, ConvExecType.CUDA_DEEP),
        # (ConvType.FFT2D, ConvExecType.SERIAL),
        # (ConvType.FFT2D, ConvExecType.BATCH),
    ]

    for conv_type, conv_exec_type in conv_exec_types:
        args.conv_type = conv_type
        args.conv_exec_type = conv_exec_type
        model = resnet18(args=args)
        model.to(device)
        model.eval()
        start_eval = time.time()
        for _ in range(repetition):
            for inputs, _ in train_loader:
                inputs = inputs.to(device)
                outputs_standard = model(inputs)
        standard_time = time.time() - start_eval
        print(f"total time for ({conv_type}-{conv_exec_type}):"
              f" {standard_time}")
def test_forward_backward_performance(self):
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    N, C, H, W, K, HH, WW, padding = 32, 3, 32, 32, 64, 3, 3, 0
    natural_image = True
    if natural_image:
        x = cifar10_image[:, :1, :H, :W]
        x_new = x.expand(N, C, -1, -1).clone()  # specifies new size
        del x
        print("x size: ", x_new.size())
        x = x_new.to(device)
        x.requires_grad_(True)
    else:
        x = torch.randn(N, C, H, W, dtype=dtype, device=device,
                        requires_grad=True)
    x_expect = x.clone().detach().requires_grad_(True)
    y = torch.randn(K, C, HH, WW, dtype=dtype, device=device,
                    requires_grad=True)
    y_expect = y.clone().detach().requires_grad_(True)
    print("input size: ", x.size())
    print("filter size: ", y.size())
    print("padding: ", padding)

    from .conv2D_fft import global_threshold
    repetitions = global_threshold
    print("repetitions: ", repetitions)
    preserve_energy = 80
    print("preserve energy: ", preserve_energy)
    stride = 1
    print("stride: ", stride)
    next_power2 = True
    print("next_power2: ", str(next_power2))
    print("cuda exec type: ", self.conv_exec_type.name)
    compress_rate = 0.0
    print("compress rate: ", compress_rate)

    # warm-up
    torch.nn.functional.conv2d(input=x_expect, weight=y_expect,
                               stride=stride, padding=padding)
    start = time.time()
    for _ in range(repetitions):
        convStandard = torch.nn.functional.conv2d(input=x_expect,
                                                  weight=y_expect,
                                                  stride=stride,
                                                  padding=padding)
    convStandardTime = time.time() - start
    print("convStandard time: ", convStandardTime)

    conv = Conv2dfft(weight_value=y, stride=stride, bias=False,
                     padding=padding,
                     args=Arguments(stride_type=StrideType.STANDARD,
                                    min_batch_size=N,
                                    is_debug=True,
                                    preserved_energy=preserve_energy,
                                    next_power2=next_power2,
                                    conv_exec_type=self.conv_exec_type,
                                    compress_rate=compress_rate,
                                    compress_rates=[compress_rate]))
    # warm-up
    conv.forward(input=x)
    start = time.time()
    for _ in range(repetitions):
        convFFT = conv.forward(input=x)
    convFFTtime = time.time() - start
    print("convFFT time: ", convFFTtime)
    speedup = convFFTtime / convStandardTime
    print(f"Pytorch forward pass speedup is: {speedup}")

    if compress_rate == 0.0 and preserve_energy == 100:
        np.testing.assert_array_almost_equal(
            x=convStandard.cpu().detach().numpy(),
            y=convFFT.cpu().detach().numpy(), decimal=1,
            err_msg="The expected array x and computed y are not almost "
                    "equal.")

    dout = torch.randn(list(convStandard.size()), device=device,
                       dtype=dtype)
    dout_clone = dout.clone()

    # warm-up
    convStandard.backward(dout, retain_graph=True)
    standard_back_time_start = time.time()
    for _ in range(repetitions):
        convStandard.backward(dout, retain_graph=True)
    standard_back_time = time.time() - standard_back_time_start
    print("standard back time: ", standard_back_time)

    # warm-up
    convFFT.backward(dout_clone, retain_graph=True)
    fft_back_time_start = time.time()
    for _ in range(repetitions):
        convFFT.backward(dout_clone, retain_graph=True)
    conv_fft_back_time = time.time() - fft_back_time_start
    assert conv.is_manual[0] == 1
    print("conv fft back time: ", conv_fft_back_time)
    speedup = conv_fft_back_time / standard_back_time
    print(f"Pytorch speedup for backprop: {speedup}")

    full_pass_fft = convFFTtime + conv_fft_back_time
    print("full pass fft:", full_pass_fft)
    full_pass_pytorch = convStandardTime + standard_back_time
    print("full pass pytorch: ", full_pass_pytorch)
    speedup_full_pass = full_pass_fft / full_pass_pytorch
    print(f"Pytorch speedup for full pass: {speedup_full_pass}")

    if compress_rate == 0.0 and preserve_energy == 100:
        np.testing.assert_array_almost_equal(
            x.grad.cpu().detach().numpy(),
            x_expect.grad.cpu().detach().numpy(), decimal=1)
        np.testing.assert_array_almost_equal(
            y.grad.cpu().detach().numpy(),
            y_expect.grad.cpu().detach().numpy(), decimal=1)
def test_forward_backward(self):
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    N, C, H, W = 128, 16, 32, 32
    K, HH, WW = 16, 3, 3
    x = torch.randn(N, C, H, W, dtype=dtype, device=device,
                    requires_grad=True)
    x_expect = x.clone().detach().requires_grad_(True)
    y = torch.randn(K, C, HH, WW, dtype=dtype, device=device,
                    requires_grad=True)
    y_expect = y.clone().detach().requires_grad_(True)

    start = time.time()
    convStandard = torch.nn.functional.conv2d(input=x_expect,
                                              weight=y_expect, stride=1)
    convStandardTime = time.time() - start
    print("convStandard time: ", convStandardTime)

    conv = Conv2dfft(weight_value=y, stride=1, bias=False,
                     args=Arguments(stride_type=StrideType.STANDARD))
    start = time.time()
    convFFT = conv.forward(input=x)
    convFFTtime = time.time() - start
    print("convFFT time: ", convFFTtime)
    speedup = convFFTtime / convStandardTime
    print(f"Pytorch forward pass speedup is: {speedup} X")

    np.testing.assert_array_almost_equal(
        x=convStandard.cpu().detach().numpy(),
        y=convFFT.cpu().detach().numpy(), decimal=3,
        err_msg="The expected array x and computed y are not almost equal.")

    dout = torch.randn(list(convStandard.size()), device=device,
                       dtype=dtype)
    dout_clone = dout.clone()

    standard_back_time_start = time.time()
    convStandard.backward(dout)
    standard_back_time = time.time() - standard_back_time_start
    print("standard back time: ", standard_back_time)

    fft_back_time_start = time.time()
    convFFT.backward(dout_clone)
    conv_fft_back_time = time.time() - fft_back_time_start
    assert conv.is_manual[0] == 1
    print("conv fft back time: ", conv_fft_back_time)
    speedup = conv_fft_back_time / standard_back_time
    print(f"Pytorch speedup for backprop: {speedup} X")

    np.testing.assert_array_almost_equal(
        x.grad.cpu().detach().numpy(),
        x_expect.grad.cpu().detach().numpy(), decimal=3)
    np.testing.assert_array_almost_equal(
        y.grad.cpu().detach().numpy(),
        y_expect.grad.cpu().detach().numpy(), decimal=3)
def test_forward_timing(self):
    """
    device used:  cuda
    x size:  torch.Size([32, 3, 32, 32])
    input size:  torch.Size([32, 3, 32, 32])
    filter size:  torch.Size([64, 3, 3, 3])
    padding:  0
    repetitions:  1000
    preserve energy:  100
    next_power2:  False
    cuda exec type:  CUDA
    output size:  torch.Size([32, 64, 30, 30])
    PyTorch conv2D:  0.6601831912994385
    compress rate:  80.0
    conv FFT time:  16.30516004562378
    Pytorch speedup:  24.697932726112484
    """
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("\nTorch CUDA is available")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    # # 1st layer
    # N, C, H, W, K, HH, WW = 32, 3, 32, 32, 64, 3, 3
    # 7th layer
    # N, C, H, W, K, HH, WW = 32, 256, 4, 4, 256, 3, 3
    # last layer
    # N, C, H, W, K, HH, WW = 32, 256, 4, 4, 512, 3, 3
    for N, C, H, W, K, HH, WW, padding in [
        (32, 3, 32, 32, 64, 3, 3, 0),
        # (32, 3, 32, 32, 64, 3, 3, 1),
        # (32, 3, 32, 32, 64, 7, 7, 3),
        # (32, 64, 16, 16, 64, 3, 3, 1),
        # (32, 256, 4, 4, 256, 3, 3, 1),
        # (32, 512, 2, 2, 512, 3, 3, 1),
    ]:
        natural_image = True
        if natural_image:
            x = cifar10_image[:, :1, :H, :W]
            x_new = x.expand(N, C, -1, -1).clone()  # specifies new size
            del x
            print("x size: ", x_new.size())
            x = x_new.to(device)
        else:
            x = torch.randn(N, C, H, W, dtype=dtype, device=device)
        y = torch.randn(K, C, HH, WW, dtype=dtype, device=device)
        print("input size: ", x.size())
        print("filter size: ", y.size())
        print("padding: ", padding)
        repetitions = 1000
        print("repetitions: ", repetitions)
        preserve_energy = 100
        print("preserve energy: ", preserve_energy)
        stride = 1
        next_power2 = False
        print("next_power2: ", str(next_power2))
        print("cuda exec type: ", self.conv_exec_type.name)
        # print("preserve energy: ", preserve_energy)
        # print("min_batch_size (equivalent to the batch slice for fft): ",
        #       N)
        # print("next power 2: ", next_power2)

        convStandard = torch.nn.Conv2d(in_channels=C, out_channels=K,
                                       kernel_size=(HH, WW),
                                       stride=stride, padding=padding)
        convStandard.to(device)
        out_standard = convStandard.forward(x)
        print("output size: ", out_standard.size())
        start = time.time()
        for repeat in range(repetitions):
            convStandard.forward(x)
        convStandardTime = time.time() - start
        print("PyTorch conv2D: ", convStandardTime)

        # print("compress_rate, FFT conv2D:")
        # for compress_rate in range(0, 86, 5):
        for compress_rate in [80.0]:
            compress_rate = float(compress_rate)
            print("compress rate: ", compress_rate)
            conv = Conv2dfft(weight_value=y, stride=stride,
                             padding=padding,
                             args=Arguments(
                                 stride_type=StrideType.STANDARD,
                                 min_batch_size=N,
                                 is_debug=True,
                                 preserved_energy=preserve_energy,
                                 next_power2=next_power2,
                                 conv_exec_type=self.conv_exec_type,
                                 compress_rate=compress_rate,
                                 compress_rates=[compress_rate]))
            conv.to(device)
            start = time.time()
            for repeat in range(repetitions):
                conv.forward(input=x)
            convFFTtime = time.time() - start
            # print(compress_rate, ",", convFFTtime)
            print("conv FFT time: ", convFFTtime)
            # del conv
            speedup = convFFTtime / convStandardTime
            print(f"Pytorch speedup: {speedup}")
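# One caveat for the timing loops in the tests above: CUDA kernels launch
# asynchronously, so time.time() around a launch loop can mis-count actual
# GPU work. A hedged helper (timed_run is our name, not part of the
# library) that synchronizes around the measured region gives more
# faithful wall-clock numbers:
import time

import torch


def timed_run(fn, *args, repetitions=1000, **kwargs):
    """Time `repetitions` calls of fn, synchronizing CUDA before/after."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()
    for _ in range(repetitions):
        fn(*args, **kwargs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time() - start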