def __init__(self, config, name="Convolution"):
    """Load the layer's weights and bias and build its cuDNN descriptors.

    Args:
        config: Layer configuration; forwarded to the parent initializer
            and to ``self.load_tensor``.
        name: Layer name forwarded to the parent initializer.

    This method reads ``self.kH``, ``self.kW``, ``self.padH``,
    ``self.padW``, ``self.dH``, ``self.dW`` and ``self.convolution_mode``
    without assigning them -- presumably the parent initializer sets them
    from ``config``; confirm against the base class.
    """
    super().__init__(config, name)

    self.output = None

    # Tensor slot 0 holds the filter weights.
    self.W = self.load_tensor(config, 0)

    # NOTE(review): alpha/beta look like the cuDNN blending scalars used by
    # the forward call -- confirm where they are consumed.
    self.alpha = 1.0
    self.beta = 0.0

    # This method leaves the input/output descriptors unset.
    self.in_desc = None
    self.out_desc = None

    # The first two axes of the weight shape give the filter counts.
    self.num_filter_maps = self.W.shape[0]
    self.num_filter_channels = self.W.shape[1]

    # Tensor slot 1 holds the bias, requested as (1, num_filter_maps, 1, 1).
    bias_shape = (1, self.num_filter_maps, 1, 1)
    self.bias = self.load_tensor(config, 1, shape=bias_shape)
    self.b_desc = self.bias.get_cudnn_tensor_desc()

    # Filter descriptor, typed after the weight tensor's dtype.
    self.filt_desc = libcudnn.cudnnCreateFilterDescriptor()
    cudnn_dtype = gputensor.np_2_cudnn_dtype[self.W.dtype]
    print("FILT:", self.W.dtype, cudnn_dtype)
    print(
        "FILT:",
        self.W.shape,
        self.num_filter_maps,
        self.num_filter_channels,
        self.kH,
        self.kW,
    )
    libcudnn.cudnnSetFilter4dDescriptor(
        self.filt_desc,
        cudnn_dtype,
        self.num_filter_maps,
        self.num_filter_channels,
        self.kH,
        self.kW,
    )

    # Convolution descriptor: padding, stride, then two literal 1s in the
    # argument slots that follow the strides.
    self.conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(
        self.conv_desc,
        self.padH,
        self.padW,
        self.dH,
        self.dW,
        1,
        1,
        self.convolution_mode,
    )
def __init__(self, config, name="Convolution"):
    """Load the layer's weights and bias and build its cuDNN descriptors.

    Args:
        config: Layer configuration; forwarded to the parent initializer
            and to ``self.load_tensor``.
        name: Layer name forwarded to the parent initializer.

    This method reads ``self.kH``, ``self.kW``, ``self.padH``,
    ``self.padW``, ``self.dH``, ``self.dW`` and ``self.convolution_mode``
    without assigning them -- presumably the parent initializer sets them
    from ``config``; confirm against the base class.
    """
    super().__init__(config, name)

    self.output = None

    # Tensor slot 0 holds the filter weights.
    self.W = self.load_tensor(config, 0)

    # NOTE(review): alpha/beta look like the cuDNN blending scalars used by
    # the forward call -- confirm where they are consumed.
    self.alpha = 1.0
    self.beta = 0.0

    # This method leaves the input/output descriptors unset.
    self.in_desc = None
    self.out_desc = None

    # The first two axes of the weight shape give the filter counts.
    self.num_filter_maps = self.W.shape[0]
    self.num_filter_channels = self.W.shape[1]

    # Tensor slot 1 holds the bias, requested as (1, num_filter_maps, 1, 1).
    bias_shape = (1, self.num_filter_maps, 1, 1)
    self.bias = self.load_tensor(config, 1, shape=bias_shape)
    self.b_desc = self.bias.get_cudnn_tensor_desc()

    # Filter descriptor, typed after the weight tensor's dtype.
    self.filt_desc = libcudnn.cudnnCreateFilterDescriptor()
    cudnn_dtype = gputensor.np_2_cudnn_dtype[self.W.dtype]
    print("FILT:", self.W.dtype, cudnn_dtype)
    print(
        "FILT:",
        self.W.shape,
        self.num_filter_maps,
        self.num_filter_channels,
        self.kH,
        self.kW,
    )
    libcudnn.cudnnSetFilter4dDescriptor(
        self.filt_desc,
        cudnn_dtype,
        self.num_filter_maps,
        self.num_filter_channels,
        self.kH,
        self.kW,
    )

    # Convolution descriptor: padding, stride, then two literal 1s in the
    # argument slots that follow the strides.
    self.conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(
        self.conv_desc,
        self.padH,
        self.padW,
        self.dH,
        self.dW,
        1,
        1,
        self.convolution_mode,
    )
def benchmark_conv(kw, kh, bsz):
    """Time cuDNN's forward convolution for one filter size and batch size.

    A random float32 input of shape ``(bsz, 3, 224, 224)`` is convolved with
    64 random filters of shape ``(3, kh, kw)`` using padding 3 and stride 1.
    One untimed call is issued first, then 10 timed calls.

    Relies on module-level ``cudnn_context``, ``tensor_format``,
    ``data_type``, ``convolution_mode`` and ``convolution_fwd_pref``.

    Args:
        kw: Filter width.
        kh: Filter height.
        bsz: Batch size (number of input images).

    Returns:
        The elapsed time reported by ``Event.time_since`` over the timed
        calls, divided by the number of timed calls (PyCUDA reports event
        times in milliseconds).
    """
    n_iters = 10
    start, end = drv.Event(), drv.Event()

    def start_bench():
        start.record()

    def end_bench():
        end.record()
        end.synchronize()
        return end.time_since(start)

    n_input = bsz
    filters_in = 3
    filters_out = 64
    height_in = 224
    width_in = 224
    height_filter = kh
    width_filter = kw
    pad_h = 3
    pad_w = 3
    vertical_stride = 1
    horizontal_stride = 1
    upscalex = 1
    upscaley = 1
    alpha = 1.0
    # NOTE(review): with beta = 1.0 cuDNN blends each result into the
    # existing contents of Y, which starts out uninitialized
    # (gpuarray.empty). Only the timing is returned, so this is left as-is
    # -- confirm it is intended.
    beta = 1.0

    # Input tensor
    X = gpuarray.to_gpu(
        np.random.rand(n_input, filters_in, height_in, width_in)
        .astype(np.float32))

    # Filter tensor
    filters = gpuarray.to_gpu(
        np.random.rand(filters_out, filters_in, height_filter, width_filter)
        .astype(np.float32))

    # (destroy_function, descriptor) pairs, released in the finally block so
    # descriptors are not leaked if any cuDNN call below raises.
    cleanups = []
    ws_ptr = None
    try:
        # Descriptor for input
        X_desc = libcudnn.cudnnCreateTensorDescriptor()
        cleanups.append((libcudnn.cudnnDestroyTensorDescriptor, X_desc))
        libcudnn.cudnnSetTensor4dDescriptor(
            X_desc, tensor_format, data_type,
            n_input, filters_in, height_in, width_in)

        # Filter descriptor
        filters_desc = libcudnn.cudnnCreateFilterDescriptor()
        cleanups.append((libcudnn.cudnnDestroyFilterDescriptor, filters_desc))
        libcudnn.cudnnSetFilter4dDescriptor(
            filters_desc, data_type,
            filters_out, filters_in, height_filter, width_filter)

        # Convolution descriptor
        conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
        cleanups.append(
            (libcudnn.cudnnDestroyConvolutionDescriptor, conv_desc))
        libcudnn.cudnnSetConvolution2dDescriptor(
            conv_desc, pad_h, pad_w, vertical_stride, horizontal_stride,
            upscalex, upscaley, convolution_mode)

        # Get output dimensions (first two values are n_input and filters_out)
        _, _, height_output, width_output = \
            libcudnn.cudnnGetConvolution2dForwardOutputDim(
                conv_desc, X_desc, filters_desc)

        # Output tensor. The descriptor must be bound to ``Y_desc`` and
        # created through the correctly-cased wrapper functions; every call
        # below refers to it by that name.
        Y = gpuarray.empty(
            (n_input, filters_out, height_output, width_output), np.float32)
        Y_desc = libcudnn.cudnnCreateTensorDescriptor()
        cleanups.append((libcudnn.cudnnDestroyTensorDescriptor, Y_desc))
        libcudnn.cudnnSetTensor4dDescriptor(
            Y_desc, tensor_format, data_type,
            n_input, filters_out, height_output, width_output)

        # Get pointers to GPU memory
        X_data = ctypes.c_void_p(int(X.gpudata))
        filters_data = ctypes.c_void_p(int(filters.gpudata))
        Y_data = ctypes.c_void_p(int(Y.gpudata))

        # Pick the algorithm and allocate the workspace it asks for
        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            cudnn_context, X_desc, filters_desc, conv_desc, Y_desc,
            convolution_fwd_pref, 0)
        ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            cudnn_context, X_desc, filters_desc, conv_desc, Y_desc, algo)
        ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
        ws_data = ctypes.c_void_p(int(ws_ptr))

        # One untimed call before measuring
        libcudnn.cudnnConvolutionForward(
            cudnn_context, alpha, X_desc, X_data, filters_desc, filters_data,
            conv_desc, algo, ws_data, ws_size.value, beta, Y_desc, Y_data)

        start_bench()
        for _ in range(n_iters):
            libcudnn.cudnnConvolutionForward(
                cudnn_context, alpha, X_desc, X_data,
                filters_desc, filters_data, conv_desc, algo,
                ws_data, ws_size.value, beta, Y_desc, Y_data)
        ms = end_bench()
    finally:
        # Drop the workspace reference, then destroy every descriptor that
        # was successfully created.
        ws_ptr = None
        for destroy, desc in cleanups:
            destroy(desc)

    return ms / n_iters
def get_conv2d_desc(pad, stride, mode=_default_conv_mode):
    """Build a cuDNN 2-D convolution descriptor.

    Args:
        pad: Indexable whose items 0 and 1 are passed as the two padding
            arguments.
        stride: Indexable whose items 0 and 1 are passed as the two stride
            arguments.
        mode: Convolution mode handed to cuDNN; defaults to the
            module-level ``_default_conv_mode``.

    Returns:
        ``Auto(descriptor, libcudnn.cudnnDestroyConvolutionDescriptor)`` --
        presumably a wrapper that destroys the descriptor when released;
        confirm against ``Auto``'s definition.
    """
    conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()

    pad_first, pad_second = pad[0], pad[1]
    stride_first, stride_second = stride[0], stride[1]

    # The two literal 1s fill the argument slots that follow the strides.
    libcudnn.cudnnSetConvolution2dDescriptor(
        conv_desc,
        pad_first,
        pad_second,
        stride_first,
        stride_second,
        1,
        1,
        mode,
    )

    destroy = libcudnn.cudnnDestroyConvolutionDescriptor
    return Auto(conv_desc, destroy)
def get_conv2d_desc(pad, stride, mode=_default_conv_mode):
    """Build a cuDNN 2-D convolution descriptor.

    Args:
        pad: Indexable whose items 0 and 1 are passed as the two padding
            arguments.
        stride: Indexable whose items 0 and 1 are passed as the two stride
            arguments.
        mode: Convolution mode handed to cuDNN; defaults to the
            module-level ``_default_conv_mode``.

    Returns:
        ``Auto(descriptor, libcudnn.cudnnDestroyConvolutionDescriptor)`` --
        presumably a wrapper that destroys the descriptor when released;
        confirm against ``Auto``'s definition.
    """
    conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()

    pad_first, pad_second = pad[0], pad[1]
    stride_first, stride_second = stride[0], stride[1]

    # The two literal 1s fill the argument slots that follow the strides.
    libcudnn.cudnnSetConvolution2dDescriptor(
        conv_desc,
        pad_first,
        pad_second,
        stride_first,
        stride_second,
        1,
        1,
        mode,
    )

    destroy = libcudnn.cudnnDestroyConvolutionDescriptor
    return Auto(conv_desc, destroy)
# Descriptor for input X_desc = libcudnn.cudnnCreateTensorDescriptor() libcudnn.cudnnSetTensor4dDescriptor(X_desc, tensor_format, data_type, n_input, filters_in, height_in, width_in) # Filter descriptor filters_desc = libcudnn.cudnnCreateFilterDescriptor() libcudnn.cudnnSetFilter4dDescriptor(filters_desc, data_type, tensor_format, filters_out, filters_in, height_filter, width_filter) # Convolution descriptor conv_desc = libcudnn.cudnnCreateConvolutionDescriptor() libcudnn.cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w, vertical_stride, horizontal_stride, upscalex, upscaley, convolution_mode, data_type) # Get output dimensions (first two values are n_input and filters_out) _, _, height_output, width_output = libcudnn.cudnnGetConvolution2dForwardOutputDim( conv_desc, X_desc, filters_desc) # Output tensor Y = gpuarray.empty((n_input, filters_out, height_output, width_output), np.float32) Y_desc = libcudnn.cudnnCreateTensorDescriptor() libcudnn.cudnnSetTensor4dDescriptor(Y_desc, tensor_format, data_type, n_input, filters_out, height_output, width_output) # Get pointers to GPU memory X_data = ctypes.c_void_p(int(X.gpudata))
cuO = ng.empty(dimO[::-1], dtype=np.float32) cuI[:] = 2 * (.5 - ng.rand()) cuF[:] = 2 * (.5 - ng.rand()) cuE[:] = 2 * (.5 - ng.rand()) #print drv.mem_get_info() I_data = ctypes.c_void_p(int(cuI.gpudata)) F_data = ctypes.c_void_p(int(cuF.gpudata)) O_data = ctypes.c_void_p(int(cuO.gpudata)) E_data = ctypes.c_void_p(int(cuE.gpudata)) B_data = ctypes.c_void_p(int(cuB.gpudata)) U_data = ctypes.c_void_p(int(cuU.gpudata)) libcudnn.cudnnSetConvolution2dDescriptor(C_desc, pad_h, pad_w, str_h, str_w, 1, 1, conv_mode) libcudnn.cudnnSetTensor4dDescriptor(I_desc, NCHW_fmt, cu_dtype, N, C, H, W) libcudnn.cudnnSetTensor4dDescriptor(B_desc, NCHW_fmt, cu_dtype, N, C, H, W) libcudnn.cudnnSetTensor4dDescriptor(O_desc, NCHW_fmt, cu_dtype, N, K, P, Q) libcudnn.cudnnSetTensor4dDescriptor(E_desc, NCHW_fmt, cu_dtype, N, K, P, Q) libcudnn.cudnnSetFilter4dDescriptor(F_desc, cu_dtype, K, C, R, S) libcudnn.cudnnSetFilter4dDescriptor(U_desc, cu_dtype, K, C, R, S) algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn, I_desc, F_desc, C_desc, O_desc, fwd_pref, 0) ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnn, I_desc, F_desc, C_desc, O_desc, algo) #print algo.value, ws_size.value ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0 ws_data = ctypes.c_void_p(int(ws_ptr))
filters_in, height_filter, width_filter).astype(np.float32)) # Descriptor for input X_desc = libcudnn.cudnnCreateTensorDescriptor() libcudnn.cudnnSetTensor4dDescriptor(X_desc, tensor_format, data_type, n_input, filters_in, height_in, width_in) # Filter descriptor filters_desc = libcudnn.cudnnCreateFilterDescriptor() libcudnn.cudnnSetFilter4dDescriptor(filters_desc, data_type, filters_out, filters_in, height_filter, width_filter) # Convolution descriptor conv_desc = libcudnn.cudnnCreateConvolutionDescriptor() libcudnn.cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w, vertical_stride, horizontal_stride, upscalex, upscaley, convolution_mode) # Get output dimensions (first two values are n_input and filters_out) _, _, height_output, width_output = libcudnn.cudnnGetConvolution2dForwardOutputDim( conv_desc, X_desc, filters_desc) # Output tensor Y = gpuarray.empty((n_input, filters_out, height_output, width_output), np.float32) Y_desc = libcudnn.cudnnCreateTensorDescriptor() libcudnn.cudnnSetTensor4dDescriptor(Y_desc, tensor_format, data_type, n_input, filters_out, height_output, width_output) # Get pointers to GPU memory X_data = ctypes.c_void_p(int(X.gpudata)) filters_data = ctypes.c_void_p(int(filters.gpudata))
cuU = ng.empty(dimF[::-1], dtype=np.float32) cuO = ng.empty(dimO[::-1], dtype=np.float32) cuI[:] = 2 * (.5 - ng.rand()) cuF[:] = 2 * (.5 - ng.rand()) cuE[:] = 2 * (.5 - ng.rand()) #print drv.mem_get_info() I_data = ctypes.c_void_p(int(cuI.gpudata)) F_data = ctypes.c_void_p(int(cuF.gpudata)) O_data = ctypes.c_void_p(int(cuO.gpudata)) E_data = ctypes.c_void_p(int(cuE.gpudata)) B_data = ctypes.c_void_p(int(cuB.gpudata)) U_data = ctypes.c_void_p(int(cuU.gpudata)) libcudnn.cudnnSetConvolution2dDescriptor(C_desc, pad_h, pad_w, str_h, str_w, 1, 1, conv_mode) libcudnn.cudnnSetTensor4dDescriptor(I_desc, NCHW_fmt, cu_dtype, N, C, H, W) libcudnn.cudnnSetTensor4dDescriptor(B_desc, NCHW_fmt, cu_dtype, N, C, H, W) libcudnn.cudnnSetTensor4dDescriptor(O_desc, NCHW_fmt, cu_dtype, N, K, P, Q) libcudnn.cudnnSetTensor4dDescriptor(E_desc, NCHW_fmt, cu_dtype, N, K, P, Q) libcudnn.cudnnSetFilter4dDescriptor(F_desc, cu_dtype, K, C, R, S) libcudnn.cudnnSetFilter4dDescriptor(U_desc, cu_dtype, K, C, R, S) algo = libcudnn.cudnnGetConvolutionForwardAlgorithm( cudnn, I_desc, F_desc, C_desc, O_desc, fwd_pref, 0) ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize( cudnn, I_desc, F_desc, C_desc, O_desc, algo) #print algo.value, ws_size.value ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0