def get_conv_bias_desc(x):
    """Create a bias tensor descriptor.

    The descriptor is configured as a 4-D ``CUDNN_TENSOR_NCHW`` tensor with
    dimensions ``(1, x.size, 1, 1)`` and the data type that ``_dtypes``
    associates with ``x.dtype``.

    Args:
        x: Array-like object exposing ``dtype`` and ``size``.

    Returns:
        ``Auto(desc, libcudnn.cudnnDestroyTensorDescriptor)`` wrapping the
        configured descriptor.

    Raises:
        Whatever the ``_dtypes`` / ``cudnnTensorFormat`` lookups or the cuDNN
        calls raise.  If configuring the descriptor fails, the descriptor is
        destroyed before the exception propagates.
    """
    # Resolve both lookups before allocating anything, so that an unsupported
    # dtype cannot leave an orphaned descriptor behind.
    tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
    data_type = _dtypes[x.dtype]

    desc = libcudnn.cudnnCreateTensorDescriptor()
    try:
        libcudnn.cudnnSetTensor4dDescriptor(
            desc, tensor_format, data_type, 1, x.size, 1, 1)
    except Exception:
        # The raw descriptor is not yet owned by an Auto wrapper, so release
        # it here before re-raising.
        libcudnn.cudnnDestroyTensorDescriptor(desc)
        raise
    return Auto(desc, libcudnn.cudnnDestroyTensorDescriptor)
def get_tensor_desc(x, h, w, form='CUDNN_TENSOR_NCHW'):
    """Create a tensor descriptor for given settings.

    The batch dimension ``n`` is ``x.shape[0]``, or ``1`` when ``x.shape`` is
    empty.  The channel dimension is derived as ``x.size // (n * h * w)``.

    Args:
        x: Array-like object exposing ``shape``, ``size`` and ``dtype``.
        h: Height passed to the descriptor.
        w: Width passed to the descriptor.
        form: Key into ``libcudnn.cudnnTensorFormat`` selecting the layout.

    Returns:
        ``Auto(desc, libcudnn.cudnnDestroyTensorDescriptor)`` wrapping the
        configured descriptor.

    Raises:
        Whatever the lookups, the channel computation or the cuDNN calls
        raise.  If configuring the descriptor fails, the descriptor is
        destroyed before the exception propagates.
    """
    n = x.shape[0] if len(x.shape) >= 1 else 1
    c = x.size // (n * h * w)

    # Do every fallible lookup before allocating the descriptor so a bad
    # ``form`` or unsupported dtype cannot leak it.
    tensor_format = libcudnn.cudnnTensorFormat[form]
    data_type = _dtypes[x.dtype]

    desc = libcudnn.cudnnCreateTensorDescriptor()
    try:
        libcudnn.cudnnSetTensor4dDescriptor(
            desc, tensor_format, data_type, n, c, h, w)
    except Exception:
        # Not yet owned by an Auto wrapper: release it manually.
        libcudnn.cudnnDestroyTensorDescriptor(desc)
        raise
    return Auto(desc, libcudnn.cudnnDestroyTensorDescriptor)
def get_tensor_desc(x, h, w, form='CUDNN_TENSOR_NCHW'):
    """Create a tensor descriptor for given settings.

    The batch dimension ``n`` is ``x.shape[0]``; for an array whose ``shape``
    is empty it falls back to ``1`` instead of raising ``IndexError``.  The
    channel dimension is derived as ``x.size // (n * h * w)``.

    Args:
        x: Array-like object exposing ``shape``, ``size`` and ``dtype``.
        h: Height passed to the descriptor.
        w: Width passed to the descriptor.
        form: Key into ``libcudnn.cudnnTensorFormat`` selecting the layout.

    Returns:
        ``Auto(desc, libcudnn.cudnnDestroyTensorDescriptor)`` wrapping the
        configured descriptor.

    Raises:
        Whatever the lookups, the channel computation or the cuDNN calls
        raise.  If configuring the descriptor fails, the descriptor is
        destroyed before the exception propagates.
    """
    # Guard against 0-d inputs, whose shape tuple has no first entry.
    n = x.shape[0] if len(x.shape) >= 1 else 1
    c = x.size // (n * h * w)

    # Resolve the lookups first so a failure here cannot leak a descriptor.
    tensor_format = libcudnn.cudnnTensorFormat[form]
    data_type = _dtypes[x.dtype]

    desc = libcudnn.cudnnCreateTensorDescriptor()
    try:
        libcudnn.cudnnSetTensor4dDescriptor(
            desc, tensor_format, data_type, n, c, h, w)
    except Exception:
        # Not yet owned by an Auto wrapper: release it manually.
        libcudnn.cudnnDestroyTensorDescriptor(desc)
        raise
    return Auto(desc, libcudnn.cudnnDestroyTensorDescriptor)
def __init__(self, shape, dtype,
             fmt=libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']):
    """Create a cuDNN tensor descriptor and configure it as a 4-D tensor.

    Args:
        shape: Indexable object; entries 0 through 3 are passed, in that
            order, as the four dimensions of the descriptor.
        dtype: Value forwarded unchanged as the data-type argument of
            ``cudnnSetTensor4dDescriptor``.
        fmt: Tensor-format value forwarded unchanged.  The default is
            looked up once, when the function is defined.
    """
    # Store the handle on the instance before configuring it.
    self.ptr = libcudnn.cudnnCreateTensorDescriptor()
    dim0, dim1, dim2, dim3 = shape[0], shape[1], shape[2], shape[3]
    libcudnn.cudnnSetTensor4dDescriptor(
        self.ptr, fmt, dtype, dim0, dim1, dim2, dim3)
def benchmark_conv(kw, kh, bsz):
    """Time a cuDNN forward convolution and return the mean time per call.

    A random float32 input of shape ``(bsz, 3, 224, 224)`` is convolved with
    random float32 filters of shape ``(64, 3, kh, kw)`` using padding 3 and
    stride 1 in both directions.  One untimed call is issued first, then ten
    calls are timed between two ``drv.Event`` objects.

    The function reads these names from the enclosing module:
    ``tensor_format``, ``data_type``, ``convolution_mode``,
    ``convolution_fwd_pref`` and ``cudnn_context``.

    Args:
        kw: Filter width.
        kh: Filter height.
        bsz: Batch size (number of input images).

    Returns:
        ``end.time_since(start)`` divided by the number of timed calls
        (presumably milliseconds, going by the original variable name ``ms``
        -- confirm against the PyCUDA ``Event.time_since`` documentation).

    Raises:
        Whatever the PyCUDA / cuDNN calls raise.  Every descriptor created up
        to that point is destroyed before the exception propagates.
    """
    n_repeats = 10
    start, end = drv.Event(), drv.Event()

    def start_bench():
        """Record the start event."""
        start.record()

    def end_bench():
        """Record the end event, wait for it and return the elapsed time."""
        end.record()
        end.synchronize()
        return end.time_since(start)

    n_input = bsz
    filters_in = 3
    filters_out = 64
    height_in = 224
    width_in = 224
    height_filter = kh
    width_filter = kw
    pad_h = 3
    pad_w = 3
    vertical_stride = 1
    horizontal_stride = 1
    upscalex = 1
    upscaley = 1
    alpha = 1.0
    beta = 1.0

    # Input tensor
    X = gpuarray.to_gpu(
        np.random.rand(n_input, filters_in, height_in, width_in)
        .astype(np.float32))

    # Filter tensor
    filters = gpuarray.to_gpu(
        np.random.rand(filters_out, filters_in, height_filter, width_filter)
        .astype(np.float32))

    # (destroy function, descriptor) pairs, registered right after each
    # descriptor is created so the ``finally`` clause releases exactly the
    # ones that exist, whether the benchmark succeeds or fails.
    descriptors = []
    try:
        # Descriptor for input
        X_desc = libcudnn.cudnnCreateTensorDescriptor()
        descriptors.append((libcudnn.cudnnDestroyTensorDescriptor, X_desc))
        libcudnn.cudnnSetTensor4dDescriptor(
            X_desc, tensor_format, data_type,
            n_input, filters_in, height_in, width_in)

        # Filter descriptor
        filters_desc = libcudnn.cudnnCreateFilterDescriptor()
        descriptors.append(
            (libcudnn.cudnnDestroyFilterDescriptor, filters_desc))
        libcudnn.cudnnSetFilter4dDescriptor(
            filters_desc, data_type,
            filters_out, filters_in, height_filter, width_filter)

        # Convolution descriptor
        conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
        descriptors.append(
            (libcudnn.cudnnDestroyConvolutionDescriptor, conv_desc))
        libcudnn.cudnnSetConvolution2dDescriptor(
            conv_desc, pad_h, pad_w, vertical_stride, horizontal_stride,
            upscalex, upscaley, convolution_mode)

        # Get output dimensions (first two values are n_input and filters_out)
        _, _, height_output, width_output = \
            libcudnn.cudnnGetConvolution2dForwardOutputDim(
                conv_desc, X_desc, filters_desc)

        # Output tensor and its descriptor.  The descriptor must be bound to
        # ``Y_desc`` because every call below refers to it by that name.
        Y = gpuarray.empty(
            (n_input, filters_out, height_output, width_output), np.float32)
        Y_desc = libcudnn.cudnnCreateTensorDescriptor()
        descriptors.append((libcudnn.cudnnDestroyTensorDescriptor, Y_desc))
        libcudnn.cudnnSetTensor4dDescriptor(
            Y_desc, tensor_format, data_type,
            n_input, filters_out, height_output, width_output)

        # Get pointers to GPU memory
        X_data = ctypes.c_void_p(int(X.gpudata))
        filters_data = ctypes.c_void_p(int(filters.gpudata))
        Y_data = ctypes.c_void_p(int(Y.gpudata))

        # Pick the algorithm and allocate its workspace (if it needs one)
        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            cudnn_context, X_desc, filters_desc, conv_desc, Y_desc,
            convolution_fwd_pref, 0)
        # print("Cudnn algorithm = %d" % algo.value)

        ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            cudnn_context, X_desc, filters_desc, conv_desc, Y_desc, algo)
        ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
        ws_data = ctypes.c_void_p(int(ws_ptr))

        # One untimed call before the measured loop
        libcudnn.cudnnConvolutionForward(
            cudnn_context, alpha, X_desc, X_data, filters_desc, filters_data,
            conv_desc, algo, ws_data, ws_size.value, beta, Y_desc, Y_data)

        start_bench()
        for _ in range(n_repeats):
            libcudnn.cudnnConvolutionForward(
                cudnn_context, alpha, X_desc, X_data,
                filters_desc, filters_data, conv_desc, algo,
                ws_data, ws_size.value, beta, Y_desc, Y_data)
        ms = end_bench()

        # Drop the reference to the workspace allocation once timing is done.
        ws_ptr = None

        return ms / n_repeats
    finally:
        for destroy, desc in descriptors:
            destroy(desc)
# RNN configuration.  The four zero-valued settings are forwarded unchanged
# to cudnnSetRNNDescriptor below.
numlayers = 2
inputmode = direction = mode = datatype = 0

# cuDNN handle plus the RNN and dropout descriptors.
handle = libcudnn.cudnnCreate()
rnndesc = libcudnn.cudnnCreateRNNDescriptor()
dropoutdesc = libcudnn.cudnnCreateDropoutDescriptor()
# NOTE(review): this is the only cuDNN call here made without the
# ``libcudnn.`` prefix -- confirm the bare name is actually in scope.
cudnnSetDropoutDescriptor(dropoutdesc, handle, 0, 0, 0, 0)
libcudnn.cudnnSetRNNDescriptor(
    rnndesc, hiddensize, seqlength, numlayers, dropoutdesc,
    inputmode, direction, mode, datatype)

# One input descriptor per sequence step, each configured with the same
# three dimensions.
xdescs = [libcudnn.cudnnCreateTensorDescriptor() for _ in xrange(seqlength)]
for xdesc in xdescs:
    libcudnn.cudnnSetTensorNdDescriptor(
        xdesc, 0, 3, [inputsize, minibatch, seqlength])

# Descriptors named for the hidden (hx) and cell (cx) state; both receive
# the same dimensions.
hxdesc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensorNdDescriptor(
    hxdesc, 0, 3, [hiddensize, minibatch, numlayers])
cxdesc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensorNdDescriptor(
    cxdesc, 0, 3, [hiddensize, minibatch, numlayers])

# Ask cuDNN for the parameter size of this RNN configuration.
paramssize = libcudnn.cudnnGetRNNParamsSize(handle, rnndesc, xdescs)
upscaley = 1
# Scaling factors; they are not used within this fragment.
alpha = beta = 1.0

# Input tensor: random float32 data uploaded to the GPU.
X = gpuarray.to_gpu(
    np.random.rand(
        n_input, filters_in, height_in, width_in
    ).astype(np.float32)
)

# Filter tensor: random float32 data uploaded to the GPU.
filters = gpuarray.to_gpu(
    np.random.rand(
        filters_out, filters_in, height_filter, width_filter
    ).astype(np.float32)
)

# Descriptor for input
X_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(
    X_desc,
    tensor_format,
    data_type,
    n_input,
    filters_in,
    height_in,
    width_in,
)

# Filter descriptor (this call also receives the tensor format)
filters_desc = libcudnn.cudnnCreateFilterDescriptor()
libcudnn.cudnnSetFilter4dDescriptor(
    filters_desc,
    data_type,
    tensor_format,
    filters_out,
    filters_in,
    height_filter,
    width_filter,
)

# Convolution descriptor (this call also receives the data type)
conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
libcudnn.cudnnSetConvolution2dDescriptor(
    conv_desc,
    pad_h,
    pad_w,
    vertical_stride,
    horizontal_stride,
    upscalex,
    upscaley,
    convolution_mode,
    data_type,
)
# NOTE(review): this statement looks like the tail of a definition whose
# header is not visible in this excerpt; it is kept exactly as found.
start.record()


def end_bench(op):
    """Stop the timer and print the average time and throughput for ``op``.

    Records and synchronizes on ``end``, divides the time elapsed since
    ``start`` by ``repeat``, derives a gflops figure from ``conv.flops`` and
    prints one formatted line.  ``start``, ``end``, ``repeat`` and ``conv``
    are read from the enclosing scope.

    Args:
        op: Label for the measured operation, inserted into the output line.
    """
    end.record()
    end.synchronize()
    per_iter_ms = end.time_since(start) / repeat
    rate = conv.flops / (per_iter_ms * 1000000.0)
    print("%7.3f msecs %8.3f gflops (%s: %s)" % (per_iter_ms, rate, op, conv))


ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

# One convolution descriptor, four tensor descriptors and two filter
# descriptors, created in this order.
C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
conv_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
fwd_pref = libcudnn.cudnnConvolutionFwdPreference[
    'CUDNN_CONVOLUTION_FWD_NO_WORKSPACE']
# CUDNN_CONVOLUTION_FWD_NO_WORKSPACE
# CUDNN_CONVOLUTION_FWD_PREFER_FASTEST

# N C K D H W T R S pad str
horizontal_stride = upscalex = upscaley = 1
# Scaling factors; they are not used within this fragment.
alpha = beta = 1.0

# Input tensor: random float32 data uploaded to the GPU.
X = gpuarray.to_gpu(
    np.random.rand(
        n_input, filters_in, height_in, width_in
    ).astype(np.float32)
)

# Filter tensor: random float32 data uploaded to the GPU.
filters = gpuarray.to_gpu(
    np.random.rand(
        filters_out, filters_in, height_filter, width_filter
    ).astype(np.float32)
)

# Descriptor for input
X_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(
    X_desc,
    tensor_format,
    data_type,
    n_input,
    filters_in,
    height_in,
    width_in,
)

# Filter descriptor
filters_desc = libcudnn.cudnnCreateFilterDescriptor()
libcudnn.cudnnSetFilter4dDescriptor(
    filters_desc,
    data_type,
    filters_out,
    filters_in,
    height_filter,
    width_filter,
)

# Convolution descriptor
conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
libcudnn.cudnnSetConvolution2dDescriptor(
    conv_desc,
    pad_h,
    pad_w,
    vertical_stride,
    horizontal_stride,
    upscalex,
    upscaley,
    convolution_mode,
)

# Get output dimensions (first two values are n_input and filters_out)
def end_bench(op):
    """Stop the timer and print the average time and throughput for ``op``.

    Records and synchronizes on ``end``, divides the time elapsed since
    ``start`` by ``repeat``, derives a gflops figure from ``conv.flops`` and
    prints one formatted line.  ``start``, ``end``, ``repeat`` and ``conv``
    are read from the enclosing scope.

    Args:
        op: Label for the measured operation, inserted into the output line.
    """
    end.record()
    end.synchronize()
    per_iter_ms = end.time_since(start) / repeat
    rate = conv.flops / (per_iter_ms * 1000000.0)
    print("%7.3f msecs %8.3f gflops (%s: %s)" % (per_iter_ms, rate, op, conv))


ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

# One convolution descriptor, four tensor descriptors and two filter
# descriptors, created in this order.
C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
conv_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
fwd_pref = libcudnn.cudnnConvolutionFwdPreference[
    'CUDNN_CONVOLUTION_FWD_NO_WORKSPACE']
# CUDNN_CONVOLUTION_FWD_NO_WORKSPACE
# CUDNN_CONVOLUTION_FWD_PREFER_FASTEST
# RNN configuration.  The four zero-valued settings are forwarded unchanged
# to cudnnSetRNNDescriptor below.
numlayers = 2
inputmode = direction = mode = datatype = 0

# cuDNN handle plus the RNN and dropout descriptors.
handle = libcudnn.cudnnCreate()
rnndesc = libcudnn.cudnnCreateRNNDescriptor()
dropoutdesc = libcudnn.cudnnCreateDropoutDescriptor()
# NOTE(review): this is the only cuDNN call here made without the
# ``libcudnn.`` prefix -- confirm the bare name is actually in scope.
cudnnSetDropoutDescriptor(dropoutdesc, handle, 0, 0, 0, 0)
libcudnn.cudnnSetRNNDescriptor(
    rnndesc, hiddensize, seqlength, numlayers, dropoutdesc,
    inputmode, direction, mode, datatype)

# One input descriptor per sequence step, each configured with the same
# three dimensions.
xdescs = [libcudnn.cudnnCreateTensorDescriptor() for _ in xrange(seqlength)]
for xdesc in xdescs:
    libcudnn.cudnnSetTensorNdDescriptor(
        xdesc, 0, 3, [inputsize, minibatch, seqlength])

# Descriptors named for the hidden (hx) and cell (cx) state; both receive
# the same dimensions.
hxdesc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensorNdDescriptor(
    hxdesc, 0, 3, [hiddensize, minibatch, numlayers])
cxdesc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensorNdDescriptor(
    cxdesc, 0, 3, [hiddensize, minibatch, numlayers])

# Ask cuDNN for the parameter size, then describe the weights as a
# (paramssize, 1, 1) filter.
paramssize = libcudnn.cudnnGetRNNParamsSize(handle, rnndesc, xdescs)
wdesc = libcudnn.cudnnCreateFilterDescriptor()
libcudnn.cudnnSetFilterNdDescriptor(wdesc, 0, 0, 3, [paramssize, 1, 1])

# One output descriptor per sequence step, each configured with the same
# three dimensions.
ydescs = [libcudnn.cudnnCreateTensorDescriptor() for _ in xrange(seqlength)]
for ydesc in ydescs:
    libcudnn.cudnnSetTensorNdDescriptor(
        ydesc, 0, 3, [hiddensize, minibatch, seqlength])