Example #1
    def __init__(self, config, name="Convolution"):
        super().__init__(config, name)
        self.output = None

        self.W = self.load_tensor(config, 0)

        self.alpha = 1.0
        self.beta = 0.0

        self.in_desc = None
        self.out_desc = None

        self.num_filter_maps = self.W.shape[0]
        self.num_filter_channels = self.W.shape[1]

        self.bias = self.load_tensor(config, 1, shape=(1, self.num_filter_maps, 1, 1))

        # assert(self.bias.shape[0] == self.num_filter_maps)
        # self.bias = self.bias.reshape((1, self.num_filter_maps, 1, 1))
        # print(self.bias.shape)
        self.b_desc = self.bias.get_cudnn_tensor_desc()

        self.filt_desc = libcudnn.cudnnCreateFilterDescriptor()
        print("FILT:", self.W.dtype, gputensor.np_2_cudnn_dtype[self.W.dtype])
        print("FILT:", self.W.shape, self.num_filter_maps, self.num_filter_channels, self.kH, self.kW)
        libcudnn.cudnnSetFilter4dDescriptor(self.filt_desc, 
                gputensor.np_2_cudnn_dtype[self.W.dtype], self.num_filter_maps,
                self.num_filter_channels, self.kH, self.kW)

        # print("B:", self.bias.shape)
        # self.bias_desc = 
        self.conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
        libcudnn.cudnnSetConvolution2dDescriptor(self.conv_desc, self.padH, self.padW,
                self.dH, self.dW, 1, 1, self.convolution_mode)
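
The constructor above only builds the filter, bias, and convolution descriptors; the input and output descriptors stay None. A minimal sketch of a forward pass that could consume them, mirroring the cuDNN calls shown in Example #3 below; `self.handle`, `self.fwd_pref`, the `gputensor.GPUTensor` constructor, and the `get_data_ptr()` helpers are assumed names, not part of the original class:

    # Hedged sketch (not from the original source). The cuDNN call signatures
    # mirror Example #3; the handle, preference, and tensor helpers are assumed.
    def forward(self, x):
        self.in_desc = x.get_cudnn_tensor_desc()
        _, _, out_h, out_w = libcudnn.cudnnGetConvolution2dForwardOutputDim(
            self.conv_desc, self.in_desc, self.filt_desc)

        # Allocate the output tensor and its descriptor (assumed helper API).
        self.output = gputensor.GPUTensor(
            (x.shape[0], self.num_filter_maps, out_h, out_w), dtype=self.W.dtype)
        self.out_desc = self.output.get_cudnn_tensor_desc()

        # Pick an algorithm and allocate a workspace large enough for it.
        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            self.handle, self.in_desc, self.filt_desc, self.conv_desc,
            self.out_desc, self.fwd_pref, 0)
        ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            self.handle, self.in_desc, self.filt_desc, self.conv_desc,
            self.out_desc, algo)
        ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
        ws_data = ctypes.c_void_p(int(ws_ptr))

        libcudnn.cudnnConvolutionForward(
            self.handle, self.alpha, self.in_desc, x.get_data_ptr(),
            self.filt_desc, self.W.get_data_ptr(), self.conv_desc, algo,
            ws_data, ws_size.value, self.beta, self.out_desc,
            self.output.get_data_ptr())
        return self.output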
Example #3
def benchmark_conv(kw, kh, bsz):

    start, end = (drv.Event(), drv.Event())

    def start_bench():
        start.record()

    def end_bench():
        end.record()
        end.synchronize()
        return end.time_since(start)

    n_input = bsz

    filters_in = 3
    filters_out = 64
    height_in = 224
    width_in = 224
    height_filter = kh
    width_filter = kw
    pad_h = 3
    pad_w = 3
    vertical_stride = 1
    horizontal_stride = 1
    upscalex = 1
    upscaley = 1
    alpha = 1.0
    beta = 1.0

    # Input tensor
    X = gpuarray.to_gpu(np.random.rand(n_input, filters_in, height_in, width_in)
        .astype(np.float32))

    # Filter tensor
    filters = gpuarray.to_gpu(np.random.rand(filters_out,
        filters_in, height_filter, width_filter).astype(np.float32))

    # Descriptor for input
    X_desc = libcudnn.cudnnCreateTensorDescriptor()
    libcudnn.cudnnSetTensor4dDescriptor(X_desc, tensor_format, data_type,
        n_input, filters_in, height_in, width_in)

    # Filter descriptor
    filters_desc = libcudnn.cudnnCreateFilterDescriptor()
    libcudnn.cudnnSetFilter4dDescriptor(filters_desc, data_type, filters_out,
        filters_in, height_filter, width_filter)

    # Convolution descriptor
    conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
        vertical_stride, horizontal_stride, upscalex, upscaley,
        convolution_mode)

    # Get output dimensions (first two values are n_input and filters_out)
    _, _, height_output, width_output = libcudnn.cudnnGetConvolution2dForwardOutputDim(
        conv_desc, X_desc, filters_desc)

    # Output tensor
    Y = gpuarray.empty((n_input, filters_out, height_output, width_output), np.float32)
    Y_desc = libcudnn.cudnnCreateTensorDescriptor()
    libcudnn.cudnnSetTensor4dDescriptor(Y_desc, tensor_format, data_type, n_input,
        filters_out, height_output, width_output)

    # Get pointers to GPU memory
    X_data = ctypes.c_void_p(int(X.gpudata))
    filters_data = ctypes.c_void_p(int(filters.gpudata))
    Y_data = ctypes.c_void_p(int(Y.gpudata))

    # Perform convolution
    algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn_context, X_desc,
        filters_desc, conv_desc, Y_desc, convolution_fwd_pref, 0)

    # print("Cudnn algorithm = %d" % algo.value)

    ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
        cudnn_context, X_desc, filters_desc, conv_desc, Y_desc, algo)
    ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
    ws_data = ctypes.c_void_p(int(ws_ptr))

    # Warm-up call so the first (potentially slower) launch is not timed
    libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
        filters_desc, filters_data, conv_desc, algo, ws_data, ws_size.value, beta,
        Y_desc, Y_data)

    # Time ten back-to-back forward calls
    start_bench()
    for i in range(10):
        libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
            filters_desc, filters_data, conv_desc, algo, ws_data, ws_size.value, beta,
            Y_desc, Y_data)

    ms = end_bench()

    ws_ptr = None  # drop the workspace reference so pycuda can free it
    libcudnn.cudnnDestroyTensorDescriptor(X_desc)
    libcudnn.cudnnDestroyTensorDescriptor(Y_desc)
    libcudnn.cudnnDestroyFilterDescriptor(filters_desc)
    libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)

    return ms / 10
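
benchmark_conv relies on several module-level names that the excerpt does not show (cudnn_context, tensor_format, data_type, convolution_mode, convolution_fwd_pref). A sketch of the setup it likely assumes; the constant dictionaries follow the cudnn-python-wrappers conventions and the concrete values are assumptions, not taken from the original:

# Hedged sketch of the module-level setup benchmark_conv assumes.
import ctypes
import numpy as np
import pycuda.autoinit           # creates a CUDA context on import
import pycuda.driver as drv
from pycuda import gpuarray
import libcudnn

cudnn_context = libcudnn.cudnnCreate()
tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
convolution_fwd_pref = \
    libcudnn.cudnnConvolutionFwdPreference['CUDNN_CONVOLUTION_FWD_PREFER_FASTEST']

# Example usage: time a 3x3 convolution at batch size 64 (illustrative numbers).
if __name__ == '__main__':
    print("3x3, batch 64: %.3f ms per call" % benchmark_conv(3, 3, 64))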
Example #4
def get_conv2d_desc(pad, stride, mode=_default_conv_mode):
    """Create a 2d convolution descriptor."""
    desc = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(desc, pad[0], pad[1], stride[0],
                                             stride[1], 1, 1, mode)
    return Auto(desc, libcudnn.cudnnDestroyConvolutionDescriptor)
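
`Auto` is not defined in this excerpt; it appears to tie the descriptor's lifetime to a Python object so the matching cudnnDestroy* call runs automatically. A minimal sketch of such a wrapper; only the name and the destructor argument come from the snippet, the class body is an assumption:

# Hypothetical helper, not taken from the original project: releases a cuDNN
# descriptor when the wrapper object is garbage-collected.
class Auto(object):
    def __init__(self, ptr, destructor):
        self.ptr = ptr                 # raw descriptor from cudnnCreate*
        self._destructor = destructor  # matching cudnnDestroy* function

    def __del__(self):
        if self.ptr is not None:
            self._destructor(self.ptr)
            self.ptr = None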
Example #5
def get_conv2d_desc(pad, stride, mode=_default_conv_mode):
    """Create a 2d convolution descriptor."""
    desc = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(
        desc, pad[0], pad[1], stride[0], stride[1], 1, 1, mode)
    return Auto(desc, libcudnn.cudnnDestroyConvolutionDescriptor)
# Descriptor for input
X_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(X_desc, tensor_format, data_type, n_input,
                                    filters_in, height_in, width_in)

# Filter descriptor
filters_desc = libcudnn.cudnnCreateFilterDescriptor()
libcudnn.cudnnSetFilter4dDescriptor(filters_desc, data_type, tensor_format,
                                    filters_out, filters_in, height_filter,
                                    width_filter)

# Convolution descriptor
conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
libcudnn.cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
                                         vertical_stride, horizontal_stride,
                                         upscalex, upscaley, convolution_mode,
                                         data_type)

# Get output dimensions (first two values are n_input and filters_out)
_, _, height_output, width_output = libcudnn.cudnnGetConvolution2dForwardOutputDim(
    conv_desc, X_desc, filters_desc)

# Output tensor
Y = gpuarray.empty((n_input, filters_out, height_output, width_output),
                   np.float32)
Y_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(Y_desc, tensor_format, data_type, n_input,
                                    filters_out, height_output, width_output)

# Get pointers to GPU memory
X_data = ctypes.c_void_p(int(X.gpudata))
Example #7
    cuO = ng.empty(dimO[::-1], dtype=np.float32)
    cuI[:] = 2 * (.5 - ng.rand())
    cuF[:] = 2 * (.5 - ng.rand())
    cuE[:] = 2 * (.5 - ng.rand())

    #print drv.mem_get_info()

    I_data = ctypes.c_void_p(int(cuI.gpudata))
    F_data = ctypes.c_void_p(int(cuF.gpudata))
    O_data = ctypes.c_void_p(int(cuO.gpudata))
    E_data = ctypes.c_void_p(int(cuE.gpudata))
    B_data = ctypes.c_void_p(int(cuB.gpudata))
    U_data = ctypes.c_void_p(int(cuU.gpudata))


    libcudnn.cudnnSetConvolution2dDescriptor(C_desc, pad_h, pad_w, str_h, str_w, 1, 1, conv_mode)
    libcudnn.cudnnSetTensor4dDescriptor(I_desc, NCHW_fmt, cu_dtype, N, C, H, W)
    libcudnn.cudnnSetTensor4dDescriptor(B_desc, NCHW_fmt, cu_dtype, N, C, H, W)
    libcudnn.cudnnSetTensor4dDescriptor(O_desc, NCHW_fmt, cu_dtype, N, K, P, Q)
    libcudnn.cudnnSetTensor4dDescriptor(E_desc, NCHW_fmt, cu_dtype, N, K, P, Q)
    libcudnn.cudnnSetFilter4dDescriptor(F_desc, cu_dtype, K, C, R, S)
    libcudnn.cudnnSetFilter4dDescriptor(U_desc, cu_dtype, K, C, R, S)

    algo    = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn, I_desc, F_desc, C_desc, O_desc, fwd_pref, 0)
    ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnn, I_desc, F_desc, C_desc, O_desc, algo)

    #print algo.value, ws_size.value

    ws_ptr  = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
    ws_data = ctypes.c_void_p(int(ws_ptr))
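
The excerpt stops after allocating the workspace. The forward launch it presumably issues next would mirror the call in Example #3; a short sketch, where the alpha and beta scalars are assumptions rather than values from this benchmark:

    # Hedged continuation sketch modeled on Example #3; alpha/beta are assumed.
    alpha, beta = 1.0, 0.0
    libcudnn.cudnnConvolutionForward(cudnn, alpha, I_desc, I_data,
                                     F_desc, F_data, C_desc, algo,
                                     ws_data, ws_size.value, beta,
                                     O_desc, O_data)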

# Filter tensor
filters = gpuarray.to_gpu(np.random.rand(filters_out,
    filters_in, height_filter, width_filter).astype(np.float32))

# Descriptor for input
X_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(X_desc, tensor_format, data_type,
    n_input, filters_in, height_in, width_in)

# Filter descriptor
filters_desc = libcudnn.cudnnCreateFilterDescriptor()
libcudnn.cudnnSetFilter4dDescriptor(filters_desc, data_type, filters_out,
    filters_in, height_filter, width_filter)

# Convolution descriptor
conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
libcudnn.cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
    vertical_stride, horizontal_stride, upscalex, upscaley,
    convolution_mode)

# Get output dimensions (first two values are n_input and filters_out)
_, _, height_output, width_output = libcudnn.cudnnGetConvolution2dForwardOutputDim(
    conv_desc, X_desc, filters_desc)

# Output tensor
Y = gpuarray.empty((n_input, filters_out, height_output, width_output), np.float32)
Y_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(Y_desc, tensor_format, data_type, n_input,
    filters_out, height_output, width_output)

# Get pointers to GPU memory
X_data = ctypes.c_void_p(int(X.gpudata))
filters_data = ctypes.c_void_p(int(filters.gpudata))
Example #9
    cuU = ng.empty(dimF[::-1], dtype=np.float32)
    cuO = ng.empty(dimO[::-1], dtype=np.float32)
    cuI[:] = 2 * (.5 - ng.rand())
    cuF[:] = 2 * (.5 - ng.rand())
    cuE[:] = 2 * (.5 - ng.rand())

    #print drv.mem_get_info()

    I_data = ctypes.c_void_p(int(cuI.gpudata))
    F_data = ctypes.c_void_p(int(cuF.gpudata))
    O_data = ctypes.c_void_p(int(cuO.gpudata))
    E_data = ctypes.c_void_p(int(cuE.gpudata))
    B_data = ctypes.c_void_p(int(cuB.gpudata))
    U_data = ctypes.c_void_p(int(cuU.gpudata))

    libcudnn.cudnnSetConvolution2dDescriptor(C_desc, pad_h, pad_w, str_h,
                                             str_w, 1, 1, conv_mode)
    libcudnn.cudnnSetTensor4dDescriptor(I_desc, NCHW_fmt, cu_dtype, N, C, H, W)
    libcudnn.cudnnSetTensor4dDescriptor(B_desc, NCHW_fmt, cu_dtype, N, C, H, W)
    libcudnn.cudnnSetTensor4dDescriptor(O_desc, NCHW_fmt, cu_dtype, N, K, P, Q)
    libcudnn.cudnnSetTensor4dDescriptor(E_desc, NCHW_fmt, cu_dtype, N, K, P, Q)
    libcudnn.cudnnSetFilter4dDescriptor(F_desc, cu_dtype, K, C, R, S)
    libcudnn.cudnnSetFilter4dDescriptor(U_desc, cu_dtype, K, C, R, S)

    algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
        cudnn, I_desc, F_desc, C_desc, O_desc, fwd_pref, 0)
    ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
        cudnn, I_desc, F_desc, C_desc, O_desc, algo)

    #print algo.value, ws_size.value

    ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0