Exemplo n.º 1
0
    def configure(self, input):
        
        in_images = input.shape[0]
        in_channels = input.shape[1]
        in_height = input.shape[2]
        in_width = input.shape[3]

        assert(in_width >= self.kW)
        assert(in_height >= self.kH)

        out_width  = int((math.floor(1.0 * in_width - self.kW + 2*self.padW) / self.dW) + 1)
        out_height = int((math.floor(1.0 * in_height - self.kH + 2*self.padH) / self.dH) + 1)

        self.output = GPUTensor( (in_images, in_channels, out_height, out_width), input.dtype ) 

        if self.pool_desc:
            libcudnn.cudnnDestroyPoolingDescriptor(self.pool_desc)
        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc)

        self.in_desc = input.get_cudnn_tensor_desc()
        self.out_desc = self.output.get_cudnn_tensor_desc()

        self.pool_desc = libcudnn.cudnnCreatePoolingDescriptor()
        libcudnn.cudnnSetPooling2dDescriptor(self.pool_desc,
            libcudnn.cudnnPoolingMode["CUDNN_POOLING_MAX"],
            # libcudnn.cudnnNanPropagation["CUDNN_NOT_PROPAGATE_NAN"],
            self.kH, self.kW, self.padH, self.padW, self.dH, self.dW)
Exemplo n.º 2
0
    def configure(self, input):
        self.output = GPUTensor(input.shape, input.dtype)

        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)

        self.in_desc = input.get_cudnn_tensor_desc()
        self.out_desc = self.output.get_cudnn_tensor_desc()
Exemplo n.º 3
0
    def configure(self, input):
        self.output = GPUTensor(input.shape, input.dtype)

        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)

        self.in_desc = input.get_cudnn_tensor_desc()
        self.out_desc = self.output.get_cudnn_tensor_desc()
Exemplo n.º 4
0
    def configure(self, input):
        # print("Convolution::configure: input shape =", input.shape)

        in_images = input.shape[0]
        in_channels = input.shape[1]
        in_height = input.shape[2]
        in_width = input.shape[3]

        assert (in_channels == self.num_filter_channels)

        out_width = int((1.0 * in_width + 2 * self.padW - self.kW) / self.dW +
                        1)
        out_height = int((1.0 * in_height + 2 * self.padH - self.kH) /
                         self.dH + 1)

        self.output = GPUTensor(
            (in_images, self.num_filter_maps, out_height, out_width),
            input.dtype)
        # print("ONCV:", input.dtype, self.output.dtype)
        # print("Convolution::configure: output shape =", self.output.shape)

        # initialize cudnn descriptors
        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)

        self.in_desc = input.get_cudnn_tensor_desc()

        # Get output dimensions (first two values are n_input and filters_out)
        _, _, out_height2, out_width2 = libcudnn.cudnnGetConvolution2dForwardOutputDim(
            self.conv_desc, self.in_desc.ptr, self.filt_desc)

        assert (out_width == out_width2)
        assert (out_height == out_height2)

        self.out_desc = self.output.get_cudnn_tensor_desc()

        # find best convolution algorithm
        self.algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            context.cudnn, self.in_desc.ptr, self.filt_desc, self.conv_desc,
            self.out_desc.ptr, self.convolution_fwd_pref, 0)

        print("Convolution::configure: algo=%s" % str(self.algo.value))

        self.ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            context.cudnn, self.in_desc.ptr, self.filt_desc, self.conv_desc,
            self.out_desc.ptr, self.algo)
        self.ws_ptr = drv.mem_alloc(
            self.ws_size.value) if self.ws_size.value > 0 else 0

        print("Convolution::configure: workspace size=%d" % self.ws_size.value)
Exemplo n.º 5
0
    def configure(self, input):
        # print("Convolution::configure: input shape =", input.shape)
        
        in_images = input.shape[0]
        in_channels = input.shape[1]
        in_height = input.shape[2]
        in_width = input.shape[3]

        assert(in_channels == self.num_filter_channels)
       
        out_width  = int((1.0 * in_width + 2*self.padW - self.kW) / self.dW + 1);
        out_height = int((1.0 * in_height + 2*self.padH - self.kH) / self.dH + 1);

        self.output = GPUTensor((in_images, self.num_filter_maps, out_height, out_width),
                input.dtype)
        # print("ONCV:", input.dtype, self.output.dtype)
        # print("Convolution::configure: output shape =", self.output.shape)
   
        # initialize cudnn descriptors
        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)

        self.in_desc = input.get_cudnn_tensor_desc()

        # Get output dimensions (first two values are n_input and filters_out)
        _, _, out_height2, out_width2 = libcudnn.cudnnGetConvolution2dForwardOutputDim(
            self.conv_desc, self.in_desc.ptr, self.filt_desc)

        assert(out_width == out_width2)
        assert(out_height == out_height2)

        self.out_desc = self.output.get_cudnn_tensor_desc()
        
        # find best convolution algorithm
        self.algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(context.cudnn, self.in_desc.ptr,
            self.filt_desc, self.conv_desc, self.out_desc.ptr, self.convolution_fwd_pref, 0)
 
        print("Convolution::configure: algo=%s" % str(self.algo.value))

        self.ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(context.cudnn, 
                self.in_desc.ptr, self.filt_desc, self.conv_desc, self.out_desc.ptr, self.algo)
        self.ws_ptr  = drv.mem_alloc(self.ws_size.value) if self.ws_size.value > 0 else 0

        print("Convolution::configure: workspace size=%d" % self.ws_size.value)
Exemplo n.º 6
0
    def configure(self, input):

        in_images = input.shape[0]
        in_channels = input.shape[1]
        in_height = input.shape[2]
        in_width = input.shape[3]

        assert (in_width >= self.kW)
        assert (in_height >= self.kH)

        out_width = int((math.floor(1.0 * in_width - self.kW + 2 * self.padW) /
                         self.dW) + 1)
        out_height = int(
            (math.floor(1.0 * in_height - self.kH + 2 * self.padH) / self.dH) +
            1)

        self.output = GPUTensor(
            (in_images, in_channels, out_height, out_width), input.dtype)

        if self.pool_desc:
            libcudnn.cudnnDestroyPoolingDescriptor(self.pool_desc)
        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc)

        self.in_desc = input.get_cudnn_tensor_desc()
        self.out_desc = self.output.get_cudnn_tensor_desc()

        self.pool_desc = libcudnn.cudnnCreatePoolingDescriptor()
        libcudnn.cudnnSetPooling2dDescriptor(
            self.pool_desc,
            libcudnn.cudnnPoolingMode["CUDNN_POOLING_MAX"],
            # libcudnn.cudnnNanPropagation["CUDNN_NOT_PROPAGATE_NAN"],
            self.kH,
            self.kW,
            self.padH,
            self.padW,
            self.dH,
            self.dW)
Exemplo n.º 7
0
def benchmark_conv(kw, kh, bsz):

    start, end = (drv.Event(), drv.Event())

    def start_bench():
        start.record()

    def end_bench():
        end.record()
        end.synchronize()
        return end.time_since(start)
    n_input = bsz

    filters_in = 3
    filters_out = 64
    height_in = 224
    width_in = 224
    height_filter = kh
    width_filter = kw
    pad_h = 3
    pad_w = 3
    vertical_stride = 1
    horizontal_stride = 1
    upscalex = 1
    upscaley = 1
    alpha = 1.0
    beta = 1.0

    # Input tensor
    X = gpuarray.to_gpu(np.random.rand(n_input, filters_in, height_in, width_in)
        .astype(np.float32))

    # Filter tensor
    filters = gpuarray.to_gpu(np.random.rand(filters_out,
        filters_in, height_filter, width_filter).astype(np.float32))

    # Descriptor for input
    X_desc = libcudnn.cudnnCreateTensorDescriptor()
    libcudnn.cudnnSetTensor4dDescriptor(X_desc, tensor_format, data_type,
        n_input, filters_in, height_in, width_in)

    # Filter descriptor
    filters_desc = libcudnn.cudnnCreateFilterDescriptor()
    libcudnn.cudnnSetFilter4dDescriptor(filters_desc, data_type, filters_out,
        filters_in, height_filter, width_filter)

    # Convolution descriptor
    conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
        vertical_stride, horizontal_stride, upscalex, upscaley,
        convolution_mode)

    # Get output dimensions (first two values are n_input and filters_out)
    _, _, height_output, width_output = libcudnn.cudnnGetConvolution2dForwardOutputDim(
        conv_desc, X_desc, filters_desc)

    # Output tensor
    Y = gpuarray.empty((n_input, filters_out, height_output, width_output), np.float32)
    y_desc = libcudnn.cudnncreatetensordescriptor()
    libcudnn.cudnnsettensor4ddescriptor(y_desc, tensor_format, data_type, n_input,
        filters_out, height_output, width_output)

    # Get pointers to GPU memory
    X_data = ctypes.c_void_p(int(X.gpudata))
    filters_data = ctypes.c_void_p(int(filters.gpudata))
    Y_data = ctypes.c_void_p(int(Y.gpudata))

    # Perform convolution
    algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn_context, X_desc,
        filters_desc, conv_desc, Y_desc, convolution_fwd_pref, 0)

    # print("Cudnn algorithm = %d" % algo.value)

    ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnn_context, X_desc, filters_desc, conv_desc, Y_desc, algo)
    ws_ptr  = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
    ws_data = ctypes.c_void_p(int(ws_ptr))

    libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
        filters_desc, filters_data, conv_desc, algo, ws_data, ws_size.value, beta,
        Y_desc, Y_data)
    start_bench()

    for i in range(10):
        libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
            filters_desc, filters_data, conv_desc, algo, ws_data, ws_size.value, beta,
            Y_desc, Y_data)

    ms = end_bench()

    ws_ptr = None
    libcudnn.cudnnDestroyTensorDescriptor(X_desc)
    libcudnn.cudnnDestroyTensorDescriptor(Y_desc)
    libcudnn.cudnnDestroyFilterDescriptor(filters_desc)
    libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)

    return ms / 10
Exemplo n.º 8
0
# Perform convolution
algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn_context, X_desc,
                                                    filters_desc, conv_desc,
                                                    Y_desc,
                                                    convolution_fwd_pref, 0)

print("Cudnn algorithm = %d" % algo.value)

ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
    cudnn_context, X_desc, filters_desc, conv_desc, Y_desc, algo)
ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
ws_data = ctypes.c_void_p(int(ws_ptr))

start_bench()

libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
                                 filters_desc, filters_data, conv_desc, algo,
                                 ws_data, ws_size.value, beta, Y_desc, Y_data)

end_bench("fprop")

ws_ptr = None

# Clean up
libcudnn.cudnnDestroyTensorDescriptor(X_desc)
libcudnn.cudnnDestroyTensorDescriptor(Y_desc)
libcudnn.cudnnDestroyFilterDescriptor(filters_desc)
libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)
libcudnn.cudnnDestroy(cudnn_context)
Exemplo n.º 9
0
    maxU = parU[0:1,0:1]

    maxo  = ng.max(abs(cuO - nlO.T), partial=parO, out=maxO).get()[0,0]
    maxb  = ng.max(abs(cuB - nlB.T), partial=parB, out=maxB).get()[0,0]
    maxu  = ng.max(abs(cuU - nlU.T), partial=parU, out=maxU).get()[0,0]

    meano = ng.mean(abs(cuO), partial=parO, out=maxO).get()[0,0]
    meanb = ng.mean(abs(cuB), partial=parB, out=maxB).get()[0,0]
    meanu = ng.mean(abs(cuU), partial=parU, out=maxU).get()[0,0]

    print "        maxerr   mean   pct"
    print "fprop: %7.5f %6.2f %5.3f" % (maxo, meano, 100*maxo/meano)
    print "bprop: %7.5f %6.2f %5.3f" % (maxb, meanb, 100*maxb/meanb)
    print "updat: %7.5f %6.2f %5.3f" % (maxu, meanu, 100*maxu/meanu)

    # free up memory from this layer before proceeding
    cuB  = cuU  = cuO  = None
    nlB  = nlU  = nlO  = None
    parO = parB = parU = maxO = maxB = maxU = None


libcudnn.cudnnDestroyTensorDescriptor(I_desc)
libcudnn.cudnnDestroyTensorDescriptor(O_desc)
libcudnn.cudnnDestroyFilterDescriptor(F_desc)
libcudnn.cudnnDestroyTensorDescriptor(E_desc)
libcudnn.cudnnDestroyTensorDescriptor(B_desc)
libcudnn.cudnnDestroyFilterDescriptor(U_desc)
libcudnn.cudnnDestroyConvolutionDescriptor(C_desc)

libcudnn.cudnnDestroy(cudnn)
Exemplo n.º 10
0
X_data = ctypes.c_void_p(int(X.gpudata))
filters_data = ctypes.c_void_p(int(filters.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

# Perform convolution
algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn_context, X_desc,
    filters_desc, conv_desc, Y_desc, convolution_fwd_pref, 0)

print("Cudnn algorithm = %d" % algo.value)

ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnn_context, X_desc, filters_desc, conv_desc, Y_desc, algo)
ws_ptr  = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
ws_data = ctypes.c_void_p(int(ws_ptr))

start_bench()

libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
    filters_desc, filters_data, conv_desc, algo, ws_data, ws_size.value, beta,
    Y_desc, Y_data)

end_bench("fprop")

ws_ptr = None

# Clean up
libcudnn.cudnnDestroyTensorDescriptor(X_desc)
libcudnn.cudnnDestroyTensorDescriptor(Y_desc)
libcudnn.cudnnDestroyFilterDescriptor(filters_desc)
libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)
libcudnn.cudnnDestroy(cudnn_context)
Exemplo n.º 11
0
    maxB = parB[0:1, 0:1]
    maxU = parU[0:1, 0:1]

    maxo = ng.max(abs(cuO - nlO.T), partial=parO, out=maxO).get()[0, 0]
    maxb = ng.max(abs(cuB - nlB.T), partial=parB, out=maxB).get()[0, 0]
    maxu = ng.max(abs(cuU - nlU.T), partial=parU, out=maxU).get()[0, 0]

    meano = ng.mean(abs(cuO), partial=parO, out=maxO).get()[0, 0]
    meanb = ng.mean(abs(cuB), partial=parB, out=maxB).get()[0, 0]
    meanu = ng.mean(abs(cuU), partial=parU, out=maxU).get()[0, 0]

    print "        maxerr   mean   pct"
    print "fprop: %7.5f %6.2f %5.3f" % (maxo, meano, 100 * maxo / meano)
    print "bprop: %7.5f %6.2f %5.3f" % (maxb, meanb, 100 * maxb / meanb)
    print "updat: %7.5f %6.2f %5.3f" % (maxu, meanu, 100 * maxu / meanu)

    # free up memory from this layer before proceeding
    cuB = cuU = cuO = None
    nlB = nlU = nlO = None
    parO = parB = parU = maxO = maxB = maxU = None

libcudnn.cudnnDestroyTensorDescriptor(I_desc)
libcudnn.cudnnDestroyTensorDescriptor(O_desc)
libcudnn.cudnnDestroyFilterDescriptor(F_desc)
libcudnn.cudnnDestroyTensorDescriptor(E_desc)
libcudnn.cudnnDestroyTensorDescriptor(B_desc)
libcudnn.cudnnDestroyFilterDescriptor(U_desc)
libcudnn.cudnnDestroyConvolutionDescriptor(C_desc)

libcudnn.cudnnDestroy(cudnn)