Example #1
    def test_array_gen_cpy(self):
        xp = numpy if self.xp == 'numpy' else cupy
        stream = None if not self.stream else cupy.cuda.Stream()
        width, height, depth = self.dimensions
        n_channel = self.n_channels

        dim = 3 if depth != 0 else 2 if height != 0 else 1
        shape = (depth, height, n_channel*width) if dim == 3 else \
                (height, n_channel*width) if dim == 2 else \
                (n_channel*width,)

        # generate input data and allocate output buffer
        if self.dtype in (numpy.float16, numpy.float32):
            arr = xp.random.random(shape).astype(self.dtype)
            kind = runtime.cudaChannelFormatKindFloat
        else:  # int
            arr = xp.random.randint(100, size=shape, dtype=self.dtype)
            if self.dtype in (numpy.int8, numpy.int16, numpy.int32):
                kind = runtime.cudaChannelFormatKindSigned
            else:
                kind = runtime.cudaChannelFormatKindUnsigned

        if self.c_contiguous:
            arr2 = xp.zeros_like(arr)
            assert arr.flags.c_contiguous
            assert arr2.flags.c_contiguous
        else:
            arr = arr[..., ::2]
            arr2 = xp.zeros_like(arr)
            width = arr.shape[-1] // n_channel
            assert not arr.flags.c_contiguous
            assert arr2.flags.c_contiguous
            assert arr.shape[-1] == n_channel*width

        # create a CUDA array
        ch_bits = [0, 0, 0, 0]
        for i in range(n_channel):
            ch_bits[i] = arr.dtype.itemsize*8
        ch = ChannelFormatDescriptor(*ch_bits, kind)
        cu_arr = CUDAarray(ch, width, height, depth)

        # the new stream must wait for the current stream to finish initializing arr
        if stream is not None:
            s = cupy.cuda.get_current_stream()
            e = s.record()
            stream.wait_event(e)

        # copy from input to CUDA array, and back to output
        cu_arr.copy_from(arr, stream)
        cu_arr.copy_to(arr2, stream)

        # check input and output are identical
        if stream is not None:
            stream.synchronize()
        assert (arr == arr2).all()
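
For context, Example #1 round-trips data through a CUDA array via copy_from()/copy_to(). Below is a minimal, self-contained sketch of the same pattern for a single-channel float32 2D array; the shape is made up for illustration, and only APIs already used above appear here:

import cupy
from cupy.cuda import runtime
from cupy.cuda.texture import ChannelFormatDescriptor, CUDAarray

width, height = 8, 4
src = cupy.random.random((height, width), dtype=cupy.float32)
dst = cupy.zeros_like(src)

# one 32-bit float channel; unused channels get 0 bits
ch = ChannelFormatDescriptor(32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
cu_arr = CUDAarray(ch, width, height)  # depth omitted -> 2D CUDA array

cu_arr.copy_from(src)  # device ndarray -> CUDA array
cu_arr.copy_to(dst)    # CUDA array -> device ndarray
assert (src == dst).all()

The per-channel bit widths packed into ChannelFormatDescriptor are also why Example #1 builds its ndarray shape with n_channel*width scalars per row: each of the width texels stores n_channel interleaved components.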
Example #2
    def test_write_float_surface(self):
        width, height, depth = self.dimensions
        dim = 3 if depth != 0 else 2 if height != 0 else 1

        # generate input data and allocate output buffer
        shape = (depth, height, width) if dim == 3 else \
                (height, width) if dim == 2 else \
                (width,)

        # prepare input, output, and surface memory
        real_output = cupy.zeros(shape, dtype=cupy.float32)
        assert real_output.flags['C_CONTIGUOUS']
        ch = ChannelFormatDescriptor(32, 0, 0, 0,
                                     runtime.cudaChannelFormatKindFloat)
        expected_output = cupy.arange(numpy.prod(shape), dtype=cupy.float32)
        expected_output = expected_output.reshape(shape) * 3.0
        assert expected_output.flags['C_CONTIGUOUS']

        # create resource descriptor
        # note that surface memory only supports CUDA arrays
        arr = CUDAarray(ch, width, height, depth,
                        runtime.cudaArraySurfaceLoadStore)
        arr.copy_from(real_output)  # init to zero
        res = ResourceDescriptor(runtime.cudaResourceTypeArray, cuArr=arr)

        # create a surface object; surface references are currently unsupported
        surfobj = SurfaceObject(res)
        mod = cupy.RawModule(code=source_surfobj)

        # get and launch the kernel
        ker_name = 'writeKernel'
        ker_name += '3D' if dim == 3 else '2D' if dim == 2 else '1D'
        ker = mod.get_function(ker_name)
        block = (4, 4, 2) if dim == 3 else (4, 4) if dim == 2 else (4, )
        grid = ()
        args = (surfobj, )
        if dim >= 1:
            grid_x = (width + block[0] - 1) // block[0]
            grid = grid + (grid_x, )
            args = args + (width, )
        if dim >= 2:
            grid_y = (height + block[1] - 1) // block[1]
            grid = grid + (grid_y, )
            args = args + (height, )
        if dim == 3:
            grid_z = (depth + block[2] - 1) // block[2]
            grid = grid + (grid_z, )
            args = args + (depth, )
        ker(grid, block, args)

        # validate result
        arr.copy_to(real_output)
        assert (real_output == expected_output).all()
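
The kernel source source_surfobj is defined elsewhere in the test module and is not shown here. A plausible 1D variant, sketched under the assumption that the kernel writes 3.0 * index so the output matches expected_output (the actual test source may differ):

source_surfobj_1d = r'''
extern "C" __global__ void writeKernel1D(cudaSurfaceObject_t surf,
                                         int width) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < width) {
        float value = 3.0f * x;  // matches expected_output = arange * 3.0
        // surf1Dwrite addresses by *byte* offset, not element index
        surf1Dwrite(value, surf, x * sizeof(float));
    }
}
'''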
Example #3
    def test_array_gen_cpy(self):
        xp = numpy if self.xp == 'numpy' else cupy
        stream = None if not self.stream else cupy.cuda.Stream()
        width, height, depth = self.dimensions
        n_channel = self.n_channels

        dim = 3 if depth != 0 else 2 if height != 0 else 1
        shape = (depth, height, n_channel*width) if dim == 3 else \
                (height, n_channel*width) if dim == 2 else \
                (n_channel*width,)

        # generate input data and allocate output buffer
        if self.dtype in (numpy.float16, numpy.float32):
            arr = xp.random.random(shape).astype(self.dtype)
            kind = runtime.cudaChannelFormatKindFloat
        else:  # int
            # randint() in NumPy <= 1.10 does not have the dtype argument...
            arr = xp.random.randint(100, size=shape).astype(self.dtype)
            if self.dtype in (numpy.int8, numpy.int16, numpy.int32):
                kind = runtime.cudaChannelFormatKindSigned
            else:
                kind = runtime.cudaChannelFormatKindUnsigned
        arr2 = xp.zeros_like(arr)

        assert arr.flags['C_CONTIGUOUS']
        assert arr2.flags['C_CONTIGUOUS']

        # create a CUDA array
        ch_bits = [0, 0, 0, 0]
        for i in range(n_channel):
            ch_bits[i] = arr.dtype.itemsize * 8
        # unpacking arguments using *ch_bits is not supported before PY35...
        ch = ChannelFormatDescriptor(ch_bits[0], ch_bits[1], ch_bits[2],
                                     ch_bits[3], kind)
        cu_arr = CUDAarray(ch, width, height, depth)

        # copy from input to CUDA array, and back to output
        cu_arr.copy_from(arr, stream)
        cu_arr.copy_to(arr2, stream)

        # check input and output are identical
        if stream is not None:
            stream.synchronize()
        assert (arr == arr2).all()
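
This example is the backward-compatible twin of Example #1, as its two comments note: randint() only gained a dtype argument in NumPy 1.11, and a positional argument after *ch_bits in a call is a SyntaxError before Python 3.5. A side-by-side sketch of the equivalent spellings:

# modern form (NumPy >= 1.11, Python >= 3.5), as in Example #1
arr = xp.random.randint(100, size=shape, dtype=self.dtype)
ch = ChannelFormatDescriptor(*ch_bits, kind)

# backward-compatible form, as in this example
arr = xp.random.randint(100, size=shape).astype(self.dtype)
ch = ChannelFormatDescriptor(ch_bits[0], ch_bits[1], ch_bits[2],
                             ch_bits[3], kind)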
Example #4
    def test_fetch_float_texture(self):
        width, height, depth = self.dimensions
        dim = 3 if depth != 0 else 2 if height != 0 else 1

        if (self.mem_type == 'linear' and dim != 1) or \
           (self.mem_type == 'pitch2D' and dim != 2):
            pytest.skip('The test case {0} is inapplicable for {1} and thus '
                        'skipped.'.format(self.dimensions, self.mem_type))

        # generate input data and allocate output buffer
        shape = (depth, height, width) if dim == 3 else \
                (height, width) if dim == 2 else \
                (width,)

        # prepare input, output, and texture memory
        tex_data = cupy.random.random(shape, dtype=cupy.float32)
        real_output = cupy.zeros_like(tex_data)
        ch = ChannelFormatDescriptor(32, 0, 0, 0,
                                     runtime.cudaChannelFormatKindFloat)
        assert tex_data.flags['C_CONTIGUOUS']
        assert real_output.flags['C_CONTIGUOUS']
        if self.mem_type == 'CUDAarray':
            arr = CUDAarray(ch, width, height, depth)
            expected_output = cupy.zeros_like(tex_data)
            assert expected_output.flags['C_CONTIGUOUS']
            # test bidirectional copy
            arr.copy_from(tex_data)
            arr.copy_to(expected_output)
        else:  # linear and pitch2D are backed by an ndarray
            arr = tex_data
            expected_output = tex_data

        # create resource and texture descriptors
        if self.mem_type == 'CUDAarray':
            res = ResourceDescriptor(runtime.cudaResourceTypeArray, cuArr=arr)
        elif self.mem_type == 'linear':
            res = ResourceDescriptor(runtime.cudaResourceTypeLinear,
                                     arr=arr,
                                     chDesc=ch,
                                     sizeInBytes=arr.size * arr.dtype.itemsize)
        else:  # pitch2D
            # In this case, we rely on the fact that the hand-picked array
            # shape meets the alignment requirement. This is a CUDA limitation;
            # see the CUDA Runtime API reference guide. "TexturePitchAlignment"
            # is assumed to be 32, which should be applicable to most devices.
            res = ResourceDescriptor(runtime.cudaResourceTypePitch2D,
                                     arr=arr,
                                     chDesc=ch,
                                     width=width,
                                     height=height,
                                     pitchInBytes=width * arr.dtype.itemsize)
        address_mode = (runtime.cudaAddressModeClamp,
                        runtime.cudaAddressModeClamp)
        tex = TextureDescriptor(address_mode, runtime.cudaFilterModePoint,
                                runtime.cudaReadModeElementType)

        if self.target == 'object':
            # create a texture object
            texobj = TextureObject(res, tex)
            mod = cupy.RawModule(code=source_texobj)
        else:  # self.target == 'reference'
            mod = cupy.RawModule(code=source_texref)
            texref_name = 'texref'
            texref_name += '3D' if dim == 3 else '2D' if dim == 2 else '1D'
            texrefPtr = mod.get_texref(texref_name)
            # bind texture ref to resource
            texref = TextureReference(texrefPtr, res, tex)  # noqa

        # get and launch the kernel
        ker_name = 'copyKernel'
        ker_name += '3D' if dim == 3 else '2D' if dim == 2 else '1D'
        ker_name += 'fetch' if self.mem_type == 'linear' else ''
        ker = mod.get_function(ker_name)
        block = (4, 4, 2) if dim == 3 else (4, 4) if dim == 2 else (4, )
        grid = ()
        args = (real_output, )
        if self.target == 'object':
            args = args + (texobj, )
        if dim >= 1:
            grid_x = (width + block[0] - 1) // block[0]
            grid = grid + (grid_x, )
            args = args + (width, )
        if dim >= 2:
            grid_y = (height + block[1] - 1) // block[1]
            grid = grid + (grid_y, )
            args = args + (height, )
        if dim == 3:
            grid_z = (depth + block[2] - 1) // block[2]
            grid = grid + (grid_z, )
            args = args + (depth, )
        ker(grid, block, args)

        # validate result
        assert (real_output == expected_output).all()
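
As with the surface test, source_texobj and source_texref are defined elsewhere in the module. A plausible 1D texture-object variant, sketched under the assumption that it takes the launch arguments (real_output, texobj, width) used above (the real source may differ):

source_texobj_1d = r'''
extern "C" __global__ void copyKernel1D(float* output,
                                        cudaTextureObject_t texObj,
                                        int width) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < width) {
        // unnormalized coordinates + cudaFilterModePoint:
        // sampling at x + 0.5 fetches texel x exactly
        output[x] = tex1D<float>(texObj, x + 0.5f);
    }
}
'''

For the 'linear' memory path the test appends 'fetch' to the kernel name; that variant would use tex1Dfetch<float>(texObj, x) instead, which takes an integer texel index rather than a floating-point coordinate.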