Пример #1
0
    def _pitch_allocate(self, array):
        """Allocate pitched GPU memory sized to fit a 2D array.

        Usage: ``(alloc, pitch) = gpu._pitch_allocate(array)``

        Args:
            array: object with a two-element ``shape`` (height, width) and
                ``nbytes`` / ``size`` attributes, e.g. a NumPy array.

        Returns:
            Whatever ``drv.mem_alloc_pitch`` returns: the GPU allocation and
            its pitch (the allocated row width in bytes).
        """
        # array shape
        (height, width) = array.shape

        # Size of one element in bytes. Floor division keeps this an integer
        # on Python 3 as well; the allocator expects integer byte counts.
        size = array.nbytes // array.size

        return drv.mem_alloc_pitch(width * size, height, size)
Пример #2
0
    def _run_simulation(self, parameters, init_values, blocks, threads):
        """Run the compiled simulation kernel and return its results.

        Args:
            parameters: sequence of per-experiment parameter rows; each row is
                read up to ``self._parameterNumber`` entries.
            init_values: sequence of per-thread initial species rows; each row
                is read up to ``self._speciesNumber`` entries.
            blocks: number of CUDA blocks in the (1D) grid.
            threads: number of threads per block.

        Returns:
            An int32 NumPy array of shape
            ``(len(parameters), self._beta, self._resultNumber,
            self._speciesNumber)``.
        """
        total_threads = blocks * threads
        experiments = len(parameters)

        # Number of rows in the parameter table. Floor division keeps the value
        # an integer on Python 3 (it is used as an array shape and a copy
        # height); for integer operands it matches Python 2's ``/``.
        param_rows = total_threads // self._beta + 1

        mt_data = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'MersenneTwister.dat')

        # initialize Mersenne Twister
        self._initialise_twisters(mt_data, self._completeCode, threads, blocks)

        param = np.zeros((param_rows, self._parameterNumber), dtype=np.float32)
        # Best-effort fill: copying stops at the first index that falls outside
        # either array, leaving the remaining entries at zero.
        try:
            for i in range(len(parameters)):
                for j in range(self._parameterNumber):
                    param[i][j] = parameters[i][j]
        except IndexError:
            pass

        # parameter texture (4 bytes per float32 entry)
        ary = sim.create_2D_array(param)
        sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4, param_rows)
        self._param_tex.set_array(ary)

        # 2D species arrays (4 bytes per int32 entry)
        d_x, p_x = cuda.mem_alloc_pitch(width=self._speciesNumber * 4, height=total_threads, access_size=4)

        # publish the pitch of the species array to the device
        cuda.memcpy_htod(self._pvxp, np.array([p_x], dtype=np.int32))

        # initialize species (same best-effort fill as for the parameters)
        species_input = np.zeros((total_threads, self._speciesNumber), dtype=np.int32)
        try:
            for i in range(len(init_values)):
                for j in range(self._speciesNumber):
                    species_input[i][j] = init_values[i][j]
        except IndexError:
            pass
        sim.copy2D_host_to_device(d_x, species_input, self._speciesNumber * 4, p_x, self._speciesNumber * 4,
                                  total_threads)

        # output array
        result = np.zeros(total_threads * self._resultNumber * self._speciesNumber, dtype=np.int32)
        d_result = cuda.mem_alloc(result.nbytes)

        # run code
        self._compiledRunMethod(d_x, d_result, block=(threads, 1, 1), grid=(blocks, 1))

        # fetch from GPU memory and keep only the entries of real experiments
        cuda.memcpy_dtoh(result, d_result)
        result = result[0:experiments * self._beta * self._resultNumber * self._speciesNumber]
        result.shape = (experiments, self._beta, self._resultNumber, self._speciesNumber)

        return result
Пример #3
0
def resize_gpu(y_gpu, out_shape):
  """Resize a 2D float32 GPU array to ``out_shape`` using a texture lookup.

  The input is copied into a temporary pitched buffer, bound to the module's
  ``in_tex`` texture reference (linear filtering, normalized coordinates) and
  sampled by the ``resize`` kernel.

  Args:
    y_gpu: GPU array exposing ``shape``, ``dtype`` and ``gpudata``; only
      float32 is supported.
    out_shape: indexable ``(rows, cols)`` of the requested output.

  Returns:
    A newly allocated GPU array of shape ``out_shape`` and the input's dtype.

  Raises:
    NotImplementedError: if ``y_gpu.dtype`` is not float32.
  """
  dtype = y_gpu.dtype
  if dtype != np.float32:
    # NotImplementedError is the builtin; the previous name is not a Python
    # builtin and would surface as a NameError unless defined elsewhere.
    raise NotImplementedError('Only float at the moment')
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])),
               int(np.ceil(float(out_shape[0])/block_size[1])))

  preproc = _generate_preproc(dtype)
  mod = SourceModule(preproc + resize_code, keep=True)

  resize_fun_gpu = mod.get_function("resize")
  # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
  resized_gpu = cua.empty((int(out_shape[0]), int(out_shape[1])), y_gpu.dtype)

  rows, cols = y_gpu.shape[0], y_gpu.shape[1]
  row_bytes = 4 * cols  # float32 elements
  temp_gpu, pitch = cu.mem_alloc_pitch(row_bytes, rows, 4)
  try:
    # Device-to-device copy from the tightly packed input into the pitched buffer.
    copy_object = cu.Memcpy2D()
    copy_object.set_src_device(y_gpu.gpudata)
    copy_object.set_dst_device(temp_gpu)
    copy_object.src_pitch = row_bytes
    copy_object.dst_pitch = pitch
    copy_object.width_in_bytes = row_bytes
    copy_object.height = rows
    copy_object(aligned=False)

    # Bind the pitched buffer to the texture reference read by the kernel.
    in_tex = mod.get_texref('in_tex')
    descr = cu.ArrayDescriptor()
    descr.width = cols
    descr.height = rows
    descr.format = cu.array_format.FLOAT
    descr.num_channels = 1
    in_tex.set_address_2d(temp_gpu, descr, pitch)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    resize_fun_gpu(resized_gpu.gpudata,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                   block=block_size, grid=grid_size)
  finally:
    # Release the temporary buffer even if the copy or the launch fails.
    temp_gpu.free()

  return resized_gpu
Пример #4
0
def resize_gpu(y_gpu, out_shape):
  """Resize a 2D float32 GPU array to ``out_shape`` using a texture lookup.

  The input is copied into a temporary pitched buffer, bound to the module's
  ``in_tex`` texture reference (linear filtering, normalized coordinates) and
  sampled by the ``resize`` kernel.

  Args:
    y_gpu: GPU array exposing ``shape``, ``dtype`` and ``gpudata``; only
      float32 is supported.
    out_shape: indexable ``(rows, cols)`` of the requested output.

  Returns:
    A newly allocated GPU array of shape ``out_shape`` and the input's dtype.

  Raises:
    NotImplementedError: if ``y_gpu.dtype`` is not float32.
  """
  dtype = y_gpu.dtype
  if dtype != np.float32:
    # NotImplementedError is the builtin; the previous name is not a Python
    # builtin and would surface as a NameError unless defined elsewhere.
    raise NotImplementedError('Only float at the moment')
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])),
               int(np.ceil(float(out_shape[0])/block_size[1])))

  preproc = _generate_preproc(dtype)
  mod = SourceModule(preproc + resize_code, keep=True)

  resize_fun_gpu = mod.get_function("resize")
  # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
  resized_gpu = cua.empty((int(out_shape[0]), int(out_shape[1])), y_gpu.dtype)

  rows, cols = y_gpu.shape[0], y_gpu.shape[1]
  row_bytes = 4 * cols  # float32 elements
  temp_gpu, pitch = cu.mem_alloc_pitch(row_bytes, rows, 4)
  try:
    # Device-to-device copy from the tightly packed input into the pitched buffer.
    copy_object = cu.Memcpy2D()
    copy_object.set_src_device(y_gpu.gpudata)
    copy_object.set_dst_device(temp_gpu)
    copy_object.src_pitch = row_bytes
    copy_object.dst_pitch = pitch
    copy_object.width_in_bytes = row_bytes
    copy_object.height = rows
    copy_object(aligned=False)

    # Bind the pitched buffer to the texture reference read by the kernel.
    in_tex = mod.get_texref('in_tex')
    descr = cu.ArrayDescriptor()
    descr.width = cols
    descr.height = rows
    descr.format = cu.array_format.FLOAT
    descr.num_channels = 1
    in_tex.set_address_2d(temp_gpu, descr, pitch)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    resize_fun_gpu(resized_gpu.gpudata,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                   block=block_size, grid=grid_size)
  finally:
    # Release the temporary buffer even if the copy or the launch fails.
    temp_gpu.free()

  return resized_gpu
    def __init__(self, 
        left_binary_block,  # 2D array of booleans (rows x columns)
        function_definitions, # array of dtype uint32, one entry per function
        right_scalars, # scalar values on the right-hand side, one per row
        column_cardinality
        ):
        """Upload the inputs to the GPU and allocate space for the results.

        Args:
            left_binary_block: 2D array; ``shape[0]`` is the row count and
                ``shape[1]`` the column count (must be below 32). Each row is
                packed into one uint32, with column ``i`` shifted left by ``i``.
            function_definitions: array with dtype ``np.uint32``; ``shape[0]``
                is taken as the number of functions.
            right_scalars: values passed to ``rankdata``; the resulting ranks
                are stored on the GPU as 4-byte floats.
            column_cardinality: stored unchanged on the instance.
        """
        self._column_cardinality = column_cardinality
        column_count = left_binary_block.shape[1]
        # Every row is packed into a single uint32 below, hence the limit.
        assert column_count < 32
        self._column_count = column_count
        assert function_definitions.dtype == np.uint32
        function_count = function_definitions.shape[0]
        self._function_count = function_count
        
        # Rank the right-hand scalars and upload the ranks as float32.
        # presumably scipy.stats.rankdata — TODO confirm against the imports
        ranks = (rankdata(right_scalars) ).astype(np.dtype('f4'))
        gpu_ranks =drv.mem_alloc(ranks.nbytes)
        drv.memcpy_htod(gpu_ranks, ranks)
        self._gpu_ranks = gpu_ranks
   
        # How many rows?
        row_count = left_binary_block.shape[0]
        self._row_count = row_count
        
        # Prepare the left block: bit-pack each row into one uint32.
        # NOTE(review): the in-place add relies on NumPy accepting the dtype of
        # `column << i` for a uint32 target — confirm with the input dtype used.
        left_binary_encoded = np.zeros((row_count,), dtype=np.uint32)
        for i in range(column_count):
            left_binary_encoded += left_binary_block[:,i] << i
        gpu_left_binary_encoded = drv.mem_alloc(left_binary_encoded.nbytes)
        drv.memcpy_htod(gpu_left_binary_encoded, left_binary_encoded)
        self._gpu_left_binary_encoded =gpu_left_binary_encoded
        
        # Function definitions
        gpu_function_definitions = drv.mem_alloc(function_definitions.nbytes)
        drv.memcpy_htod(gpu_function_definitions, function_definitions)
        self._gpu_function_definitions = gpu_function_definitions
        
        # Space for the results: a pitched allocation with one row per function.
        # NOTE(review): the first argument of mem_alloc_pitch is a width in
        # bytes; row_count is passed unscaled — confirm the kernel writes one
        # byte per row entry (otherwise this should be row_count * itemsize).
        gpu_result_space, gpu_result_pitch = drv.mem_alloc_pitch(row_count, function_count, 4)
        self._gpu_result_space = gpu_result_space
        self._gpu_result_pitch = gpu_result_pitch
        # 8 bytes per function, matching the 'f8' host array allocated below.
        gpu_rho_space = drv.mem_alloc(function_count*8)
        self._gpu_rho_space = gpu_rho_space
        
        # Host-side array with one float64 slot per function.
        self._rho_space = np.zeros((function_count,), dtype='f8')
Пример #6
0
    def __init__(self, backend, dtype, ioshape, initval, iopacking, tags):
        """Allocate and zero the device storage for a matrix.

        Args:
            backend: backend object; forwarded to the base class and used for
                ``compact_shape``.
            dtype: element data type (anything ``np.dtype`` accepts).
            ioshape: logical shape, forwarded to the base class.
            initval: optional initial contents, passed to ``self.set`` when
                not ``None``.
            iopacking: packing specifier, forwarded to the base class.
            tags: collection of tags; ``'align'`` requests a pitched
                allocation.
        """
        super(CUDAMatrixBase, self).__init__(backend, ioshape, iopacking, tags)

        # Data type info
        self.dtype = dtype
        self.itemsize = np.dtype(dtype).itemsize

        # Dimensions
        nrow, ncol = backend.compact_shape(ioshape, iopacking)
        self.nrow = nrow
        self.ncol = ncol

        # Compute the size, in bytes, of the minor dimension
        colsz = self.ncol*self.itemsize

        if 'align' in tags:
            # Allocate a 2D array aligned to the major dimension
            self.data, self.pitch = cuda.mem_alloc_pitch(colsz, nrow,
                                                         self.itemsize)
            self._nbytes = nrow*self.pitch

            # Ensure that the pitch is a multiple of itemsize
            assert (self.pitch % self.itemsize) == 0
        else:
            # Allocate a standard, tightly packed, array
            self._nbytes = colsz*nrow
            self.data = cuda.mem_alloc(self._nbytes)
            self.pitch = colsz

        # The pitch is a multiple of itemsize in both branches, so floor
        # division is exact and keeps the leading dimension an integer on
        # Python 3 as well.
        self.leaddim = self.pitch // self.itemsize
        # soa_shape is not set in this method; presumably provided by the
        # base class — TODO confirm.
        self.leadsubdim = self.soa_shape[-1]
        self.traits = (nrow, self.leaddim, self.leadsubdim, self.dtype)

        # Zero the entire matrix (incl. slack); memset_d32 takes a count of
        # 32-bit words, which must be an integer.
        assert (self._nbytes % 4) == 0
        cuda.memset_d32(self.data, 0, self._nbytes // 4)

        # Process any initial values
        if initval is not None:
            self.set(initval)
Пример #7
0
cuda_device = driver.Device(0)
print("cuda_device=%s" % cuda_device)
cuda_context = cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
try:
    print("cuda_context=%s" % cuda_context)

    BGRA2NV12 = get_BGRA2NV12()
    print("BGRA2NV12=%s" % BGRA2NV12)

    w = roundup(512, 32)
    h = roundup(512, 32)

    log("w=%s, h=%s", w, h)

    cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h*3/2, 16)
    log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)), inputPitch)
    #allocate CUDA NV12 buffer (on device):
    cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h*3/2, 16)
    log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch)

    #host buffers:
    inputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
    log("inputBuffer=%s", inputBuffer)

    outputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
    log("outputBuffer=%s", outputBuffer)

    #populate host buffer with random data:
    buf = inputBuffer.data
    for y in range(h*3/2):
Пример #8
0
    def __init__(self, shape, dtype, gpudata=None, pitch = None):
        """Create a PitchArray.

        Args:
            shape: shape of the array; an iterable of up to three dimensions,
                or a single integer.
            dtype: dtype of the array.
            gpudata: optional ``cuda.DeviceAllocation`` holding already
                allocated device memory. When omitted, memory is allocated
                here.
            pitch: only used together with ``gpudata`` for non-vector shapes:
                the pitch in bytes the memory was allocated with. It must
                equal the row size in bytes rounded up to a multiple of 512;
                that value is assumed when ``pitch`` is ``None``.

        Raises:
            ValueError: if the shape has more than three dimensions.

        Attributes:
            shape: shape of self
            size: number of elements of the array
            mem_size: number of elements of total memory allocated
            ld: leading dimension (in elements)
            M: 1 if self is a vector, shape[0] otherwise
            N: self.size if self is a vector, ``_pd(shape)`` otherwise
            gpudata: DeviceAllocation (``None`` for an empty array)
            ndim: number of dimensions
            dtype: dtype of array
            nbytes: total memory allocated for the array in bytes

        Note:
            A scalar shape ``n`` results in a row vector of shape ``(1, n)``.
            Operations on a PitchArray are elementwise.
        """
        try:
            dims = tuple(int(dim) for dim in shape)
        except TypeError:
            # shape is a single number rather than an iterable
            count = int(shape)
            self.shape = (1, count) if count else (0, 0)
        else:
            count = 1
            for dim in dims:
                count *= dim
            self.shape = dims

        self.ndim = len(self.shape)

        if self.ndim > 3:
            raise ValueError("Only support array of dimension leq 3")

        self.dtype = np.dtype(dtype)
        itemsize = self.dtype.itemsize

        self.size = count

        if gpudata is not None:
            # The memory is assumed to have been allocated by mem_alloc_pitch
            # where the shape requires it.
            assert gpudata.__class__ == cuda.DeviceAllocation

        if self.size:
            row_elems = _pd(self.shape)
            if row_elems == 1 or self.shape[0] == 1:
                # Vectors are stored contiguously, without pitch.
                if gpudata is None:
                    self.gpudata = cuda.mem_alloc(self.size * itemsize)
                else:
                    self.gpudata = gpudata
                self.mem_size = self.size
                self.ld = row_elems
                self.M = 1
                self.N = self.size
            else:
                row_bytes = int(row_elems * itemsize)
                if gpudata is None:
                    self.gpudata, pitch = cuda.mem_alloc_pitch(
                        row_bytes, self.shape[0], itemsize)
                else:
                    self.gpudata = gpudata
                    expected_pitch = int(np.ceil(float(row_bytes) / 512) * 512)
                    if pitch is None:
                        pitch = expected_pitch
                    else:
                        assert pitch == expected_pitch

                # Floor division keeps the leading dimension an integer on
                # Python 3; for integer operands it matches Python 2's ``/``.
                self.ld = pitch // itemsize
                self.mem_size = self.ld * self.shape[0]
                self.M = self.shape[0]
                self.N = row_elems
        else:
            self.gpudata = None
            self.M = 0
            self.N = 0
            self.ld = 0
            self.mem_size = 0
            if gpudata is not None:
                # Call form prints the same text on Python 2 and Python 3.
                print("warning: shape may not be assigned properly")

        self.nbytes = itemsize * self.mem_size
        self._grid, self._block = splay(self.mem_size, self.M)
Пример #9
0
'''

#template = string.Template(template)
module = SourceModule(template)
func = module.get_function('convolutionRowGPU')

original = numpy.random.rand(2, 7) * 255
original = numpy.float32(original)

print original
'''
destImage_gpu = cuda.mem_alloc_like(original)
sourceImage_gpu = cuda.mem_alloc_like(original)
intermediateImage_gpu = cuda.mem_alloc_like(original)
'''
destImage_gpu, pit = cuda.mem_alloc_pitch(7 * 4, 2,
                                          numpy.dtype(numpy.float32).itemsize)
sourceImage_gpu, pit2 = cuda.mem_alloc_pitch(
    7 * 4, 2,
    numpy.dtype(numpy.float32).itemsize)
print pit, pit2

#cuda.memcpy_htod(sourceImage_gpu, original)
#cuda.memcpy_htod(destImage_gpu, original)
copy = cuda.Memcpy2D()
copy.set_src_host(original)
copy.set_dst_device(destImage_gpu)
copy.height = 2
copy.width_in_bytes = 7 * 4
copy.src_pitch = 7 * 4
copy.dst_pitch = 128 * 4
copy(aligned=True)
Пример #10
0
cuda_device = driver.Device(0)
print("cuda_device=%s" % cuda_device)
cuda_context = cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO
                                        | driver.ctx_flags.MAP_HOST)
try:
    print("cuda_context=%s" % cuda_context)

    BGRA2NV12 = get_CUDA_function(0, "BGRA_to_NV12")
    print("BGRA2NV12=%s" % BGRA2NV12)

    w = roundup(512, 32)
    h = roundup(512, 32)

    log("w=%s, h=%s", w, h)

    cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16)
    log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)),
        inputPitch)
    #allocate CUDA NV12 buffer (on device):
    cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16)
    log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch)

    #host buffers:
    inputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2,
                                          dtype=numpy.byte)
    log("inputBuffer=%s", inputBuffer)

    outputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2,
                                           dtype=numpy.byte)
    log("outputBuffer=%s", outputBuffer)
Пример #11
0
    def convert_image_rgb(self, image):
        """Convert a packed image to three output planes using the CUDA kernel.

        The packed pixels are uploaded to the GPU, ``self.kernel_function`` is
        launched to fill one pitched buffer per output plane, and the planes
        are copied back to host memory (keeping the GPU row padding).

        Args:
            image: image wrapper in packed format whose pixel format equals
                ``self.src_format``.

        Returns:
            An ``ImageWrapper`` with ``ImageWrapper._3_PLANES`` planes in
            ``self.dst_format``, sized ``self.dst_width`` x ``self.dst_height``.
        """
        start = time.time()
        iplanes = image.get_planes()
        w = image.get_width()
        h = image.get_height()
        stride = image.get_rowstride()
        pixels = image.get_pixels()
        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image,
              iplanes, type(pixels), len(pixels))
        assert iplanes == ImageWrapper.PACKED, "must use packed format as input"
        assert image.get_pixel_format(
        ) == self.src_format, "invalid source format: %s (expected %s)" % (
            image.get_pixel_format(), self.src_format)
        divs = get_subsampling_divs(self.dst_format)

        #copy packed rgb pixels to GPU:
        upload_start = time.time()
        stream = driver.Stream()
        mem = numpy.frombuffer(pixels, dtype=numpy.byte)
        in_buf = driver.mem_alloc(len(pixels))
        hmem = driver.register_host_memory(
            mem, driver.mem_host_register_flags.DEVICEMAP)
        pycuda.driver.memcpy_htod_async(in_buf, mem, stream)

        #allocate one pitched output buffer per plane while the upload runs:
        out_bufs = []
        out_strides = []
        out_sizes = []
        for i in range(3):
            x_div, y_div = divs[i]
            #floor division keeps the sizes integral on Python 3
            #(same result as "/" on Python 2 for integer operands):
            out_stride = roundup(self.dst_width // x_div, 4)
            out_height = roundup(self.dst_height // y_div, 2)
            out_buf, out_stride = driver.mem_alloc_pitch(
                out_stride, out_height, 4)
            out_bufs.append(out_buf)
            out_strides.append(out_stride)
            out_sizes.append((out_stride, out_height))
        #ensure uploading has finished:
        stream.synchronize()
        #we can now unpin the host memory:
        hmem.base.unregister()
        debug("allocation and upload took %.1fms",
              1000.0 * (time.time() - upload_start))

        kstart = time.time()
        kargs = [in_buf, numpy.int32(stride)]
        for i in range(3):
            kargs.append(out_bufs[i])
            kargs.append(numpy.int32(out_strides[i]))
        blockw, blockh = 16, 16
        #figure out how many pixels we process at a time in each dimension:
        xdiv = max([x[0] for x in divs])
        ydiv = max([x[1] for x in divs])
        #grid dimensions must be integers, hence floor division:
        gridw = max(1, w // blockw // xdiv)
        #NOTE(review): the factor 2 below is hard-coded rather than using
        #xdiv / ydiv - confirm this is intended for all subsampling modes
        if gridw * 2 * blockw < w:
            gridw += 1
        gridh = max(1, h // blockh // ydiv)
        if gridh * 2 * blockh < h:
            gridh += 1
        debug("calling %s%s, with grid=%s, block=%s",
              self.kernel_function_name, tuple(kargs), (gridw, gridh),
              (blockw, blockh, 1))
        self.kernel_function(*kargs,
                             block=(blockw, blockh, 1),
                             grid=(gridw, gridh))

        #we can now free the GPU source buffer:
        in_buf.free()
        kend = time.time()
        debug("%s took %.1fms", self.kernel_function_name,
              (kend - kstart) * 1000.0)
        self.frames += 1

        #copy output YUV channel data to host memory:
        read_start = time.time()
        pixels = []
        strides = []
        for i in range(3):
            #direct full plane async copy keeping current GPU padding:
            plane = driver.aligned_empty(out_sizes[i], dtype=numpy.byte)
            driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
            pixels.append(plane.data)
            strides.append(out_strides[i])
        stream.synchronize()
        #the copying has finished, we can now free the YUV GPU memory:
        #(the host memory will be freed by GC when 'pixels' goes out of scope)
        for out_buf in out_bufs:
            out_buf.free()
        self.cuda_context.synchronize()
        read_end = time.time()
        debug("strides=%s", strides)
        debug("read back took %.1fms, total time: %.1f",
              (read_end - read_start) * 1000.0, 1000.0 * (time.time() - start))
        return ImageWrapper(0,
                            0,
                            self.dst_width,
                            self.dst_height,
                            pixels,
                            self.dst_format,
                            24,
                            strides,
                            planes=ImageWrapper._3_PLANES)
Пример #12
0
    def convert_image_rgb(self, image):
        """Convert a packed image to three output planes using the CUDA kernel.

        The packed pixels are uploaded to the GPU, ``self.kernel_function`` is
        launched to fill one pitched buffer per output plane, and the planes
        are copied back to host memory (keeping the GPU row padding).

        Args:
            image: image wrapper in packed format whose pixel format equals
                ``self.src_format``.

        Returns:
            An ``ImageWrapper`` with ``ImageWrapper._3_PLANES`` planes in
            ``self.dst_format``, sized ``self.dst_width`` x ``self.dst_height``.
        """
        start = time.time()
        iplanes = image.get_planes()
        w = image.get_width()
        h = image.get_height()
        stride = image.get_rowstride()
        pixels = image.get_pixels()
        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
        assert iplanes==ImageWrapper.PACKED, "must use packed format as input"
        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
        divs = get_subsampling_divs(self.dst_format)

        #copy packed rgb pixels to GPU:
        upload_start = time.time()
        stream = driver.Stream()
        mem = numpy.frombuffer(pixels, dtype=numpy.byte)
        in_buf = driver.mem_alloc(len(pixels))
        hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
        pycuda.driver.memcpy_htod_async(in_buf, mem, stream)

        #allocate one pitched output buffer per plane while the upload runs:
        out_bufs = []
        out_strides = []
        out_sizes = []
        for i in range(3):
            x_div, y_div = divs[i]
            #floor division keeps the sizes integral on Python 3
            #(same result as "/" on Python 2 for integer operands):
            out_stride = roundup(self.dst_width//x_div, 4)
            out_height = roundup(self.dst_height//y_div, 2)
            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
            out_bufs.append(out_buf)
            out_strides.append(out_stride)
            out_sizes.append((out_stride, out_height))
        #ensure uploading has finished:
        stream.synchronize()
        #we can now unpin the host memory:
        hmem.base.unregister()
        debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start))

        kstart = time.time()
        kargs = [in_buf, numpy.int32(stride)]
        for i in range(3):
            kargs.append(out_bufs[i])
            kargs.append(numpy.int32(out_strides[i]))
        blockw, blockh = 16, 16
        #figure out how many pixels we process at a time in each dimension:
        xdiv = max([x[0] for x in divs])
        ydiv = max([x[1] for x in divs])
        #grid dimensions must be integers, hence floor division:
        gridw = max(1, w//blockw//xdiv)
        #NOTE(review): the factor 2 below is hard-coded rather than using
        #xdiv / ydiv - confirm this is intended for all subsampling modes
        if gridw*2*blockw<w:
            gridw += 1
        gridh = max(1, h//blockh//ydiv)
        if gridh*2*blockh<h:
            gridh += 1
        debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
        self.kernel_function(*kargs, block=(blockw,blockh,1), grid=(gridw, gridh))

        #we can now free the GPU source buffer:
        in_buf.free()
        kend = time.time()
        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
        self.frames += 1

        #copy output YUV channel data to host memory:
        read_start = time.time()
        pixels = []
        strides = []
        for i in range(3):
            #direct full plane async copy keeping current GPU padding:
            plane = driver.aligned_empty(out_sizes[i], dtype=numpy.byte)
            driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
            pixels.append(plane.data)
            strides.append(out_strides[i])
        stream.synchronize()
        #the copying has finished, we can now free the YUV GPU memory:
        #(the host memory will be freed by GC when 'pixels' goes out of scope)
        for out_buf in out_bufs:
            out_buf.free()
        self.cuda_context.synchronize()
        read_end = time.time()
        debug("strides=%s", strides)
        debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))
        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)