def aligned_mem(a):
    #allocate page-aligned host memory matching the shape/dtype of 'a'
    #and copy its contents in, so the buffer can later be pinned:
    temp = drv.aligned_empty(a.shape, dtype=a.dtype, order='C')
    if len(a.shape) == 1:
        temp[:] = a
    else:
        temp[:, :] = a
    return temp
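# Hypothetical usage sketch (an assumption, not from the original sources): how a
# helper like aligned_mem() above could feed register_host_memory(), which needs
# page-aligned storage. Assumes the usual aliases 'import pycuda.driver as drv'
# and 'import numpy as np'.
def pinned_upload_example(a):
    aligned = aligned_mem(a)                    #page-aligned copy of 'a'
    pinned = drv.register_host_memory(aligned)  #pin it for DMA transfers
    gpu_buf = drv.mem_alloc_like(pinned)
    drv.memcpy_htod(gpu_buf, pinned)            #async variants also work on pinned memory
    return gpu_buf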
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2**20,), np.float64, alignment=4096)
    drv.register_host_memory(a)
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    #pin an existing, suitably aligned host buffer...
    a = drv.aligned_empty((2**20,), np.float64)
    a_pin = drv.register_host_memory(a)

    #...so the host-to-device copy can be issued asynchronously:
    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()
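# A follow-on sketch (assumed, not part of the original test): completing the
# round trip by copying the data back and verifying it, using the same
# 'drv'/'np' aliases. pagelocked_empty_like() gives a page-locked destination,
# so the device-to-host copy can also run asynchronously on the stream.
def roundtrip_check():
    a = drv.aligned_empty((2**20,), np.float64)
    a[:] = np.random.rand(*a.shape)
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)

    result = drv.pagelocked_empty_like(a)
    drv.memcpy_dtoh_async(result, gpu_ary, stream)
    stream.synchronize()
    assert np.allclose(a, result)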
def convert_image_rgb(self, image):
    global program
    start = time.time()
    iplanes = image.get_planes()
    w = image.get_width()
    h = image.get_height()
    stride = image.get_rowstride()
    pixels = image.get_pixels()
    debug("convert_image(%s) planes=%s, pixels=%s, size=%s",
          image, iplanes, type(pixels), len(pixels))
    assert iplanes == ImageWrapper.PACKED, "must use packed format as input"
    assert image.get_pixel_format() == self.src_format, \
        "invalid source format: %s (expected %s)" % (
            image.get_pixel_format(), self.src_format)
    divs = get_subsampling_divs(self.dst_format)

    #copy packed rgb pixels to GPU:
    upload_start = time.time()
    stream = driver.Stream()
    mem = numpy.frombuffer(pixels, dtype=numpy.byte)
    in_buf = driver.mem_alloc(len(pixels))
    hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
    pycuda.driver.memcpy_htod_async(in_buf, mem, stream)

    out_bufs = []
    out_strides = []
    out_sizes = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_stride = roundup(self.dst_width // x_div, 4)
        out_height = roundup(self.dst_height // y_div, 2)
        out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
        out_bufs.append(out_buf)
        out_strides.append(out_stride)
        out_sizes.append((out_stride, out_height))
    #ensure uploading has finished:
    stream.synchronize()
    #we can now unpin the host memory:
    hmem.base.unregister()
    debug("allocation and upload took %.1fms", 1000.0 * (time.time() - upload_start))

    kstart = time.time()
    kargs = [in_buf, numpy.int32(stride)]
    for i in range(3):
        kargs.append(out_bufs[i])
        kargs.append(numpy.int32(out_strides[i]))
    blockw, blockh = 16, 16
    #figure out how many pixels we process at a time in each dimension:
    xdiv = max([x[0] for x in divs])
    ydiv = max([x[1] for x in divs])
    gridw = max(1, w // blockw // xdiv)
    if gridw * 2 * blockw < w:
        gridw += 1
    gridh = max(1, h // blockh // ydiv)
    if gridh * 2 * blockh < h:
        gridh += 1
    debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name,
          tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
    self.kernel_function(*kargs, block=(blockw, blockh, 1), grid=(gridw, gridh))
    #we can now free the GPU source buffer:
    in_buf.free()
    kend = time.time()
    debug("%s took %.1fms", self.kernel_function_name, (kend - kstart) * 1000.0)
    self.frames += 1

    #copy output YUV channel data to host memory:
    read_start = time.time()
    pixels = []
    strides = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_size = out_sizes[i]
        #direct full plane async copy keeping current GPU padding:
        plane = driver.aligned_empty(out_size, dtype=numpy.byte)
        driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
        pixels.append(plane.data)
        stride = out_strides[min(len(out_strides) - 1, i)]
        strides.append(stride)
    stream.synchronize()
    #the copying has finished, we can now free the YUV GPU memory:
    #(the host memory will be freed by GC when 'pixels' goes out of scope)
    for out_buf in out_bufs:
        out_buf.free()
    self.cuda_context.synchronize()
    read_end = time.time()
    debug("strides=%s", strides)
    debug("read back took %.1fms, total time: %.1f",
          (read_end - read_start) * 1000.0, 1000.0 * (time.time() - start))
    return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels,
                        self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
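# For reference, a plausible definition of the roundup() helper used above
# (an assumption; the original project defines it elsewhere): round 'n' up
# to the nearest multiple of 'm', as needed for stride/height padding.
def roundup(n, m):
    return (n + m - 1) // m * m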