import numpy as np
import pyopencl as cl
from time import time


def test_gpu_vector_sum(a, b):
    # Define the PyOpenCL context and a command queue with profiling enabled
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Prepare the device buffers: the inputs are copied from the host,
    # the output is write-only and sized to match
    a_buffer = cl.Buffer(
        context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)
    b_buffer = cl.Buffer(
        context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b)
    c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, b.nbytes)
    program = cl.Program(context, """
    __kernel void sum(__global const float *a,
                      __global const float *b,
                      __global float *c)
    {
        int i = get_global_id(0);
        int j;
        /* repeat the addition so the kernel has a measurable runtime */
        for (j = 0; j < 10000; j++) {
            c[i] = a[i] + b[i];
        }
    }""").build()
    # Start the GPU test
    gpu_start_time = time()
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    elapsed = 1e-9 * (event.profile.end - event.profile.start)
    print("GPU Kernel evaluation Time: {0} s".format(elapsed))
    c_gpu = np.empty_like(a)
    cl._enqueue_read_buffer(queue, c_buffer, c_gpu).wait()
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
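# A minimal driver for the function above might look like the following.
# The vector length is an illustrative assumption; float32 inputs are
# required because the kernel declares float pointers.
if __name__ == "__main__":
    a = np.random.rand(10_000_000).astype(np.float32)
    b = np.random.rand(10_000_000).astype(np.float32)
    c = test_gpu_vector_sum(a, b)
    assert np.allclose(c, a + b)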
def get(self, queue=None, ary=None, async_=None):
    if ary is None:
        ary = np.empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size, "size does not match"
        assert ary.dtype == self.dtype, "type does not match"
    if cla._equal_strides(ary.strides, self.strides, self.shape):
        # Strides match: a single linear read suffices; block unless the
        # caller asked for an asynchronous transfer
        cl._enqueue_read_buffer(
            queue or self.queue, mem=self.base_data, hostbuf=ary,
            device_offset=self.offset, wait_for=self.events,
            is_blocking=not async_)
    else:
        # Strides differ: fall back to a rectangular (strided) read
        kwargs = get_rect_kwargs(ary, self)
        cl._enqueue_read_buffer_rect(
            queue or self.queue, mem=self.base_data, hostbuf=ary,
            is_blocking=not async_, wait_for=self.events, **kwargs)
    return ary
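# A sketch of how this method is reached through the public pyopencl.array
# API; the context/queue setup and array contents are illustrative:
import numpy as np
import pyopencl as cl
import pyopencl.array as cla

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
host = np.arange(16, dtype=np.float32)
dev = cla.to_device(queue, host)   # upload the host array to the device
roundtrip = dev.get()              # read it back via get() as shown above
assert np.array_equal(host, roundtrip)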
res_buffer = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, wgs * cu * 4)
start_time = int(time.time()) - 1  # -1 to avoid div by 0 on the time check
nodes = 0
calls = 0
active = 1
while active > 0:
    # Create, configure, and execute the kernel
    program.solve(queue, (wgs * cu, ), (wgs, ), np.int32(10000),
                  search_buffer, nfound_buffer, lm_placed, lm_used,
                  lm_mindepth, lm_depth, res_buffer, np.int32(calls % 2))
    calls += 1
    # if calls % 10 == 0:
    #     print('reading results')
    # Read the per-worker results back to the host; wait on the last read
    cl._enqueue_read_buffer(queue, search_buffer, search_data)
    cl._enqueue_read_buffer(queue, res_buffer, res_data)
    cl._enqueue_read_buffer(queue, nfound_buffer, nfound_data).wait()
    nodes2 = sum(map(int, res_data))
    nodes += nodes2
    if calls % 10 == 0:
        # Count the workers that still have a live search state
        active = 0
        for i in range(wgs * cu):
            if search_data[i] >= 0:
                active += 1
                x = [(fit2[x][0] + 1, fit2[x][1])
                     for x in list(search_data[wgs * cu + width * height * i:
                                               wgs * cu + width * height * (i + 1)])]
solcount = 0
calls = 0
solutions = 0
max_found = 0
start_time = int(time.time()) - 1  # -1 to avoid div by 0 on the time check
nodes = 0
last_time = 0
while True:
    prog.mykernel(queue, (cu * wgs, ), (wgs, ), piece_buffer, worker_buffer,
                  np.int32(len(pos_list)), nassign_buffer, found_buffer,
                  nfound_buffer, np.int32(limit), np.int32(width * height),
                  np.int32(node_limit), lm, res_buffer)
    calls += 1
    # Read the worker state back to the host; wait on the last read
    cl._enqueue_read_buffer(queue, piece_buffer, piece_data)
    cl._enqueue_read_buffer(queue, worker_buffer, worker_pos)
    cl._enqueue_read_buffer(queue, nassign_buffer, nassign_data)
    # cl._enqueue_read_buffer(queue, found_buffer, found_data)
    cl._enqueue_read_buffer(queue, nfound_buffer, nfound_data)
    cl._enqueue_read_buffer(queue, res_buffer, res_data).wait()
    if nassign_data[0] > len(pos_list):
        nassign_data[0] = len(pos_list)
    last_nodes = 0
    for i in range(wgs * cu):
        last_nodes += int(res_data[i])
    if last_nodes == 0:
        break
    nodes += last_nodes
    if calls % 10 == 0:
        workers_left = 0
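# Note on the read-back pattern used in both loops above: several reads are
# enqueued and .wait() is called only on the last one. With an in-order
# command queue (the default, since neither queue was created with the
# out-of-order property), commands execute in submission order, so waiting on
# the last read's event also guarantees all earlier reads have completed.
# A minimal sketch of the pattern, reusing names from the fragment above:
last_evt = None
for buf, host in [(nfound_buffer, nfound_data), (res_buffer, res_data)]:
    last_evt = cl._enqueue_read_buffer(queue, buf, host)
last_evt.wait()  # in-order queue: every prior read is done once this returns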
}""").build() # Declaration of buffers for which GPU operations will be performed and passing host variables to them buffer_in = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=img) buffer_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, out_gpu.nbytes) buffer_width = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=np.int32(img_width)) buffer_height = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=np.int32(img_height)) # Start of counting time for the GPU start_gpu = time.time() # Calling a function that performs operations on the graphics card and writing the value of the output buffer to a host variable program.calculate_conv(queue, img.shape, None, buffer_in, buffer_out, buffer_width, buffer_height) cl._enqueue_read_buffer(queue, buffer_out, out_gpu).wait() # End of counting time for the GPU end_gpu = time.time() print("Time of executing operations for GPU : ", end_gpu - start_gpu) cv2.imwrite('gpu_conv.jpg', out_gpu) print( "The image results for GPU and CPU were generated as cpu_conv.jpg and gpu_conv.jpg. The results should be the same." )