import numpy as np
import pyopencl as cl
from time import time

def test_gpu_vector_sum(a, b):
    # define the PyOpenCL context, device, and a profiling-enabled command queue
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # prepare the device buffers: a and b are copied from the host, c is output-only
    a_buffer = cl.Buffer(
        context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)
    b_buffer = cl.Buffer(
        context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b)
    c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, b.nbytes)
    program = cl.Program(context, """
    __kernel void sum(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);
        int j;
        // repeat the addition 10000 times so the timed kernel does a non-trivial amount of work
        for(j = 0; j < 10000; j++)
        {
            c[i] = a[i] + b[i];
        }
    }""").build()
    # run the kernel and time it, both via event profiling and via wall-clock time
    gpu_start_time = time()
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    elapsed = 1e-9 * (event.profile.end - event.profile.start)  # profiling timestamps are in ns
    print("GPU kernel evaluation time: {0} s".format(elapsed))
    c_gpu = np.empty_like(a)
    cl._enqueue_read_buffer(queue, c_buffer, c_gpu).wait()  # copy the result back to the host
    gpu_end_time = time()
    print("GPU time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
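A quick way to exercise this function is with two equal-length float32 arrays; the size below is arbitrary, but float32 matters because the kernel declares its arguments as float pointers.

# Hedged usage sketch: array size and values are arbitrary, only the float32 dtype is required.
a = np.random.rand(50000).astype(np.float32)
b = np.random.rand(50000).astype(np.float32)
c = test_gpu_vector_sum(a, b)
assert np.allclose(c, a + b)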
Example #2
    def get(self, queue=None, ary=None, async_=None):
        if ary is None:
            ary = np.empty(self.shape, self.dtype)
        else:
            assert ary.size == self.size, "size does not match"
            assert ary.dtype == self.dtype, "type does not match"

        # fast path: matching strides let us read the device buffer straight into ary
        if cla._equal_strides(ary.strides, self.strides, self.shape):
            cl._enqueue_read_buffer(queue or self.queue,
                                    mem=self.base_data,
                                    hostbuf=ary,
                                    device_offset=self.offset,
                                    wait_for=self.events,
                                    is_blocking=not async_)
        else:
            # strides differ, so fall back to a rectangular (strided) read
            kwargs = get_rect_kwargs(ary, self)
            cl._enqueue_read_buffer_rect(queue or self.queue,
                                         mem=self.base_data,
                                         hostbuf=ary,
                                         is_blocking=not async_,
                                         wait_for=self.events,
                                         **kwargs)
        return ary
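This get() closely resembles the public pyopencl.array API; a self-contained round trip with stock PyOpenCL would look roughly like the sketch below (a sketch only, not the class the method above belongs to).

# Hedged sketch using the public pyopencl.array API, which the get() above mirrors.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host = np.arange(16, dtype=np.float32)
dev = cl_array.to_device(queue, host)   # host -> device
back = dev.get()                        # device -> host, the same direction as the method above
assert np.array_equal(host, back)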
Example #3
res_buffer = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, wgs * cu * 4)  # one 32-bit slot per work-item

start_time = int(time.time()) - 1  # -1 to avoid div by 0 on the time check

nodes = 0
calls = 0
active = 1
while active > 0:
    # Create, configure, and execute kernel
    program.solve(queue, (wgs * cu, ), (wgs, ), np.int32(10000), search_buffer,
                  nfound_buffer, lm_placed, lm_used, lm_mindepth, lm_depth,
                  res_buffer, np.int32(calls % 2))
    calls += 1
    #if calls % 10 == 0:
    #print('reading results')
    # copy the search state, per-work-item node counts, and solution count back to the host
    cl._enqueue_read_buffer(queue, search_buffer, search_data)
    cl._enqueue_read_buffer(queue, res_buffer, res_data)
    cl._enqueue_read_buffer(queue, nfound_buffer, nfound_data).wait()

    nodes2 = sum(map(int, res_data))
    nodes += nodes2

    if calls % 10 == 0:
        active = 0
        for i in range(wgs * cu):
            if search_data[i] >= 0:
                active += 1
                x = [(fit2[x][0] + 1, fit2[x][1])
                     for x in list(search_data[wgs * cu +
                                               width * height * i:wgs * cu +
                                               width * height * (i + 1)])
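The loop above reads into host arrays that are not shown being created; based on how they are indexed here, a setup along the following lines is assumed (the names come from the snippet, the sizes and dtypes are guesses).

# Hypothetical host-side allocations matching the reads above; sizes and dtypes are assumptions.
search_data = np.zeros(wgs * cu * (1 + width * height), dtype=np.int32)  # status word + board per work-item
res_data = np.zeros(wgs * cu, dtype=np.int32)   # nodes processed per work-item
nfound_data = np.zeros(1, dtype=np.int32)       # number of solutions found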
Example #4
solcount = 0
calls = 0
solutions = 0
max_found = 0

start_time = int(time.time()) - 1  # -1 to avoid div by 0 on the time check
nodes = 0
last_time = 0

while True:
    prog.mykernel(queue, (cu * wgs, ), (wgs, ), piece_buffer, worker_buffer,
                  np.int32(len(pos_list)), nassign_buffer, found_buffer,
                  nfound_buffer, np.int32(limit), np.int32(width * height),
                  np.int32(node_limit), lm, res_buffer)
    calls += 1
    # read the result buffers back to the host; with the default in-order queue,
    # waiting on the last read ensures all of them have finished
    cl._enqueue_read_buffer(queue, piece_buffer, piece_data)
    cl._enqueue_read_buffer(queue, worker_buffer, worker_pos)
    cl._enqueue_read_buffer(queue, nassign_buffer, nassign_data)
    #cl._enqueue_read_buffer(queue, found_buffer, found_data)
    cl._enqueue_read_buffer(queue, nfound_buffer, nfound_data)
    cl._enqueue_read_buffer(queue, res_buffer, res_data).wait()
    if nassign_data[0] > len(pos_list):
        nassign_data[0] = len(pos_list)
    # total the nodes processed in this call; if none were processed, every worker is done
    last_nodes = 0
    for i in range(wgs * cu):
        last_nodes += int(res_data[i])
    if last_nodes == 0:
        break
    nodes += last_nodes
    if calls % 10 == 0:
        workers_left = 0
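Neither this example nor the previous one shows how the lm / lm_placed / lm_used arguments were created; in PyOpenCL, local-memory kernel arguments are instances of cl.LocalMemory. A minimal sketch, with the byte sizes as pure guesses:

# Hypothetical construction of the local-memory kernel arguments used above;
# the byte sizes are guesses and depend on what the kernels actually need.
lm = cl.LocalMemory(wgs * 4)                      # e.g. one 32-bit slot per work-item
lm_placed = cl.LocalMemory(wgs * width * height)  # e.g. per-work-group board scratch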
}""").build()

# Create the device buffers the GPU will operate on and copy the host data into the input buffers
buffer_in = cl.Buffer(context,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=img)
buffer_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, out_gpu.nbytes)
buffer_width = cl.Buffer(context,
                         cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                         hostbuf=np.int32(img_width))
buffer_height = cl.Buffer(context,
                          cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=np.int32(img_height))

# Start timing the GPU path
start_gpu = time.time()

# Run the convolution kernel and copy the output buffer back into the host array out_gpu
program.calculate_conv(queue, img.shape, None, buffer_in, buffer_out,
                       buffer_width, buffer_height)
cl._enqueue_read_buffer(queue, buffer_out, out_gpu).wait()

# Stop timing the GPU path
end_gpu = time.time()

print("GPU execution time: ", end_gpu - start_gpu)
cv2.imwrite('gpu_conv.jpg', out_gpu)
print("The image results for GPU and CPU were generated as cpu_conv.jpg and gpu_conv.jpg. "
      "The results should be the same.")
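The convolution snippet begins after its kernel has already been compiled; the host-side preamble it relies on is roughly the sketch below (the input file name, the grayscale/float32 conversion, and the dtype are assumptions, and the calculate_conv kernel source is not reproduced).

# Hypothetical preamble for the convolution snippet above; only names the snippet
# uses are defined, and the kernel source itself is omitted.
import time
import numpy as np
import pyopencl as cl
import cv2

context = cl.create_some_context()
queue = cl.CommandQueue(context)

img = cv2.imread('input.jpg', cv2.IMREAD_GRAYSCALE).astype(np.float32)  # assumed input image
img_height, img_width = img.shape
out_gpu = np.empty_like(img)  # destination for buffer_out
# program = cl.Program(context, kernel_source).build()  # kernel_source must define calculate_conv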