def main():
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    with open(PROGRAM_FILE) as prg_file:
        prg = cl.Program(ctx, prg_file.read()).build()

    targets_buf = cl.Buffer(ctx,
                            cl.mem_flags.READ_ONLY
                            | cl.mem_flags.COPY_HOST_PTR,
                            hostbuf=TARGETS_NP)

    results_np = np.zeros(np.shape(TARGETS_NP)[0], dtype=np.uint32)
    results_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, results_np.nbytes)
    cl.enqueue_fill_buffer(
        queue, results_buf, np.uint32(0), 0,
        np.dtype(np.uint32).itemsize * np.shape(results_np)[0], None)
    success_np = np.zeros(np.shape(TARGETS_NP)[0], dtype=np.uint32)
    success_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, success_np.nbytes)
    cl.enqueue_fill_buffer(
        queue, success_buf, np.uint32(0), 0,
        np.dtype(np.uint32).itemsize * np.shape(success_np)[0], None)
    #prg.ipv4_hash(queue, (WORKITEM_NB,), (WORKITEM_PER_GROUP,), targets_buf, success_buf, results_buf)
    prg.ipv4_hash(queue, (WORKITEM_TOTAL // WORKITEM_ITER, ), None,
                  np.uint32(WORKITEM_ITER), targets_buf, success_buf,
                  results_buf)

    cl.enqueue_copy(queue, results_np, results_buf)

    cl.enqueue_copy(queue, success_np, success_buf)
    #queue.finish()

    print(success_np)
    print(results_np)
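Most examples on this page follow the same fill, launch, copy pattern shown above. A minimal self-contained sketch (no kernel, sizes assumed) illustrating that the fill pattern is repeated across the region and that offset and size are byte counts:

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host = np.empty(16, dtype=np.uint32)
buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, host.nbytes)

# the 4-byte pattern np.uint32(0) is repeated; offset and size are in bytes
cl.enqueue_fill_buffer(queue, buf, np.uint32(0), 0, host.nbytes)
cl.enqueue_copy(queue, host, buf)
queue.finish()
assert (host == 0).all()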
Example #2
    def evaluator(coeffs):
        """Actually evaluate the near-field correction."""

        result = _np.empty(4 * ncoeffs, dtype=result_type)
        with bempp.api.Timer(message="Singular Corrections Evaluator"):
            with _cl.CommandQueue(ctx, device=device) as queue:
                _cl.enqueue_copy(queue, coefficients_buffer,
                                 coeffs.astype(result_type))
                _cl.enqueue_fill_buffer(
                    queue,
                    result_buffer,
                    _np.uint8(0),
                    0,
                    result_type.itemsize * ncoeffs,
                )
                kernel(
                    queue,
                    (grid.number_of_elements, ),
                    (1, ),
                    grid_buffer,
                    neighbor_indices_buffer,
                    neighbor_indexptr_buffer,
                    points_buffer,
                    coefficients_buffer,
                    result_buffer,
                    kernel_parameters_buffer,
                    _np.uint32(grid.number_of_elements),
                )
                _cl.enqueue_copy(queue, result, result_buffer)

        return result
Example #3
    def execute(self, q, repeat=1, unbind=True):
        for r in range(repeat):
            cl.enqueue_fill_buffer(q, self.zero_args[0], np.float32(0), 0, self.zero_args[2] * 4)
            call_cl_kernel(self.kernel, q, *self.launch_args)

        if unbind:
            self.zero_args = self.convert_args = None
            self.launch_args[2:7] = (None,) * 5
Example #4
    def evaluator(x):
        """Evaluate a potential."""
        result = _np.empty(kernel_dimension * npoints, dtype=result_type)
        with _cl.CommandQueue(ctx, device=device) as queue:
            _cl.enqueue_copy(queue, coefficients_buffer, x.astype(result_type))
            _cl.enqueue_fill_buffer(
                queue,
                result_buffer,
                _np.uint8(0),
                0,
                kernel_dimension * npoints * result_type.itemsize,
            )
            if main_size > 0:
                _cl.enqueue_fill_buffer(queue, sum_buffer, _np.uint8(0), 0,
                                        sum_size)
                main_kernel(
                    queue,
                    (npoints, main_size // vector_width),
                    (1, WORKGROUP_SIZE_POTENTIAL // vector_width),
                    grid_buffer,
                    indices_buffer,
                    normals_buffer,
                    points_buffer,
                    coefficients_buffer,
                    quad_points_buffer,
                    quad_weights_buffer,
                    sum_buffer,
                    kernel_options_buffer,
                )

                sum_kernel(
                    queue,
                    (kernel_dimension * npoints, ),
                    (1, ),
                    sum_buffer,
                    result_buffer,
                    _np.uint32(nelements // WORKGROUP_SIZE_POTENTIAL),
                )

            if remainder_size > 0:
                remainder_kernel(
                    queue,
                    (npoints, remainder_size),
                    (1, remainder_size),
                    grid_buffer,
                    indices_buffer,
                    normals_buffer,
                    points_buffer,
                    coefficients_buffer,
                    quad_points_buffer,
                    quad_weights_buffer,
                    result_buffer,
                    kernel_options_buffer,
                    global_offset=(0, main_size),
                )

            _cl.enqueue_copy(queue, result, result_buffer)
        return result
Example #5
    def execute(self, q, repeat=1, unbind=True):
        for r in range(repeat):
            cl.enqueue_fill_buffer(q, self.zero_args[0], np.float32(0), 0,
                                   self.zero_args[2] * 4)
            call_cl_kernel(self.kernel, q, *self.launch_args)

        if unbind:
            self.zero_args = self.convert_args = None
            self.launch_args[2:7] = (None, ) * 5
Example #6
    def _init_normal_memory(self):

        mem_size = SmithWatermanOcl._init_normal_memory(self)

        # Input matrix device memory
        memory = (SmithWaterman.float_size * self.length_of_x_sequences *
                  self.number_of_sequences * self.length_of_y_sequences *
                  self.number_targets)
        if self._need_reallocation(self.d_matrix, memory):
            self.d_matrix = cl.Buffer(self.ctx,
                                      cl.mem_flags.READ_WRITE,
                                      size=memory)
        mem_size += memory
        if self.gap_extension:
            if self._need_reallocation(self.d_matrix_i, memory):
                self.d_matrix_i = cl.Buffer(self.ctx,
                                            cl.mem_flags.READ_WRITE,
                                            size=memory)
            mem_size += memory
            if self._need_reallocation(self.d_matrix_j, memory):
                self.d_matrix_j = cl.Buffer(self.ctx,
                                            cl.mem_flags.READ_WRITE,
                                            size=memory)
            mem_size += memory

        # Maximum global device memory
        memory = (SmithWaterman.float_size * self.x_div_shared_x *
                  self.number_of_sequences * self.y_div_shared_y *
                  self.number_targets)
        if self._need_reallocation(self.d_global_maxima, memory):
            self.d_global_maxima = cl.Buffer(self.ctx,
                                             cl.mem_flags.READ_WRITE,
                                             size=memory)
        mem_size += memory

        memory = (self.length_of_x_sequences * self.number_of_sequences *
                  self.length_of_y_sequences * self.number_targets)
        if self._need_reallocation(self.d_global_direction, memory):
            self.d_global_direction = cl.Buffer(self.ctx,
                                                cl.mem_flags.READ_WRITE,
                                                size=memory)
        mem_size += memory

        memory = SmithWaterman.int_size
        if self._need_reallocation(self.d_is_traceback_required, memory):
            self.d_is_traceback_required = cl.Buffer(self.ctx,
                                                     cl.mem_flags.WRITE_ONLY,
                                                     size=memory)
            flag = numpy.zeros((1), dtype=numpy.uint32)
            cl.enqueue_fill_buffer(self.queue,
                                   self.d_is_traceback_required,
                                   flag,
                                   0,
                                   size=memory)

        return mem_size
Example #7
def sync(self):
    failure = np.empty(1, dtype=np.int32)
    cl.enqueue_copy(self.queue, failure, self.global_failure, is_blocking=True)
    self.failure_is_an_option = np.int32(0)
    if failure[0] >= 0:
        # Reset failure information.
        cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1), 0, np.int32().itemsize)

        # Read failure args.
        failure_args = np.empty(self.global_failure_args_max+1, dtype=np.int32)
        cl.enqueue_copy(self.queue, failure_args, self.global_failure_args, is_blocking=True)

        raise Exception(self.failure_msgs[failure[0]].format(*failure_args))
Example #8
def core_search(partial_outputs, a):
        search_len_np = np.array(args.max_skip, dtype=np.uint32)
        result_count_cl = cl.Buffer(ctx, mem_flags.READ_WRITE, uint32_size)
        result_cl = cl.Buffer(ctx, mem_flags.WRITE_ONLY, max_results*uint32_size) 
        cl.enqueue_fill_buffer(queue, result_count_cl, b"\x00", 0, uint32_size)
        result_max_np = np.array(max_results, dtype=np.uint32)
        a_np = np.array(a, dtype=np.uint32)
        outputs_np = np.array(partial_outputs, dtype=np.uint32)
        outputs_cl = cl.Buffer(ctx, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=outputs_np) 
        outputs_len_np = np.array(len(outputs_np), dtype=np.uint32)

        program.node_newer_rng(queue, [1<<16], None, search_len_np, result_max_np, a_np, outputs_cl, outputs_len_np, result_count_cl, result_cl)

        return result_count_cl, result_cl
Example #9
    def memset(self, buffer, value, size):
        """set the memory in allocation to the value in value

        :param allocation: An OpenCL Buffer to fill
        :type allocation: pyopencl.Buffer

        :param value: The value to set the memory to
        :type value: a single 32-bit int

        :param size: The size of to the allocation unit in bytes
        :type size: int

        """
        if isinstance(buffer, cl.Buffer):
            cl.enqueue_fill_buffer(self.queue, buffer, numpy.uint32(value), 0, size)
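The docstring above describes a 32-bit fill; spelled out as a standalone sketch (the context, queue and size here are assumed, not part of the original class):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

size = 256  # bytes
buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size)
# equivalent of memset(buf, 0xFF, size): the 32-bit pattern 0x000000FF repeated over `size` bytes
cl.enqueue_fill_buffer(queue, buf, np.uint32(0xFF), 0, size)
queue.finish()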
Example #10
def test_scatter(cl_env, value_dtype, index_dtype):
    ctx, cq = cl_env

    size = 240
    nindices = 30
    indexer = Indexer(ctx, value_dtype, index_dtype)
    values = (np.random.uniform(0, 1000, (nindices,) + value_dtype.shape)
              .astype(value_dtype.base))
    indices = np.random.choice(size, size=nindices, replace=False).astype(index_dtype)

    values_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=values
    )
    values_out_buf = cl.Buffer(
        ctx, cl.mem_flags.WRITE_ONLY, size * value_dtype.itemsize
    )
    index_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=indices
    )

    e = cl.enqueue_fill_buffer(
        cq, values_out_buf, np.full(1, 1.0, value_dtype), 0, size * value_dtype.itemsize
    )
    e = indexer.scatter(cq, nindices, values_buf, index_buf, values_out_buf, wait_for=[e])
    (values_map, _) = cl.enqueue_map_buffer(
        cq, values_out_buf, cl.map_flags.READ,
        0, (size,) + value_dtype.shape, value_dtype.base,
        wait_for=[e], is_blocking=True
    )

    selection = np.zeros(size, dtype='bool')
    selection[indices] = True
    np.testing.assert_equal(values_map[indices], values)
    np.testing.assert_equal(values_map[~selection], 1.0)
Example #11
    def run(self, idx, runev):
        fillev = cl.enqueue_fill_buffer(
            self.queue_, self.retCnt_[idx], np.uint8(0), 0, 4)
        runev = cl.enqueue_nd_range_kernel(
            self.queue_, self.kernel_[idx], (self.threadCount_,), None,
            wait_for=[fillev, runev])
        self.queue_.flush()
        return runev
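enqueue_fill_buffer returns an event, which the example above feeds into wait_for so the kernel only runs after the fill. A minimal sketch of the same idea using just a fill and a read-back (assumed sizes, no kernel):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

counter = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4)
fill_ev = cl.enqueue_fill_buffer(queue, counter, np.uint32(0), 0, 4)
out = np.empty(1, dtype=np.uint32)
# the copy is only started once the fill event has completed
cl.enqueue_copy(queue, out, counter, wait_for=[fill_ev]).wait()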
Example #12
    def fill(self, value):
        if not isinstance(value, np.ndarray):
            value = np.array(value)
        if len(value.shape) == 0:
            value = value.astype(self.gtype.dtype)
            cl.enqueue_fill_buffer(
                queue, self.buffer, value, 0,
                self.gtype.elemsize * prod(self.shape))
        elif value.dtype != self.gtype.dtype:
            raise ValueError(
                "Matrix fill error: the given host buffer does not have the same data type as the matrix"
            )
        elif value.shape != self.shape:
            raise ValueError(
                "Matrix fill error: the given host buffer does not have the same shape as the matrix"
            )
        else:
            cl.enqueue_copy(queue, self.buffer, value)
Example #13
    def pol3d_to_slice2d(self, P1, P2):
        S = np.zeros((self.row_len, self.col_len), dtype=np.float64)

        cl.enqueue_copy(self.queue, self.buf_P1, P1)
        cl.enqueue_copy(self.queue, self.buf_P2, P2)
        cl.enqueue_fill_buffer(self.queue, self.buf_S, np.float64(0.0), 0,
                               S.nbytes)

        self.prg.pol3d_to_slice2d(self.queue, (self.row_len, self.col_len),
                                  None, self.buf_fmt, self.buf_P1, self.buf_P2,
                                  self.buf_S)

        cl.enqueue_copy(self.queue, S, self.buf_S)

        cl.enqueue_barrier(self.queue).wait()

        return S
Example #14
    def pol3d_to_cart2d(self, P1, P2):
        M = np.zeros((self.row_len, self.col_len), dtype=np.float64)

        cl.enqueue_copy(self.queue, self.buf_P1, P1)
        cl.enqueue_copy(self.queue, self.buf_P2, P2)
        cl.enqueue_fill_buffer(self.queue, self.buf_M, np.float64(0.0), 0,
                               M.nbytes)

        self.prg.pol3d_to_cart2d(self.queue, (self.row_len, self.col_len),
                                 None, self.buf_fmt, self.buf_P1, self.buf_P2,
                                 self.buf_M)

        cl.enqueue_copy(self.queue, M, self.buf_M)

        cl.enqueue_barrier(self.queue).wait()

        return M
Example #15
    def _is_traceback_required(self):
        '''Returns False if it is known after calculating scores that there are no possible
        starting points, hence no need to run traceback.
        '''
        flag = numpy.zeros((1), dtype=numpy.uint32)
        cl.enqueue_copy(self.queue, flag, self.d_is_traceback_required)
        if flag[0]:
            # Clear the flag
            flag[0] = 0
            cl.enqueue_fill_buffer(self.queue,
                                   self.d_is_traceback_required,
                                   flag,
                                   0,
                                   size=SmithWaterman.int_size)
            return True
        else:
            return False
Example #16
    def fillBuf(self, bufname, val, wait_for=None):
        self.log("fillBuf " + bufname)

        buf = self.bufs[bufname]
        buftype = np.dtype(self.buf_spec[bufname][0]).type

        self.logevt('fill_buffer',
            cl.enqueue_fill_buffer(self.queue, buf, buftype(val), 0, buf.size,
                                   wait_for=self._waitevt()))
Example #17
def core_search(partial_outputs, a):
    search_len_np = np.array(args.max_skip, dtype=np.uint32)
    result_count_cl = cl.Buffer(ctx, mem_flags.READ_WRITE, uint32_size)
    result_cl = cl.Buffer(ctx, mem_flags.WRITE_ONLY, max_results * uint32_size)
    cl.enqueue_fill_buffer(queue, result_count_cl, b"\x00", 0, uint32_size)
    result_max_np = np.array(max_results, dtype=np.uint32)
    a_np = np.array(a, dtype=np.uint32)
    outputs_np = np.array(partial_outputs, dtype=np.uint32)
    outputs_cl = cl.Buffer(ctx,
                           mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=outputs_np)
    outputs_len_np = np.array(len(outputs_np), dtype=np.uint32)

    program.node_newer_rng(queue, [1 << 16], None, search_len_np,
                           result_max_np, a_np, outputs_cl, outputs_len_np,
                           result_count_cl, result_cl)

    return result_count_cl, result_cl
Example #18
    def Solve(self, t, dt):

        cl.enqueue_fill_buffer(self.queue, self.Ku_buf, np.float64(0.0), 0,
                               self.LM.nbytes)
        cl.enqueue_fill_buffer(self.queue, self.P_buf, np.float64(0.0), 0,
                               self.LM.nbytes)

        update_u_event = cl.enqueue_copy(self.queue, self.u_buf, self.srcU)
        # update_appTrac_event = cl.enqueue_copy(self.queue, self.appTrac_buf, self.appTrac)

        calc_Ku_events = [update_u_event]
        for iColorGrp in range(len(self.colorGps_buf)):
            calc_Ku_event = \
            self.program.assemble_K_P(self.queue, (self.globalWorkSize,), (self.localWorkSize,),
                                      np.int64(len(self.mesh.colorGroups[iColorGrp])),
                                      np.int64(self.nNodes), np.int64(self.nSmp), np.float64(self.pressure),
                                      self.pVals_buf, self.nodes_buf, self.colorGps_buf[iColorGrp],
                                      self.elmTE_buf[iColorGrp], self.u_buf, self.Ku_buf, self.P_buf,
                                      wait_for=calc_Ku_events)
            calc_Ku_events = [calc_Ku_event]

        calc_u_event = \
        self.program.calc_u(self.queue, (self.globalWorkSize,), (self.localWorkSize,),
                            np.int64(self.nSmp), np.int64(self.ndof), np.float64(dt),
                            self.P_buf, self.Ku_buf, self.LM_buf, self.LHS_buf,
                            self.u_buf, self.up_buf, self.ures_buf, wait_for=[calc_Ku_event])

        ures_copy_event = cl.enqueue_copy(self.queue,
                                          self.srcURes,
                                          self.ures_buf,
                                          wait_for=[calc_u_event])
        ures_copy_event.wait()

        # Synchronize the ures.
        self.SyncCommNodes(self.srcURes)
        # Add on the global force.
        self.srcURes[:, :] += dt * dt * self.appTrac.reshape(self.ndof,
                                                             1) / self.LHS
        # Apply boundary condition.
        self.ApplyBoundaryCondition(self.srcURes)

        # Update/Shift the pointers.
        self.srcURes, self.srcU, self.srcUP = self.srcUP, self.srcURes, self.srcU
        self.ures_buf, self.u_buf, self.up_buf = self.up_buf, self.ures_buf, self.u_buf
Example #19
    def memset(self, buffer, value, size):
        """set the memory in allocation to the value in value

        :param allocation: An OpenCL Buffer to fill
        :type allocation: pyopencl.Buffer

        :param value: The value to set the memory to
        :type value: a single 32-bit int

        :param size: The size of to the allocation unit in bytes
        :type size: int

        """
        if isinstance(buffer, cl.Buffer):
            try:
                cl.enqueue_fill_buffer(self.queue, buffer, np.uint32(value), 0, size)
            except AttributeError:
                src = np.zeros(size, dtype='uint8') + np.uint8(value)
                cl.enqueue_copy(self.queue, buffer, src)
Example #20
    def memset(self, buffer, value, size):
        """set the memory in allocation to the value in value

        :param allocation: An OpenCL Buffer to fill
        :type allocation: pyopencl.Buffer

        :param value: The value to set the memory to
        :type value: a single 32-bit int

        :param size: The size of to the allocation unit in bytes
        :type size: int

        """
        if isinstance(buffer, cl.Buffer):
            try:
                cl.enqueue_fill_buffer(self.queue, buffer, numpy.uint32(value), 0, size)
            except AttributeError:
                src = numpy.zeros(size, dtype='uint8') + numpy.uint8(value)
                cl.enqueue_copy(self.queue, buffer, src)
Example #21
    def cart2d_to_pol2d(self, M):
        Q1 = np.zeros((len(self.R), ), dtype=np.float64)
        Q2 = np.zeros((len(self.R), len(self.A)), dtype=np.float64)
        norm = np.zeros((len(self.R), ), dtype=np.float64)

        cl.enqueue_copy(self.queue, self.buf_M, M)
        cl.enqueue_fill_buffer(self.queue, self.buf_Q1e, np.float64(0.0), 0,
                               Q1.nbytes)
        cl.enqueue_fill_buffer(self.queue, self.buf_Q2e, np.float64(0.0), 0,
                               Q2.nbytes)
        cl.enqueue_fill_buffer(self.queue, self.buf_norm, np.float64(0.0), 0,
                               norm.nbytes)

        self.prg.cart2d_to_pol2d_project(self.queue, (self.r_len, 1), None,
                                         self.buf_fmt, self.buf_M,
                                         self.buf_Q1e, self.buf_Q2e,
                                         self.buf_norm)

        cl.enqueue_copy(self.queue, Q1, self.buf_Q1e)
        cl.enqueue_copy(self.queue, Q2, self.buf_Q2e)
        cl.enqueue_copy(self.queue, norm, self.buf_norm)

        cl.enqueue_barrier(self.queue).wait()

        Q1 /= norm.sum()

        return Q1, Q2
Example #22
    def calculate_deviations(self, xsiz, zsiz, pix2nm, dt, u, g, b, c2d, d2c,
                             nu, phi, psi, theta):
        #fills up sxz array
        #set up buffers to pass between python and C++
        buf_size = xsiz * (zsiz + 1) * 4

        mf = cl.mem_flags
        self.sxz_buf = cl.Buffer(self.ctx, mf.READ_WRITE, buf_size)
        out_buf_2 = cl.Buffer(self.ctx, mf.READ_WRITE, buf_size)
        c2d_buf = cl.Buffer(self.ctx, mf.READ_ONLY, c2d.size * 4)
        d2c_buf = cl.Buffer(self.ctx, mf.READ_ONLY, d2c.size * 4)

        shape = np.array([(zsiz + 1), xsiz], dtype=np.int32)

        # set up contiguous buffers
        c2d_ = np.ascontiguousarray(c2d.ravel().astype(np.float32))
        cl.enqueue_copy(self.queue, c2d_buf, c2d_)
        d2c_ = np.ascontiguousarray(d2c.ravel().astype(np.float32))
        cl.enqueue_copy(self.queue, d2c_buf, d2c_)
        # fill with zeros as we add to initial buffer (makes handling the 2 bs easier)
        cl.enqueue_fill_buffer(self.queue, self.sxz_buf, np.float32(0.0), 0,
                               buf_size)
        cl.enqueue_fill_buffer(self.queue, out_buf_2, np.float32(0.0), 0,
                               buf_size)

        # the actual calculation
        #small change in z-coordinate to get derivative
        dz = 0.0
        #calculate x-z array of displacements
        self.displace_r(shape, self.sxz_buf, c2d_buf, d2c_buf, pix2nm, u, g, b,
                        c2d, nu, phi, psi, theta, dt, dz)
        # calculate second array at a small z-shift
        dz = 0.01
        self.displace_r(shape, out_buf_2, c2d_buf, d2c_buf, pix2nm, u, g, b,
                        c2d, nu, phi, psi, theta, dt, dz)

        # subtract one from the other to get the gradient
        dz_32 = np.float32(dz * dt)
        self.disp_r_prog.difference(self.queue, shape, None, self.sxz_buf,
                                    out_buf_2, dz_32)
Example #23
    def find_offsets(self,
                     cq,
                     values_buf,
                     n_values,
                     offsets_buf,
                     n_offsets,
                     wait_for=None):
        wait_for = wait_for or []
        fill_offsets = cl.enqueue_fill_buffer(
            cq,
            offsets_buf,
            array(n_values, dtype=self.program.offset_dtype),
            0,
            n_offsets * self.program.offset_dtype.itemsize,
            wait_for=wait_for)
        return self.program.kernels['find_offsets'](
            cq,
            (n_values - 1, ),
            None,
            values_buf,
            offsets_buf,
            wait_for=[fill_offsets],
        )
Example #24
    def forward(ctx, t, val):
        val = t.dtype.type(val)
        cl.enqueue_fill_buffer(
            t.device.queue, t.data, val,
            t.offset * t.dtype.itemsize,
            t.numel() * t.dtype.itemsize)
        return t
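This is one of the few examples that fills starting at a non-zero buffer offset. A standalone sketch of sub-region filling (sizes assumed; offset and size are byte counts and must be multiples of the pattern size):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host = np.arange(8, dtype=np.float32)
buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=host)
# zero only elements 2..5: byte offset 2 * 4, byte size 4 * 4
cl.enqueue_fill_buffer(queue, buf, np.float32(0), 2 * 4, 4 * 4)
cl.enqueue_copy(queue, host, buf)
queue.finish()
# host is now [0., 1., 0., 0., 0., 0., 6., 7.]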
Example #25
    def enqueue_zero_buffer(self, buffer: pyopencl.array.Array) -> None:
        pyopencl.enqueue_fill_buffer(
            self._pyopencl_command_queue, buffer.data, np.uint8(0), 0,
            buffer.data.get_info(pyopencl.mem_info.SIZE))
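A hedged alternative for pyopencl.array.Array inputs: the array class also has its own fill() method, so dropping to the raw cl.Buffer (buffer.data above) is mainly useful when a plain byte-wise zero is wanted. A minimal sketch:

import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

arr = cl_array.empty(queue, 16, dtype=np.float32)
arr.fill(0)  # element-wise fill handled by pyopencl.array
queue.finish()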
Example #26
def dense_assembler(device_interface, operator_descriptor, domain,
                    dual_to_range, parameters, result):
    """Assemble dense with OpenCL."""
    import bempp.api
    from bempp.api.integration.triangle_gauss import rule
    from bempp.api.utils.helpers import get_type
    from bempp.core.opencl_kernels import get_kernel_from_operator_descriptor
    from bempp.core.opencl_kernels import (
        default_context,
        default_device,
        get_vector_width,
    )

    if bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE == "gpu":
        device_type = "gpu"
    elif bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE == "cpu":
        device_type = "cpu"
    else:
        raise RuntimeError(
            f"Unknown device type {bempp.api.POTENTIAL_OPERATOR_DEVICE_TYPE}")

    mf = _cl.mem_flags
    ctx = default_context(device_type)
    device = default_device(device_type)

    precision = operator_descriptor.precision
    dtype = get_type(precision).real
    kernel_options = operator_descriptor.options

    quad_points, quad_weights = rule(parameters.quadrature.regular)

    test_indices, test_color_indexptr = dual_to_range.get_elements_by_color()
    trial_indices, trial_color_indexptr = domain.get_elements_by_color()

    number_of_test_colors = len(test_color_indexptr) - 1
    number_of_trial_colors = len(trial_color_indexptr) - 1

    options = {
        "NUMBER_OF_QUAD_POINTS": len(quad_weights),
        "TEST": dual_to_range.shapeset.identifier,
        "TRIAL": domain.shapeset.identifier,
        "TRIAL_NUMBER_OF_ELEMENTS": domain.number_of_support_elements,
        "TEST_NUMBER_OF_ELEMENTS": dual_to_range.number_of_support_elements,
        "NUMBER_OF_TEST_SHAPE_FUNCTIONS":
        dual_to_range.number_of_shape_functions,
        "NUMBER_OF_TRIAL_SHAPE_FUNCTIONS": domain.number_of_shape_functions,
    }

    if operator_descriptor.is_complex:
        options["COMPLEX_KERNEL"] = None

    main_kernel = get_kernel_from_operator_descriptor(operator_descriptor,
                                                      options,
                                                      "regular",
                                                      device_type=device_type)
    remainder_kernel = get_kernel_from_operator_descriptor(
        operator_descriptor,
        options,
        "regular",
        force_novec=True,
        device_type=device_type,
    )

    test_indices_buffer = _cl.Buffer(ctx,
                                     mf.READ_ONLY | mf.COPY_HOST_PTR,
                                     hostbuf=test_indices)
    trial_indices_buffer = _cl.Buffer(ctx,
                                      mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=trial_indices)

    test_normals_buffer = _cl.Buffer(ctx,
                                     mf.READ_ONLY | mf.COPY_HOST_PTR,
                                     hostbuf=dual_to_range.normal_multipliers)
    trial_normals_buffer = _cl.Buffer(ctx,
                                      mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=domain.normal_multipliers)
    test_grid_buffer = _cl.Buffer(
        ctx,
        mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=dual_to_range.grid.as_array.astype(dtype),
    )
    trial_grid_buffer = _cl.Buffer(ctx,
                                   mf.READ_ONLY | mf.COPY_HOST_PTR,
                                   hostbuf=domain.grid.as_array.astype(dtype))

    test_elements_buffer = _cl.Buffer(
        ctx,
        mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=dual_to_range.grid.elements.ravel(order="F"),
    )

    trial_elements_buffer = _cl.Buffer(
        ctx,
        mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=domain.grid.elements.ravel(order="F"),
    )

    test_local2global_buffer = _cl.Buffer(ctx,
                                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                                          hostbuf=dual_to_range.local2global)

    trial_local2global_buffer = _cl.Buffer(ctx,
                                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                                           hostbuf=domain.local2global)

    test_multipliers_buffer = _cl.Buffer(
        ctx,
        mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=dual_to_range.local_multipliers.astype(dtype),
    )

    trial_multipliers_buffer = _cl.Buffer(
        ctx,
        mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=domain.local_multipliers.astype(dtype),
    )

    quad_points_buffer = _cl.Buffer(
        ctx,
        mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=quad_points.ravel(order="F").astype(dtype),
    )

    quad_weights_buffer = _cl.Buffer(ctx,
                                     mf.READ_ONLY | mf.COPY_HOST_PTR,
                                     hostbuf=quad_weights.astype(dtype))

    result_buffer = _cl.Buffer(ctx, mf.READ_WRITE, size=result.nbytes)

    if not kernel_options:
        kernel_options = [0.0]

    kernel_options_array = _np.array(kernel_options, dtype=dtype)

    kernel_options_buffer = _cl.Buffer(ctx,
                                       mf.READ_ONLY | mf.COPY_HOST_PTR,
                                       hostbuf=kernel_options_array)

    vector_width = get_vector_width(precision, device_type=device_type)

    def kernel_runner(
        queue,
        test_offset,
        trial_offset,
        test_number_of_indices,
        trial_number_of_indices,
    ):
        """Actually run the kernel for a given range."""
        remainder_size = trial_number_of_indices % vector_width
        main_size = trial_number_of_indices - remainder_size

        buffers = [
            test_indices_buffer,
            trial_indices_buffer,
            test_normals_buffer,
            trial_normals_buffer,
            test_grid_buffer,
            trial_grid_buffer,
            test_elements_buffer,
            trial_elements_buffer,
            test_local2global_buffer,
            trial_local2global_buffer,
            test_multipliers_buffer,
            trial_multipliers_buffer,
            quad_points_buffer,
            quad_weights_buffer,
            result_buffer,
            kernel_options_buffer,
            _np.int32(dual_to_range.global_dof_count),
            _np.int32(domain.global_dof_count),
            _np.uint8(domain.grid != dual_to_range.grid),
        ]

        if main_size > 0:
            main_kernel(
                queue,
                (test_number_of_indices, main_size // vector_width),
                (1, 1),
                *buffers,
                global_offset=(test_offset, trial_offset),
            )

        if remainder_size > 0:
            remainder_kernel(
                queue,
                (test_number_of_indices, remainder_size),
                (1, 1),
                *buffers,
                global_offset=(test_offset, trial_offset + main_size),
            )

    with _cl.CommandQueue(ctx, device=device) as queue:
        _cl.enqueue_fill_buffer(queue, result_buffer, _np.uint8(0), 0,
                                result.nbytes)
        for test_index in range(number_of_test_colors):
            test_offset = test_color_indexptr[test_index]
            n_test_indices = (test_color_indexptr[1 + test_index] -
                              test_color_indexptr[test_index])
            for trial_index in range(number_of_trial_colors):
                n_trial_indices = (trial_color_indexptr[1 + trial_index] -
                                   trial_color_indexptr[trial_index])
                trial_offset = trial_color_indexptr[trial_index]
                kernel_runner(queue, test_offset, trial_offset, n_test_indices,
                              n_trial_indices)
        _cl.enqueue_copy(queue, result, result_buffer)
Example #27
    def run(self,
            input,
            silent=True,
            deflicker=False,
            sidebyside=False,
            output_img=None,
            output_vid=None,
            nframes=10):
        mf = cl.mem_flags
        ctx = cl.Context(self.device)
        cmd_queue = cl.CommandQueue(ctx)

        prg = cl.Program(ctx, self.src).build()

        capture = cv.VideoCapture(input)

        if capture.isOpened():
            fimg = capture.read()[1]
            fimg = fimg.astype(np.float32)

            fps = capture.get(cv.CAP_PROP_FPS)
            vidout = None
            if output_vid is not None:
                vidout = cv.VideoWriter(output_vid,
                                        cv.VideoWriter_fourcc(*"mp4v"), fps,
                                        fimg.shape[:2][::-1])

            weight = cl.Buffer(ctx,
                               mf.READ_ONLY | mf.COPY_HOST_PTR
                               | mf.HOST_NO_ACCESS,
                               hostbuf=np.float32(self.weight * fps))
            threshold = cl.Buffer(ctx,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR
                                  | mf.HOST_NO_ACCESS,
                                  hostbuf=np.float32(self.threshold))
            jweight = cl.Buffer(ctx,
                                mf.READ_ONLY | mf.COPY_HOST_PTR
                                | mf.HOST_NO_ACCESS,
                                hostbuf=np.int32(self.join_weight))

            width = cl.Buffer(ctx,
                              mf.READ_ONLY | mf.COPY_HOST_PTR
                              | mf.HOST_NO_ACCESS,
                              hostbuf=np.int32(fimg.shape[1]))
            height = cl.Buffer(ctx,
                               mf.READ_ONLY | mf.COPY_HOST_PTR
                               | mf.HOST_NO_ACCESS,
                               hostbuf=np.int32(fimg.shape[0]))

            histogram, img_histogram, lut = (None, None, None)
            if deflicker:
                histogram = cl.Buffer(
                    ctx,
                    mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                    hostbuf=np.zeros(256 * 3).astype(np.int32))
                img_histogram = cl.Buffer(ctx,
                                          mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                                          hostbuf=np.zeros(256 * 3).astype(
                                              np.int32))
                lut = cl.Buffer(ctx,
                                mf.READ_ONLY | mf.COPY_HOST_PTR
                                | mf.HOST_NO_ACCESS,
                                hostbuf=np.zeros(256 * 3).astype(np.int32))

            img = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=fimg)
            background = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=fimg)

            _, nimg = capture.read()
            new_img = cl.Buffer(ctx,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=nimg.astype(np.float32))

            if deflicker:
                prg.cal_histogram(cmd_queue, fimg.shape[:2], None, histogram,
                                  img, width, height)
                prg.fin_histogram(cmd_queue, (3, ), None, histogram)

            res = np.empty_like(fimg).astype(np.float32)

            try:
                while True:
                    if deflicker:
                        cl.enqueue_fill_buffer(cmd_queue, img_histogram,
                                               np.int32(0), 0, 3 * 4 * 256)
                        prg.cal_histogram(cmd_queue, fimg.shape[:2], None,
                                          img_histogram, new_img, width,
                                          height)
                        prg.fin_histogram(cmd_queue, (3, ), None,
                                          img_histogram)

                        prg.cal_lut(cmd_queue, (3, ), None, histogram,
                                    img_histogram, lut)
                        prg.deflicker(cmd_queue, fimg.shape[:2], None, lut,
                                      new_img, width, height)

                        prg.join_histogram(cmd_queue, (3, ), None, histogram,
                                           img_histogram, jweight)

                    prg.backsub(cmd_queue, fimg.shape[:2], None, img,
                                background, new_img, width, height, weight,
                                threshold)

                    if (not silent) or (vidout is not None):
                        cl.enqueue_copy(cmd_queue, res, background)

                    if vidout is not None:
                        vidout.write(res.astype(np.uint8))

                    if not silent:
                        cv.imshow('background', res.astype(np.uint8))
                        if sidebyside: cv.imshow('real', nimg)
                        if cv.waitKey(1) == 27: break

                    _, nimg = capture.read()
                    if nimg is None: break

                    cl.enqueue_copy(cmd_queue, new_img,
                                    nimg.astype(np.float32))
            except IndexError:
                pass

            cl.enqueue_copy(cmd_queue, res, background)
            if output_img is not None:
                cv.imwrite(output_img, res.astype(np.uint8))

            if vidout is not None:
                vidout.write(res.astype(np.uint8))

            if not silent:
                cv.imshow('background', res.astype(np.uint8))
                cv.waitKey(0)
Example #28
def initialise_opencl_object(self,
                             program_src='',
                             command_queue=None,
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_reg_tile_size=None,
                             default_threshold=None,
                             size_heuristics=[],
                             required_types=[],
                             all_sizes={},
                             user_sizes={}):
    if command_queue is None:
        self.ctx = get_prefered_context(interactive, platform_pref,
                                        device_pref)
        self.queue = cl.CommandQueue(self.ctx)
    else:
        self.ctx = command_queue.context
        self.queue = command_queue
    self.device = self.queue.device
    self.platform = self.device.platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
    device_type = self.device.type

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0

    self.max_local_memory = int(self.device.local_mem_size)

    # Futhark reserves 4 bytes of local memory for its own purposes.
    self.max_local_memory -= 4

    # See comment in rts/c/opencl.h.
    if self.platform.name.find('NVIDIA CUDA') >= 0:
        self.max_local_memory -= 12
    elif self.platform.name.find('AMD') >= 0:
        self.max_local_memory -= 16

    self.free_list = {}

    self.global_failure = self.pool.allocate(np.int32().itemsize)
    cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1), 0,
                           np.int32().itemsize)
    self.global_failure_args = self.pool.allocate(
        np.int64().itemsize * (self.global_failure_args_max + 1))
    self.failure_is_an_option = np.int32(0)

    if 'default_group_size' in sizes:
        default_group_size = sizes['default_group_size']
        del sizes['default_group_size']

    if 'default_num_groups' in sizes:
        default_num_groups = sizes['default_num_groups']
        del sizes['default_num_groups']

    if 'default_tile_size' in sizes:
        default_tile_size = sizes['default_tile_size']
        del sizes['default_tile_size']

    if 'default_reg_tile_size' in sizes:
        default_reg_tile_size = sizes['default_reg_tile_size']
        del sizes['default_reg_tile_size']

    if 'default_threshold' in sizes:
        default_threshold = sizes['default_threshold']
        del sizes['default_threshold']

    default_group_size_set = default_group_size != None
    default_tile_size_set = default_tile_size != None
    default_sizes = apply_size_heuristics(
        self, size_heuristics, {
            'group_size': default_group_size,
            'tile_size': default_tile_size,
            'reg_tile_size': default_reg_tile_size,
            'num_groups': default_num_groups,
            'lockstep_width': None,
            'threshold': default_threshold
        })
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    default_reg_tile_size = default_sizes['reg_tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        if default_group_size_set:
            sys.stderr.write(
                'Note: Device limits group size to {} (down from {})\n'.format(
                    max_group_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        if default_tile_size_set:
            sys.stderr.write(
                'Note: Device limits tile size to {} (down from {})\n'.format(
                    max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}\nKnown sizes: {}'.format(
                k, ' '.join(all_sizes.keys())))

    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size  # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'] == 'reg_tile_size':
            max_value = None
            default_value = default_reg_tile_size
        elif v['class'].startswith('threshold'):
            max_value = None
            default_value = default_threshold
        else:
            # Bespoke sizes have no limit or default.
            max_value = None
        if v['value'] == None:
            self.sizes[k] = default_value
        elif max_value != None and v['value'] > max_value:
            sys.stderr.write(
                'Note: Device limits {} to {} (down from {})\n'.format(
                    k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    # XXX: we perform only a subset of z-encoding here.  Really, the
    # compiler should provide us with the variables to which
    # parameters are mapped.
    if (len(program_src) >= 0):
        return cl.Program(
            self.ctx, program_src
        ).build(["-DLOCKSTEP_WIDTH={}".format(lockstep_width)] + [
            "-D{}={}".format(
                s.replace('z', 'zz').replace('.', 'zi').replace('#', 'zh'), v)
            for (s, v) in self.sizes.items()
        ])
Example #29
def test_overwrite_efb():
    cl.enqueue_fill_buffer(queue, cl_empty_buffer, zero, 0,
                           zero_buffer.nbytes).wait()
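The names in this test (queue, cl_empty_buffer, zero, zero_buffer) come from fixtures that are not shown. A self-contained equivalent sketch with assumed sizes:

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

zero_buffer = np.zeros(16, dtype=np.float32)
cl_empty_buffer = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, zero_buffer.nbytes)
cl.enqueue_fill_buffer(queue, cl_empty_buffer, np.float32(0), 0,
                       zero_buffer.nbytes).wait()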
Example #30
    def Solve(self, t, dt):

        # start = timer()

        cl.enqueue_fill_buffer(self.queue, self.Ku_buf, np.float64(0.0), 0,
                               self.LM.nbytes)
        cl.enqueue_fill_buffer(self.queue, self.P_buf, np.float64(0.0), 0,
                               self.LM.nbytes)

        # end = timer()
        # print('--- Rank: {} time 0: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))
        # start = timer()

        calc_Ku_events = []
        for iColorGrp in range(len(self.colorGps_buf)):
            calc_Ku_event = \
            self.program.assemble_K_P(self.queue, (self.globalWorkSize,), (self.localWorkSize,),
                                      np.int64(len(self.mesh.colorGroups[iColorGrp])),
                                      np.int64(self.nSmp), np.float64(self.pressure),
                                      self.pVals_buf, self.nodes_buf, self.colorGps_buf[iColorGrp],
                                      self.elmTE_buf[iColorGrp], self.u_buf, self.Ku_buf, self.P_buf,
                                      wait_for=calc_Ku_events)
            calc_Ku_events = [calc_Ku_event]

        # end = timer()
        # print('--- Rank: {} time 1: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))
        # start = timer()

        calc_u_event = \
        self.program.calc_u(self.queue, (self.globalWorkSize,), (self.localWorkSize,),
                            np.int64(self.nSmp), np.int64(self.lclNDof),
                            np.float64(dt), np.float64(self.damp),
                            self.P_buf, self.Ku_buf, self.LM_buf, self.LHS_buf,
                            self.u_buf, self.up_buf, self.ures_buf, wait_for=[calc_Ku_event])
        # calc_u_event.wait() # TODO:: Comment off after debugging

        # end = timer()
        # print('--- Rank: {} time 2: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))
        # start = timer()

        ures_copy_event = cl.enqueue_copy(self.queue,
                                          self.srcURes[:self.lclNCommDof],
                                          self.ures_buf,
                                          wait_for=[calc_u_event])
        # ures_copy_event.wait()

        # end = timer()
        # print('--- Rank: {} time 3: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))
        # start = timer()

        # Synchronize the ures.
        self.SyncCommNodes(self.srcURes)

        # end = timer()
        # print('--- Rank: {} time 4: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))
        # start = timer()

        # Apply boundary condition.
        self.ApplyBoundaryCondition(self.srcURes)
        # Enforce the applied boundary condition back to GPU. <lclNSpecialHeadDof>
        update_u_event = cl.enqueue_copy(
            self.queue, self.ures_buf, self.srcURes[:self.lclNSpecialHeadDof])

        # Add on the global force.
        appTrac_copy_event = cl.enqueue_copy(self.queue, self.appTrac_buf,
                                             self.appTrac)
        calc_u_event = \
        self.program.calc_u_appTrac(self.queue, (self.globalWorkSize,), (self.localWorkSize,),
                            np.int64(self.nSmp), np.int64(self.lclNDof), np.float64(dt),
                            self.LHS_buf, self.appTrac_buf, self.ures_buf,
                            wait_for=[update_u_event, appTrac_copy_event])
        calc_u_event.wait()

        # end = timer()
        # print('--- Rank: {} time 5: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))
        # start = timer()

        # Update/Shift the pointers.
        self.srcURes, self.srcU, self.srcUP = self.srcUP, self.srcURes, self.srcU
        self.ures_buf, self.u_buf, self.up_buf = self.up_buf, self.ures_buf, self.u_buf
Example #31
prg = cl.Program(
    ctx, """
__kernel void sum(__global float *a_g, __global unsigned int *b_g) {
    int tid = get_local_id(0);
    a_g[tid] = tid;
}
""").build()

# run once, just to make sure the buffer copied to gpu
q.finish()
start = time.time()

prg.sum(q, (N, ), (N, ), a_gpu, b_gpu)

cl.enqueue_fill_buffer(q, a_gpu, np.float32(0), 0, 8 * 4)

q.finish()
end = time.time()
print('kernel done')
print('diff', end - start)

a_doubled = np.empty_like(a)
cl.enqueue_copy(q, a_doubled, a_gpu)

b_doubled = np.empty_like(b, dtype=np.uint32)
cl.enqueue_copy(q, b_doubled, b_gpu)

print('a', a)
print('a_doubled', a_doubled)
print('b_doubled', b_doubled)
Example #32
def test_compute_bounds(cl_env, kernels, coord_dtype):
    ctx, cq = cl_env

    coords = np.array([[ 0.0, 1.0, 3.0],
                       [ 4.0, 1.0, 8.0],
                       [-4.0,-6.0, 3.0],
                       [-5.0, 0.0,-1.0]], dtype=coord_dtype)
    radii = np.ones(len(coords), dtype=coord_dtype)
    leaf = len(coords) - 1
    nodes = np.array([(-1, 3, [leaf+0, 1]),
                      ( 0, 3, [leaf+3, 2]),
                      ( 1, 2, [leaf+1, leaf+2]),
                      ( 0, 0, [2, -1]),
                      ( 2, 1, [0, -1]),
                      ( 2, 2, [1, -1]),
                      ( 1, 3, [3, -1])], dtype=Node)

    coords_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, len(coords) * 4 * coord_dtype.itemsize
    )
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coords), 4), coord_dtype,
        is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map
    radii_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=radii
    )
    nodes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=nodes
    )
    bounds_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, len(nodes) * 4 * 2 * coords.dtype.itemsize
    )
    flags_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, len(nodes) * np.dtype('uint32').itemsize
    )

    clear_flags = cl.enqueue_fill_buffer(
        cq, flags_buf, np.zeros(1, dtype='uint32'),
        0, len(nodes) * np.dtype('uint32').itemsize
    )
    calc_leaf_bounds = kernels['leafBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, coords_buf, radii_buf, nodes_buf, len(coords),
    )
    calc_bounds = kernels['internalBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, flags_buf, nodes_buf, len(coords),
        wait_for=[calc_leaf_bounds, clear_flags]
    )
    (bounds_map, _) = cl.enqueue_map_buffer(
        cq, bounds_buf, cl.map_flags.READ,
        0, (len(nodes), 2, 4), coord_dtype,
        wait_for=[calc_bounds], is_blocking=True
    )

    expected = np.array([[[-6.0,-7.0,-2.0], [ 5.0, 2.0, 9.0]],
                         [[-6.0,-1.0,-2.0], [ 5.0, 2.0, 9.0]],
                         [[-1.0, 0.0, 2.0], [ 5.0, 2.0, 9.0]],
                         [[-5.0,-7.0, 2.0], [-3.0,-5.0, 4.0]],
                         [[-1.0, 0.0, 2.0], [ 1.0, 2.0, 4.0]],
                         [[ 3.0, 0.0, 7.0], [ 5.0, 2.0, 9.0]],
                         [[-6.0,-1.0,-2.0], [-4.0, 1.0, 0.0]]], dtype=coord_dtype)
    np.testing.assert_equal(bounds_map[:, :, :3], expected)
Example #33
def test_traverse(cl_env, kernels, coord_dtype):
    ctx, cq = cl_env

    coords = np.array([[ 0.0, 1.0, 3.0],
                       [ 0.0, 1.0, 3.0],
                       [ 4.0, 1.0, 8.0],
                       [-4.0,-6.0, 3.0],
                       [-5.0, 0.0,-1.0],
                       [-5.0, 0.5,-0.5]], dtype=coord_dtype)
    radii = np.ones(len(coords), dtype=coord_dtype)
    n_nodes = len(coords) * 2 - 1

    coords_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, len(coords) * 4 * coord_dtype.itemsize
    )
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coords), 4), coord_dtype,
        is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map
    range_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=np.array([coords.min(axis=0), coords.max(axis=0)], dtype=coords.dtype)
    )
    codes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, len(coords) * np.dtype('uint32').itemsize
    )
    radii_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=radii
    )
    nodes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, n_nodes * Node.itemsize
    )
    bounds_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, n_nodes * 4 * 2 * coords.dtype.itemsize
    )
    flags_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, n_nodes * np.dtype('uint32').itemsize
    )
    n_collisions = 2
    collisions_buf = cl.Buffer(
        ctx, cl.mem_flags.WRITE_ONLY, n_collisions * 2 * np.dtype('uint32').itemsize
    )
    n_collisions_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, np.dtype('uint32').itemsize
    )

    calc_codes = kernels['calculateCodes'](
        cq, (roundUp(len(coords), 32),), None,
        codes_buf, coords_buf, range_buf, len(coords),
    )

    # Would use radix sort
    (codes_map, _) = cl.enqueue_map_buffer(
        cq, codes_buf, cl.map_flags.READ | cl.map_flags.WRITE,
        0, (len(coords),), np.dtype('uint32'),
        wait_for=[calc_codes], is_blocking=True
    )
    order = np.argsort(codes_map, kind='mergesort').astype('uint32')
    codes_map[...] = codes_map[order]
    del codes_map

    ids_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=order
    )

    fill_internal = kernels['fillInternal'](
        cq, (roundUp(len(coords), 32),), None,
        nodes_buf, ids_buf, len(coords),
    )
    generate_bvh = kernels['generateBVH'](
        cq, (roundUp(len(coords)-1, 32),), None,
        codes_buf, nodes_buf, len(coords),
        wait_for=[calc_codes, fill_internal]
    )
    clear_flags = cl.enqueue_fill_buffer(
        cq, flags_buf, np.zeros(1, dtype='uint32'),
        0, n_nodes * np.dtype('uint32').itemsize
    )
    calc_bounds = kernels['leafBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, coords_buf, radii_buf, nodes_buf, len(coords),
        wait_for=[generate_bvh]
    )
    calc_bounds = kernels['internalBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, flags_buf, nodes_buf, len(coords),
        wait_for=[clear_flags, calc_bounds]
    )
    clear_collisions = cl.enqueue_fill_buffer(
        cq, collisions_buf, np.array([-1], dtype='uint32'),
        0, n_collisions * 2 * np.dtype('uint32').itemsize
    )
    clear_n_collisions = cl.enqueue_fill_buffer(
        cq, n_collisions_buf, np.zeros(1, dtype='uint32'),
        0, np.dtype('uint32').itemsize
    )
    find_collisions = kernels['traverse'](
        cq, (roundUp(len(coords), 32),), None,
        collisions_buf, n_collisions_buf, n_collisions,
        nodes_buf, bounds_buf, len(coords),
        wait_for=[clear_collisions, clear_n_collisions, calc_bounds],
    )

    (n_collisions_map, _) = cl.enqueue_map_buffer(
        cq, n_collisions_buf, cl.map_flags.READ,
        0, 1, np.dtype('uint32'),
        wait_for=[find_collisions], is_blocking=True
    )

    assert n_collisions_map[0] == n_collisions

    (collisions_map, _) = cl.enqueue_map_buffer(
        cq, collisions_buf, cl.map_flags.READ,
        0, (n_collisions, 2), np.dtype('uint32'),
        wait_for=[find_collisions], is_blocking=True
    )
    expected = {(0, 1), (4, 5)}
    assert set(map(tuple, collisions_map)) == expected