def main():
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    with open(PROGRAM_FILE) as prg_file:
        prg = cl.Program(ctx, prg_file.read()).build()

    targets_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                            hostbuf=TARGETS_NP)

    results_np = np.zeros(np.shape(TARGETS_NP)[0], dtype=np.uint32)
    results_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, results_np.nbytes)
    cl.enqueue_fill_buffer(queue, results_buf, np.uint32(0), 0,
                           np.dtype(np.uint32).itemsize * np.shape(results_np)[0], None)

    success_np = np.zeros(np.shape(TARGETS_NP)[0], dtype=np.uint32)
    success_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, success_np.nbytes)
    cl.enqueue_fill_buffer(queue, success_buf, np.uint32(0), 0,
                           np.dtype(np.uint32).itemsize * np.shape(success_np)[0], None)

    # prg.ipv4_hash(queue, (WORKITEM_NB,), (WORKITEM_PER_GROUP,),
    #               targets_buf, success_buf, results_buf)
    prg.ipv4_hash(queue, (WORKITEM_TOTAL // WORKITEM_ITER,), None,
                  np.uint32(WORKITEM_ITER), targets_buf, success_buf, results_buf)

    cl.enqueue_copy(queue, results_np, results_buf)
    cl.enqueue_copy(queue, success_np, success_buf)
    # queue.finish()

    print(success_np)
    print(results_np)
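# The function above leans on module-level names (PROGRAM_FILE, TARGETS_NP,
# WORKITEM_TOTAL, WORKITEM_ITER) from its host project. Below is a minimal,
# self-contained sketch of the same zero-fill / read-back pattern with the
# kernel launch omitted; every name in it is illustrative.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Allocate a device buffer and zero it. enqueue_fill_buffer takes offset and
# size in bytes and requires an OpenCL 1.2 capable platform.
results_np = np.empty(16, dtype=np.uint32)
results_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, results_np.nbytes)
cl.enqueue_fill_buffer(queue, results_buf, np.uint32(0), 0, results_np.nbytes)

# A kernel would normally write into results_buf here.

# Read the (still zeroed) buffer back and verify.
cl.enqueue_copy(queue, results_np, results_buf)
queue.finish()
assert (results_np == 0).all()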
def evaluator(coeffs):
    """Actually evaluate the near-field correction."""
    result = _np.empty(4 * ncoeffs, dtype=result_type)
    with bempp.api.Timer(message="Singular Corrections Evaluator"):
        with _cl.CommandQueue(ctx, device=device) as queue:
            _cl.enqueue_copy(queue, coefficients_buffer, coeffs.astype(result_type))
            _cl.enqueue_fill_buffer(
                queue, result_buffer, _np.uint8(0), 0,
                result_type.itemsize * ncoeffs,
            )
            kernel(
                queue,
                (grid.number_of_elements,),
                (1,),
                grid_buffer,
                neighbor_indices_buffer,
                neighbor_indexptr_buffer,
                points_buffer,
                coefficients_buffer,
                result_buffer,
                kernel_parameters_buffer,
                _np.uint32(grid.number_of_elements),
            )
            _cl.enqueue_copy(queue, result, result_buffer)
    return result
def execute(self, q, repeat=1, unbind=True):
    for r in range(repeat):
        cl.enqueue_fill_buffer(q, self.zero_args[0], np.float32(0), 0,
                               self.zero_args[2] * 4)
        call_cl_kernel(self.kernel, q, *self.launch_args)
    if unbind:
        self.zero_args = self.convert_args = None
        self.launch_args[2:7] = (None,) * 5
def evaluator(x):
    """Evaluate a potential."""
    result = _np.empty(kernel_dimension * npoints, dtype=result_type)
    with _cl.CommandQueue(ctx, device=device) as queue:
        _cl.enqueue_copy(queue, coefficients_buffer, x.astype(result_type))
        _cl.enqueue_fill_buffer(
            queue, result_buffer, _np.uint8(0), 0,
            kernel_dimension * npoints * result_type.itemsize,
        )
        if main_size > 0:
            _cl.enqueue_fill_buffer(queue, sum_buffer, _np.uint8(0), 0, sum_size)
            main_kernel(
                queue,
                (npoints, main_size // vector_width),
                (1, WORKGROUP_SIZE_POTENTIAL // vector_width),
                grid_buffer,
                indices_buffer,
                normals_buffer,
                points_buffer,
                coefficients_buffer,
                quad_points_buffer,
                quad_weights_buffer,
                sum_buffer,
                kernel_options_buffer,
            )
            sum_kernel(
                queue,
                (kernel_dimension * npoints,),
                (1,),
                sum_buffer,
                result_buffer,
                _np.uint32(nelements // WORKGROUP_SIZE_POTENTIAL),
            )
        if remainder_size > 0:
            remainder_kernel(
                queue,
                (npoints, remainder_size),
                (1, remainder_size),
                grid_buffer,
                indices_buffer,
                normals_buffer,
                points_buffer,
                coefficients_buffer,
                quad_points_buffer,
                quad_weights_buffer,
                result_buffer,
                kernel_options_buffer,
                global_offset=(0, main_size),
            )
        _cl.enqueue_copy(queue, result, result_buffer)
    return result
def _init_normal_memory(self):
    mem_size = SmithWatermanOcl._init_normal_memory(self)

    # Input matrix device memory
    memory = (SmithWaterman.float_size *
              self.length_of_x_sequences *
              self.number_of_sequences *
              self.length_of_y_sequences *
              self.number_targets)
    if self._need_reallocation(self.d_matrix, memory):
        self.d_matrix = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory)
    mem_size += memory
    if self.gap_extension:
        if self._need_reallocation(self.d_matrix_i, memory):
            self.d_matrix_i = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory)
        mem_size += memory
        if self._need_reallocation(self.d_matrix_j, memory):
            self.d_matrix_j = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory)
        mem_size += memory

    # Maximum global device memory
    memory = (SmithWaterman.float_size *
              self.x_div_shared_x *
              self.number_of_sequences *
              self.y_div_shared_y *
              self.number_targets)
    if self._need_reallocation(self.d_global_maxima, memory):
        self.d_global_maxima = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=memory)
    mem_size += memory

    memory = (self.length_of_x_sequences *
              self.number_of_sequences *
              self.length_of_y_sequences *
              self.number_targets)
    if self._need_reallocation(self.d_global_direction, memory):
        self.d_global_direction = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                                            size=memory)
    mem_size += memory

    memory = SmithWaterman.int_size
    if self._need_reallocation(self.d_is_traceback_required, memory):
        self.d_is_traceback_required = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY,
                                                 size=memory)
        flag = numpy.zeros((1), dtype=numpy.uint32)
        cl.enqueue_fill_buffer(self.queue, self.d_is_traceback_required, flag, 0,
                               size=memory)
    return mem_size
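# Note on the fill call above: the pattern is a one-element numpy array rather
# than a numpy scalar. enqueue_fill_buffer accepts any buffer-like pattern and
# repeats it across the byte range, so the fill size must be a multiple of the
# pattern's size in bytes.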
def sync(self):
    failure = np.empty(1, dtype=np.int32)
    cl.enqueue_copy(self.queue, failure, self.global_failure, is_blocking=True)
    self.failure_is_an_option = np.int32(0)
    if failure[0] >= 0:
        # Reset failure information.
        cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1),
                               0, np.int32().itemsize)

        # Read failure args.
        failure_args = np.empty(self.global_failure_args_max + 1, dtype=np.int32)
        cl.enqueue_copy(self.queue, failure_args, self.global_failure_args,
                        is_blocking=True)

        raise Exception(self.failure_msgs[failure[0]].format(*failure_args))
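# This sync() treats global_failure as a mailbox (the same convention as the
# initialise_opencl_object listing further down, apparently from a
# Futhark-generated runtime): a kernel that fails writes a non-negative
# message index plus its arguments, and the host re-arms the word by filling
# it back to -1 before raising the formatted error.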
def core_search(partial_outputs, a):
    search_len_np = np.array(args.max_skip, dtype=np.uint32)
    result_count_cl = cl.Buffer(ctx, mem_flags.READ_WRITE, uint32_size)
    result_cl = cl.Buffer(ctx, mem_flags.WRITE_ONLY, max_results * uint32_size)
    # The fill pattern must support the buffer protocol; a bytes literal works,
    # whereas a Python 3 str ("\x00") does not.
    cl.enqueue_fill_buffer(queue, result_count_cl, b"\x00", 0, uint32_size)
    result_max_np = np.array(max_results, dtype=np.uint32)
    a_np = np.array(a, dtype=np.uint32)
    outputs_np = np.array(partial_outputs, dtype=np.uint32)
    outputs_cl = cl.Buffer(ctx, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=outputs_np)
    outputs_len_np = np.array(len(outputs_np), dtype=np.uint32)
    program.node_newer_rng(queue, [1 << 16], None,
                           search_len_np, result_max_np, a_np,
                           outputs_cl, outputs_len_np,
                           result_count_cl, result_cl)
    return result_count_cl, result_cl
def memset(self, buffer, value, size):
    """Set the memory of an OpenCL allocation to a given value.

    :param buffer: An OpenCL Buffer to fill
    :type buffer: pyopencl.Buffer

    :param value: The value to set the memory to
    :type value: a single 32-bit int

    :param size: The size of the allocation unit in bytes
    :type size: int
    """
    if isinstance(buffer, cl.Buffer):
        cl.enqueue_fill_buffer(self.queue, buffer, numpy.uint32(value), 0, size)
def test_scatter(cl_env, value_dtype, index_dtype):
    ctx, cq = cl_env
    size = 240
    nindices = 30

    indexer = Indexer(ctx, value_dtype, index_dtype)
    values = (np.random.uniform(0, 1000, (nindices,) + value_dtype.shape)
              .astype(value_dtype.base))
    indices = np.random.choice(size, size=nindices, replace=False).astype(index_dtype)

    values_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=values
    )
    values_out_buf = cl.Buffer(
        ctx, cl.mem_flags.WRITE_ONLY, size * value_dtype.itemsize
    )
    index_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=indices
    )

    e = cl.enqueue_fill_buffer(
        cq, values_out_buf, np.full(1, 1.0, value_dtype), 0,
        size * value_dtype.itemsize
    )
    e = indexer.scatter(cq, nindices, values_buf, index_buf, values_out_buf,
                        wait_for=[e])
    (values_map, _) = cl.enqueue_map_buffer(
        cq, values_out_buf, cl.map_flags.READ, 0,
        (size,) + value_dtype.shape, value_dtype.base,
        wait_for=[e], is_blocking=True
    )

    selection = np.zeros(size, dtype='bool')
    selection[indices] = True
    np.testing.assert_equal(values_map[indices], values)
    np.testing.assert_equal(values_map[~selection], 1.0)
def run(self, idx, runev):
    fillev = cl.enqueue_fill_buffer(self.queue_, self.retCnt_[idx],
                                    np.uint8(0), 0, 4)
    runev = cl.enqueue_nd_range_kernel(self.queue_, self.kernel_[idx],
                                       (self.threadCount_,), None,
                                       wait_for=[fillev, runev])
    self.queue_.flush()
    return runev
def fill(self, value):
    if not isinstance(value, np.ndarray):
        value = np.array(value)
    if len(value.shape) == 0:
        value = value.astype(self.gtype.dtype)
        cl.enqueue_fill_buffer(queue, self.buffer, value, 0,
                               self.gtype.elemsize * prod(self.shape))
    elif value.dtype != self.gtype.dtype:
        raise ValueError(
            "Matrix fill error: the given host buffer does not have the same "
            "data type as the matrix"
        )
    elif value.shape != self.shape:
        raise ValueError(
            "Matrix fill error: the given host buffer does not have the same "
            "shape as the matrix"
        )
    else:
        cl.enqueue_copy(queue, self.buffer, value)
def pol3d_to_slice2d(self, P1, P2):
    S = np.zeros((self.row_len, self.col_len), dtype=np.float64)
    cl.enqueue_copy(self.queue, self.buf_P1, P1)
    cl.enqueue_copy(self.queue, self.buf_P2, P2)
    cl.enqueue_fill_buffer(self.queue, self.buf_S, np.float64(0.0), 0, S.nbytes)
    self.prg.pol3d_to_slice2d(self.queue, (self.row_len, self.col_len), None,
                              self.buf_fmt, self.buf_P1, self.buf_P2, self.buf_S)
    cl.enqueue_copy(self.queue, S, self.buf_S)
    cl.enqueue_barrier(self.queue).wait()
    return S
def pol3d_to_cart2d(self, P1, P2):
    M = np.zeros((self.row_len, self.col_len), dtype=np.float64)
    cl.enqueue_copy(self.queue, self.buf_P1, P1)
    cl.enqueue_copy(self.queue, self.buf_P2, P2)
    cl.enqueue_fill_buffer(self.queue, self.buf_M, np.float64(0.0), 0, M.nbytes)
    self.prg.pol3d_to_cart2d(self.queue, (self.row_len, self.col_len), None,
                             self.buf_fmt, self.buf_P1, self.buf_P2, self.buf_M)
    cl.enqueue_copy(self.queue, M, self.buf_M)
    cl.enqueue_barrier(self.queue).wait()
    return M
def _is_traceback_required(self):
    '''Returns False if it is known after calculating scores that there are
    no possible starting points, hence no need to run traceback.
    '''
    flag = numpy.zeros((1), dtype=numpy.uint32)
    cl.enqueue_copy(self.queue, flag, self.d_is_traceback_required)
    if flag[0]:
        # Clear the flag
        flag[0] = 0
        cl.enqueue_fill_buffer(self.queue, self.d_is_traceback_required, flag, 0,
                               size=SmithWaterman.int_size)
        return True
    else:
        return False
def fillBuf(self, bufname, val, wait_for=None):
    self.log("fillBuf " + bufname)
    buf = self.bufs[bufname]
    buftype = np.dtype(self.buf_spec[bufname][0]).type
    self.logevt('fill_buffer',
                cl.enqueue_fill_buffer(self.queue, buf, buftype(val), 0, buf.size,
                                       wait_for=self._waitevt()))
def Solve(self, t, dt):
    cl.enqueue_fill_buffer(self.queue, self.Ku_buf, np.float64(0.0), 0,
                           self.LM.nbytes)
    cl.enqueue_fill_buffer(self.queue, self.P_buf, np.float64(0.0), 0,
                           self.LM.nbytes)

    update_u_event = cl.enqueue_copy(self.queue, self.u_buf, self.srcU)
    # update_appTrac_event = cl.enqueue_copy(self.queue, self.appTrac_buf, self.appTrac)

    calc_Ku_events = [update_u_event]
    for iColorGrp in range(len(self.colorGps_buf)):
        calc_Ku_event = \
            self.program.assemble_K_P(self.queue,
                                      (self.globalWorkSize,), (self.localWorkSize,),
                                      np.int64(len(self.mesh.colorGroups[iColorGrp])),
                                      np.int64(self.nNodes), np.int64(self.nSmp),
                                      np.float64(self.pressure),
                                      self.pVals_buf, self.nodes_buf,
                                      self.colorGps_buf[iColorGrp],
                                      self.elmTE_buf[iColorGrp],
                                      self.u_buf, self.Ku_buf, self.P_buf,
                                      wait_for=calc_Ku_events)
        calc_Ku_events = [calc_Ku_event]

    calc_u_event = \
        self.program.calc_u(self.queue,
                            (self.globalWorkSize,), (self.localWorkSize,),
                            np.int64(self.nSmp), np.int64(self.ndof), np.float64(dt),
                            self.P_buf, self.Ku_buf, self.LM_buf, self.LHS_buf,
                            self.u_buf, self.up_buf, self.ures_buf,
                            wait_for=[calc_Ku_event])

    ures_copy_event = cl.enqueue_copy(self.queue, self.srcURes, self.ures_buf,
                                      wait_for=[calc_u_event])
    ures_copy_event.wait()

    # Synchronize the ures.
    self.SyncCommNodes(self.srcURes)

    # Add on the global force.
    self.srcURes[:, :] += dt * dt * self.appTrac.reshape(self.ndof, 1) / self.LHS

    # Apply boundary condition.
    self.ApplyBoundaryCondition(self.srcURes)

    # Update/Shift the pointers.
    self.srcURes, self.srcU, self.srcUP = self.srcUP, self.srcURes, self.srcU
    self.ures_buf, self.u_buf, self.up_buf = self.up_buf, self.ures_buf, self.u_buf
def memset(self, buffer, value, size):
    """Set the memory of an OpenCL allocation to a given value.

    :param buffer: An OpenCL Buffer to fill
    :type buffer: pyopencl.Buffer

    :param value: The value to set the memory to
    :type value: a single 32-bit int

    :param size: The size of the allocation unit in bytes
    :type size: int
    """
    if isinstance(buffer, cl.Buffer):
        try:
            cl.enqueue_fill_buffer(self.queue, buffer, np.uint32(value), 0, size)
        except AttributeError:
            src = np.zeros(size, dtype='uint8') + np.uint8(value)
            cl.enqueue_copy(self.queue, buffer, src)
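# The AttributeError fallback above covers pyopencl builds too old to expose
# enqueue_fill_buffer (the call requires OpenCL 1.2). Note that the two paths
# agree only for byte-symmetric values such as 0: the fast path repeats a
# 4-byte uint32 pattern, while the fallback repeats value as a single byte.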
def cart2d_to_pol2d(self, M):
    Q1 = np.zeros((len(self.R),), dtype=np.float64)
    Q2 = np.zeros((len(self.R), len(self.A)), dtype=np.float64)
    norm = np.zeros((len(self.R),), dtype=np.float64)
    cl.enqueue_copy(self.queue, self.buf_M, M)
    cl.enqueue_fill_buffer(self.queue, self.buf_Q1e, np.float64(0.0), 0, Q1.nbytes)
    cl.enqueue_fill_buffer(self.queue, self.buf_Q2e, np.float64(0.0), 0, Q2.nbytes)
    cl.enqueue_fill_buffer(self.queue, self.buf_norm, np.float64(0.0), 0, norm.nbytes)
    self.prg.cart2d_to_pol2d_project(self.queue, (self.r_len, 1), None,
                                     self.buf_fmt, self.buf_M,
                                     self.buf_Q1e, self.buf_Q2e, self.buf_norm)
    cl.enqueue_copy(self.queue, Q1, self.buf_Q1e)
    cl.enqueue_copy(self.queue, Q2, self.buf_Q2e)
    cl.enqueue_copy(self.queue, norm, self.buf_norm)
    cl.enqueue_barrier(self.queue).wait()
    Q1 /= norm.sum()
    return Q1, Q2
def calculate_deviations(self, xsiz, zsiz, pix2nm, dt, u, g, b,
                         c2d, d2c, nu, phi, psi, theta):
    # fills up sxz array
    # set up buffers to pass between python and C++
    buf_size = xsiz * (zsiz + 1) * 4
    mf = cl.mem_flags
    self.sxz_buf = cl.Buffer(self.ctx, mf.READ_WRITE, buf_size)
    out_buf_2 = cl.Buffer(self.ctx, mf.READ_WRITE, buf_size)
    c2d_buf = cl.Buffer(self.ctx, mf.READ_ONLY, c2d.size * 4)
    d2c_buf = cl.Buffer(self.ctx, mf.READ_ONLY, d2c.size * 4)
    shape = np.array([(zsiz + 1), xsiz], dtype=np.int32)

    # set up contiguous buffers
    c2d_ = np.ascontiguousarray(c2d.ravel().astype(np.float32))
    cl.enqueue_copy(self.queue, c2d_buf, c2d_)
    d2c_ = np.ascontiguousarray(d2c.ravel().astype(np.float32))
    cl.enqueue_copy(self.queue, d2c_buf, d2c_)

    # fill with zeros as we add to initial buffer (makes handling the 2 bs easier)
    cl.enqueue_fill_buffer(self.queue, self.sxz_buf, np.float32(0.0), 0, buf_size)
    cl.enqueue_fill_buffer(self.queue, out_buf_2, np.float32(0.0), 0, buf_size)

    # the actual calculation
    # small change in z-coordinate to get derivative
    dz = 0.0
    # calculate x-z array of displacements
    self.displace_r(shape, self.sxz_buf, c2d_buf, d2c_buf, pix2nm,
                    u, g, b, c2d, nu, phi, psi, theta, dt, dz)
    # calculate second array at a small z-shift
    dz = 0.01
    self.displace_r(shape, out_buf_2, c2d_buf, d2c_buf, pix2nm,
                    u, g, b, c2d, nu, phi, psi, theta, dt, dz)
    # subtract one from the other to get the gradient
    dz_32 = np.float32(dz * dt)
    self.disp_r_prog.difference(self.queue, shape, None,
                                self.sxz_buf, out_buf_2, dz_32)
def find_offsets(self, cq, values_buf, n_values, offsets_buf, n_offsets,
                 wait_for=None):
    wait_for = wait_for or []
    fill_offsets = cl.enqueue_fill_buffer(
        cq, offsets_buf, array(n_values, dtype=self.program.offset_dtype), 0,
        n_offsets * self.program.offset_dtype.itemsize,
        wait_for=wait_for
    )
    return self.program.kernels['find_offsets'](
        cq, (n_values - 1,), None,
        values_buf, offsets_buf,
        wait_for=[fill_offsets],
    )
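# Pre-filling offsets_buf with n_values gives every slot a sentinel one past
# the last valid index, so the find_offsets kernel only has to write the
# boundaries it actually finds; slots for empty bins simply keep the sentinel.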
def forward(ctx, t, val):
    val = t.dtype.type(val)
    cl.enqueue_fill_buffer(t.device.queue, t.data, val,
                           t.offset * t.dtype.itemsize,
                           t.numel() * t.dtype.itemsize)
    return t
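# Both the offset and size arguments of enqueue_fill_buffer are in bytes, which
# is why the tensor's element offset and element count above are each scaled by
# t.dtype.itemsize. Coercing val to t.dtype first guarantees the repeated fill
# pattern is exactly one element wide.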
def enqueue_zero_buffer(self, buffer: pyopencl.array.Array) -> None:
    pyopencl.enqueue_fill_buffer(self._pyopencl_command_queue, buffer.data,
                                 np.uint8(0), 0,
                                 buffer.data.get_info(pyopencl.mem_info.SIZE))
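# For a pyopencl.array.Array the same effect is available one level up through
# the array's own fill() method; a sketch, assuming arr is such an Array and
# queue its command queue:
#
#     arr.fill(0, queue=queue)
#
# The helper above instead drops to the raw Buffer (buffer.data) and zeroes the
# whole allocation byte-wise, which is dtype-agnostic.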
def dense_assembler(device_interface, operator_descriptor, domain, dual_to_range,
                    parameters, result):
    """Assemble dense with OpenCL."""
    import bempp.api
    from bempp.api.integration.triangle_gauss import rule
    from bempp.api.utils.helpers import get_type
    from bempp.core.opencl_kernels import get_kernel_from_operator_descriptor
    from bempp.core.opencl_kernels import (
        default_context,
        default_device,
        get_vector_width,
    )

    if bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE == "gpu":
        device_type = "gpu"
    elif bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE == "cpu":
        device_type = "cpu"
    else:
        raise RuntimeError(
            f"Unknown device type {bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE}")

    mf = _cl.mem_flags
    ctx = default_context(device_type)
    device = default_device(device_type)

    precision = operator_descriptor.precision
    dtype = get_type(precision).real
    kernel_options = operator_descriptor.options

    quad_points, quad_weights = rule(parameters.quadrature.regular)

    test_indices, test_color_indexptr = dual_to_range.get_elements_by_color()
    trial_indices, trial_color_indexptr = domain.get_elements_by_color()
    number_of_test_colors = len(test_color_indexptr) - 1
    number_of_trial_colors = len(trial_color_indexptr) - 1

    options = {
        "NUMBER_OF_QUAD_POINTS": len(quad_weights),
        "TEST": dual_to_range.shapeset.identifier,
        "TRIAL": domain.shapeset.identifier,
        "TRIAL_NUMBER_OF_ELEMENTS": domain.number_of_support_elements,
        "TEST_NUMBER_OF_ELEMENTS": dual_to_range.number_of_support_elements,
        "NUMBER_OF_TEST_SHAPE_FUNCTIONS": dual_to_range.number_of_shape_functions,
        "NUMBER_OF_TRIAL_SHAPE_FUNCTIONS": domain.number_of_shape_functions,
    }

    if operator_descriptor.is_complex:
        options["COMPLEX_KERNEL"] = None

    main_kernel = get_kernel_from_operator_descriptor(
        operator_descriptor, options, "regular", device_type=device_type)
    remainder_kernel = get_kernel_from_operator_descriptor(
        operator_descriptor, options, "regular", force_novec=True,
        device_type=device_type,
    )

    test_indices_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                     hostbuf=test_indices)
    trial_indices_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=trial_indices)
    test_normals_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                     hostbuf=dual_to_range.normal_multipliers)
    trial_normals_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=domain.normal_multipliers)
    test_grid_buffer = _cl.Buffer(
        ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=dual_to_range.grid.as_array.astype(dtype),
    )
    trial_grid_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                   hostbuf=domain.grid.as_array.astype(dtype))
    test_elements_buffer = _cl.Buffer(
        ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=dual_to_range.grid.elements.ravel(order="F"),
    )
    trial_elements_buffer = _cl.Buffer(
        ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=domain.grid.elements.ravel(order="F"),
    )
    test_local2global_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                          hostbuf=dual_to_range.local2global)
    trial_local2global_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                           hostbuf=domain.local2global)
    test_multipliers_buffer = _cl.Buffer(
        ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=dual_to_range.local_multipliers.astype(dtype),
    )
    trial_multipliers_buffer = _cl.Buffer(
        ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=domain.local_multipliers.astype(dtype),
    )
    quad_points_buffer = _cl.Buffer(
        ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
        hostbuf=quad_points.ravel(order="F").astype(dtype),
    )
    quad_weights_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                     hostbuf=quad_weights.astype(dtype))
    result_buffer = _cl.Buffer(ctx, mf.READ_WRITE, size=result.nbytes)

    if not kernel_options:
        kernel_options = [0.0]

    kernel_options_array = _np.array(kernel_options, dtype=dtype)
    kernel_options_buffer = _cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                       hostbuf=kernel_options_array)

    vector_width = get_vector_width(precision, device_type=device_type)

    def kernel_runner(
        queue,
        test_offset,
        trial_offset,
        test_number_of_indices,
        trial_number_of_indices,
    ):
        """Actually run the kernel for a given range."""
        remainder_size = trial_number_of_indices % vector_width
        main_size = trial_number_of_indices - remainder_size

        buffers = [
            test_indices_buffer,
            trial_indices_buffer,
            test_normals_buffer,
            trial_normals_buffer,
            test_grid_buffer,
            trial_grid_buffer,
            test_elements_buffer,
            trial_elements_buffer,
            test_local2global_buffer,
            trial_local2global_buffer,
            test_multipliers_buffer,
            trial_multipliers_buffer,
            quad_points_buffer,
            quad_weights_buffer,
            result_buffer,
            kernel_options_buffer,
            _np.int32(dual_to_range.global_dof_count),
            _np.int32(domain.global_dof_count),
            _np.uint8(domain.grid != dual_to_range.grid),
        ]

        if main_size > 0:
            main_kernel(
                queue,
                (test_number_of_indices, main_size // vector_width),
                (1, 1),
                *buffers,
                global_offset=(test_offset, trial_offset),
            )
        if remainder_size > 0:
            remainder_kernel(
                queue,
                (test_number_of_indices, remainder_size),
                (1, 1),
                *buffers,
                global_offset=(test_offset, trial_offset + main_size),
            )

    with _cl.CommandQueue(ctx, device=device) as queue:
        _cl.enqueue_fill_buffer(queue, result_buffer, _np.uint8(0), 0, result.nbytes)
        for test_index in range(number_of_test_colors):
            test_offset = test_color_indexptr[test_index]
            n_test_indices = (test_color_indexptr[1 + test_index] -
                              test_color_indexptr[test_index])
            for trial_index in range(number_of_trial_colors):
                n_trial_indices = (trial_color_indexptr[1 + trial_index] -
                                   trial_color_indexptr[trial_index])
                trial_offset = trial_color_indexptr[trial_index]
                kernel_runner(queue, test_offset, trial_offset,
                              n_test_indices, n_trial_indices)
        _cl.enqueue_copy(queue, result, result_buffer)
def run(self, input, silent=True, deflicker=False, sidebyside=False,
        output_img=None, output_vid=None, nframes=10):
    mf = cl.mem_flags
    ctx = cl.Context(self.device)
    cmd_queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, self.src).build()

    capture = cv.VideoCapture(input)
    if capture.isOpened():
        fimg = capture.read()[1]
        fimg = fimg.astype(np.float32)
        fps = capture.get(cv.CAP_PROP_FPS)

        vidout = None
        if output_vid is not None:
            vidout = cv.VideoWriter(output_vid, cv.VideoWriter_fourcc(*"mp4v"),
                                    fps, fimg.shape[:2][::-1])

        weight = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                           hostbuf=np.float32(self.weight * fps))
        threshold = cl.Buffer(ctx,
                              mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                              hostbuf=np.float32(self.threshold))
        jweight = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                            hostbuf=np.int32(self.join_weight))
        width = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                          hostbuf=np.int32(fimg.shape[1]))
        height = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                           hostbuf=np.int32(fimg.shape[0]))

        histogram, img_histogram, lut = (None, None, None)
        if deflicker:
            histogram = cl.Buffer(
                ctx, mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                hostbuf=np.zeros(256 * 3).astype(np.int32))
            img_histogram = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=np.zeros(256 * 3).astype(np.int32))
            lut = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR | mf.HOST_NO_ACCESS,
                            hostbuf=np.zeros(256 * 3).astype(np.int32))

        img = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=fimg)
        background = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=fimg)
        _, nimg = capture.read()
        new_img = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=nimg.astype(np.float32))

        if deflicker:
            prg.cal_histogram(cmd_queue, fimg.shape[:2], None,
                              histogram, img, width, height)
            prg.fin_histogram(cmd_queue, (3,), None, histogram)

        res = np.empty_like(fimg).astype(np.float32)
        try:
            while True:
                if deflicker:
                    cl.enqueue_fill_buffer(cmd_queue, img_histogram, np.int32(0),
                                           0, 3 * 4 * 256)
                    prg.cal_histogram(cmd_queue, fimg.shape[:2], None,
                                      img_histogram, new_img, width, height)
                    prg.fin_histogram(cmd_queue, (3,), None, img_histogram)
                    prg.cal_lut(cmd_queue, (3,), None,
                                histogram, img_histogram, lut)
                    prg.deflicker(cmd_queue, fimg.shape[:2], None,
                                  lut, new_img, width, height)
                    prg.join_histogram(cmd_queue, (3,), None,
                                       histogram, img_histogram, jweight)

                prg.backsub(cmd_queue, fimg.shape[:2], None,
                            img, background, new_img,
                            width, height, weight, threshold)

                if (not silent) or (vidout is not None):
                    cl.enqueue_copy(cmd_queue, res, background)
                    if vidout is not None:
                        vidout.write(res.astype(np.uint8))
                    if not silent:
                        cv.imshow('background', res.astype(np.uint8))
                        if sidebyside:
                            cv.imshow('real', nimg)
                        if cv.waitKey(1) == 27:
                            break

                _, nimg = capture.read()
                if nimg is None:
                    break
                cl.enqueue_copy(cmd_queue, new_img, nimg.astype(np.float32))
        except IndexError:
            pass

        cl.enqueue_copy(cmd_queue, res, background)
        if output_img is not None:
            cv.imwrite(output_img, res.astype(np.uint8))
        if vidout is not None:
            vidout.write(res.astype(np.uint8))
        if not silent:
            cv.imshow('background', res.astype(np.uint8))
            cv.waitKey(0)
def initialise_opencl_object(self,
                             program_src='',
                             command_queue=None,
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_reg_tile_size=None,
                             default_threshold=None,
                             size_heuristics=[],
                             required_types=[],
                             all_sizes={},
                             user_sizes={}):
    if command_queue is None:
        self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
        self.queue = cl.CommandQueue(self.ctx)
    else:
        self.ctx = command_queue.context
        self.queue = command_queue
    self.device = self.queue.device
    self.platform = self.device.platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
    device_type = self.device.type
    check_types(self, required_types)
    max_group_size = int(self.device.max_work_group_size)
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0

    self.max_local_memory = int(self.device.local_mem_size)

    # Futhark reserves 4 bytes of local memory for its own purposes.
    self.max_local_memory -= 4

    # See comment in rts/c/opencl.h.
    if self.platform.name.find('NVIDIA CUDA') >= 0:
        self.max_local_memory -= 12
    elif self.platform.name.find('AMD') >= 0:
        self.max_local_memory -= 16

    self.free_list = {}

    self.global_failure = self.pool.allocate(np.int32().itemsize)
    cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1),
                           0, np.int32().itemsize)
    self.global_failure_args = self.pool.allocate(
        np.int64().itemsize * (self.global_failure_args_max + 1))
    self.failure_is_an_option = np.int32(0)

    # Pull explicit defaults out of user_sizes before validating the rest.
    if 'default_group_size' in user_sizes:
        default_group_size = user_sizes['default_group_size']
        del user_sizes['default_group_size']

    if 'default_num_groups' in user_sizes:
        default_num_groups = user_sizes['default_num_groups']
        del user_sizes['default_num_groups']

    if 'default_tile_size' in user_sizes:
        default_tile_size = user_sizes['default_tile_size']
        del user_sizes['default_tile_size']

    if 'default_reg_tile_size' in user_sizes:
        default_reg_tile_size = user_sizes['default_reg_tile_size']
        del user_sizes['default_reg_tile_size']

    if 'default_threshold' in user_sizes:
        default_threshold = user_sizes['default_threshold']
        del user_sizes['default_threshold']

    default_group_size_set = default_group_size != None
    default_tile_size_set = default_tile_size != None
    default_sizes = apply_size_heuristics(
        self, size_heuristics, {
            'group_size': default_group_size,
            'tile_size': default_tile_size,
            'reg_tile_size': default_reg_tile_size,
            'num_groups': default_num_groups,
            'lockstep_width': None,
            'threshold': default_threshold
        })
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    default_reg_tile_size = default_sizes['reg_tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        if default_group_size_set:
            sys.stderr.write(
                'Note: Device limits group size to {} (down from {})\n'.format(
                    max_group_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        if default_tile_size_set:
            sys.stderr.write(
                'Note: Device limits tile size to {} (down from {})\n'.format(
                    max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}\nKnown sizes: {}'.format(
                k, ' '.join(all_sizes.keys())))

    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size  # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'] == 'reg_tile_size':
            max_value = None
            default_value = default_reg_tile_size
        elif v['class'].startswith('threshold'):
            max_value = None
            default_value = default_threshold
        else:
            # Bespoke sizes have no limit or default.
            max_value = None
        if v['value'] == None:
            self.sizes[k] = default_value
        elif max_value != None and v['value'] > max_value:
            sys.stderr.write(
                'Note: Device limits {} to {} (down from {})\n'.format(
                    k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    # XXX: we perform only a subset of z-encoding here.  Really, the
    # compiler should provide us with the variables to which
    # parameters are mapped.
    if (len(program_src) >= 0):
        return cl.Program(self.ctx, program_src).build(
            ["-DLOCKSTEP_WIDTH={}".format(lockstep_width)] +
            ["-D{}={}".format(
                s.replace('z', 'zz').replace('.', 'zi').replace('#', 'zh'), v)
             for (s, v) in self.sizes.items()])
def test_overwrite_efb():
    # queue, cl_empty_buffer, zero and zero_buffer are module-level fixtures
    # defined elsewhere in this test suite.
    cl.enqueue_fill_buffer(queue, cl_empty_buffer, zero, 0,
                           zero_buffer.nbytes).wait()
def Solve(self, t, dt):
    # start = timer()
    cl.enqueue_fill_buffer(self.queue, self.Ku_buf, np.float64(0.0), 0,
                           self.LM.nbytes)
    cl.enqueue_fill_buffer(self.queue, self.P_buf, np.float64(0.0), 0,
                           self.LM.nbytes)
    # end = timer()
    # print('--- Rank: {} time 0: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))

    # start = timer()
    calc_Ku_events = []
    for iColorGrp in range(len(self.colorGps_buf)):
        calc_Ku_event = \
            self.program.assemble_K_P(self.queue,
                                      (self.globalWorkSize,), (self.localWorkSize,),
                                      np.int64(len(self.mesh.colorGroups[iColorGrp])),
                                      np.int64(self.nSmp),
                                      np.float64(self.pressure),
                                      self.pVals_buf, self.nodes_buf,
                                      self.colorGps_buf[iColorGrp],
                                      self.elmTE_buf[iColorGrp],
                                      self.u_buf, self.Ku_buf, self.P_buf,
                                      wait_for=calc_Ku_events)
        calc_Ku_events = [calc_Ku_event]
    # end = timer()
    # print('--- Rank: {} time 1: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))

    # start = timer()
    calc_u_event = \
        self.program.calc_u(self.queue,
                            (self.globalWorkSize,), (self.localWorkSize,),
                            np.int64(self.nSmp), np.int64(self.lclNDof),
                            np.float64(dt), np.float64(self.damp),
                            self.P_buf, self.Ku_buf, self.LM_buf, self.LHS_buf,
                            self.u_buf, self.up_buf, self.ures_buf,
                            wait_for=[calc_Ku_event])
    # calc_u_event.wait()  # TODO:: Comment off after debugging
    # end = timer()
    # print('--- Rank: {} time 2: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))

    # start = timer()
    ures_copy_event = cl.enqueue_copy(self.queue, self.srcURes[:self.lclNCommDof],
                                      self.ures_buf, wait_for=[calc_u_event])
    # ures_copy_event.wait()
    # end = timer()
    # print('--- Rank: {} time 3: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))

    # start = timer()
    # Synchronize the ures.
    self.SyncCommNodes(self.srcURes)
    # end = timer()
    # print('--- Rank: {} time 4: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))

    # start = timer()
    # Apply boundary condition.
    self.ApplyBoundaryCondition(self.srcURes)
    # Enforce the applied boundary condition back to GPU. <lclNSpecialHeadDof>
    update_u_event = cl.enqueue_copy(self.queue, self.ures_buf,
                                     self.srcURes[:self.lclNSpecialHeadDof])

    # Add on the global force.
    appTrac_copy_event = cl.enqueue_copy(self.queue, self.appTrac_buf, self.appTrac)
    calc_u_event = \
        self.program.calc_u_appTrac(self.queue,
                                    (self.globalWorkSize,), (self.localWorkSize,),
                                    np.int64(self.nSmp), np.int64(self.lclNDof),
                                    np.float64(dt),
                                    self.LHS_buf, self.appTrac_buf, self.ures_buf,
                                    wait_for=[update_u_event, appTrac_copy_event])
    calc_u_event.wait()
    # end = timer()
    # print('--- Rank: {} time 5: {:10.1f} ms'.format(self.rank, (end - start) * 1000.0))

    # Update/Shift the pointers.
    self.srcURes, self.srcU, self.srcUP = self.srcUP, self.srcURes, self.srcU
    self.ures_buf, self.u_buf, self.up_buf = self.up_buf, self.ures_buf, self.u_buf
prg = cl.Program(ctx, """
__kernel void sum(__global float *a_g, __global unsigned int *b_g)
{
    int tid = get_local_id(0);
    a_g[tid] = tid;
}
""").build()

# run once, just to make sure the buffer copied to gpu
q.finish()
start = time.time()
prg.sum(q, (N,), (N,), a_gpu, b_gpu)
cl.enqueue_fill_buffer(q, a_gpu, np.float32(0), 0, 8 * 4)
q.finish()
end = time.time()
print('kernel done')
print('diff', end - start)

a_doubled = np.empty_like(a)
cl.enqueue_copy(q, a_doubled, a_gpu)
b_doubled = np.empty_like(b, dtype=np.uint32)
cl.enqueue_copy(q, b_doubled, b_gpu)
print('a', a)
print('a_doubled', a_doubled)
print('b_doubled', b_doubled)
def test_compute_bounds(cl_env, kernels, coord_dtype):
    ctx, cq = cl_env

    coords = np.array([[ 0.0, 1.0, 3.0],
                       [ 4.0, 1.0, 8.0],
                       [-4.0,-6.0, 3.0],
                       [-5.0, 0.0,-1.0]], dtype=coord_dtype)
    radii = np.ones(len(coords), dtype=coord_dtype)
    leaf = len(coords) - 1
    nodes = np.array([(-1, 3, [leaf+0, 1]),
                      ( 0, 3, [leaf+3, 2]),
                      ( 1, 2, [leaf+1, leaf+2]),
                      ( 0, 0, [2, -1]),
                      ( 2, 1, [0, -1]),
                      ( 2, 2, [1, -1]),
                      ( 1, 3, [3, -1])], dtype=Node)

    coords_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, len(coords) * 4 * coord_dtype.itemsize
    )
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
        (len(coords), 4), coord_dtype, is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map
    radii_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=radii
    )
    nodes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=nodes
    )
    bounds_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, len(nodes) * 4 * 2 * coords.dtype.itemsize
    )
    flags_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, len(nodes) * np.dtype('uint32').itemsize
    )

    clear_flags = cl.enqueue_fill_buffer(
        cq, flags_buf, np.zeros(1, dtype='uint32'), 0,
        len(nodes) * np.dtype('uint32').itemsize
    )
    calc_leaf_bounds = kernels['leafBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, coords_buf, radii_buf, nodes_buf, len(coords),
    )
    calc_bounds = kernels['internalBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, flags_buf, nodes_buf, len(coords),
        wait_for=[calc_leaf_bounds, clear_flags]
    )
    (bounds_map, _) = cl.enqueue_map_buffer(
        cq, bounds_buf, cl.map_flags.READ, 0,
        (len(nodes), 2, 4), coord_dtype,
        wait_for=[calc_bounds], is_blocking=True
    )

    expected = np.array([[[-6.0,-7.0,-2.0], [ 5.0, 2.0, 9.0]],
                         [[-6.0,-1.0,-2.0], [ 5.0, 2.0, 9.0]],
                         [[-1.0, 0.0, 2.0], [ 5.0, 2.0, 9.0]],
                         [[-5.0,-7.0, 2.0], [-3.0,-5.0, 4.0]],
                         [[-1.0, 0.0, 2.0], [ 1.0, 2.0, 4.0]],
                         [[ 3.0, 0.0, 7.0], [ 5.0, 2.0, 9.0]],
                         [[-6.0,-1.0,-2.0], [-4.0, 1.0, 0.0]]], dtype=coord_dtype)
    np.testing.assert_equal(bounds_map[:, :, :3], expected)
def test_traverse(cl_env, kernels, coord_dtype):
    ctx, cq = cl_env

    coords = np.array([[ 0.0, 1.0, 3.0],
                       [ 0.0, 1.0, 3.0],
                       [ 4.0, 1.0, 8.0],
                       [-4.0,-6.0, 3.0],
                       [-5.0, 0.0,-1.0],
                       [-5.0, 0.5,-0.5]], dtype=coord_dtype)
    radii = np.ones(len(coords), dtype=coord_dtype)
    n_nodes = len(coords) * 2 - 1

    coords_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, len(coords) * 4 * coord_dtype.itemsize
    )
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
        (len(coords), 4), coord_dtype, is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map
    range_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=np.array([coords.min(axis=0), coords.max(axis=0)],
                         dtype=coords.dtype)
    )
    codes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, len(coords) * np.dtype('uint32').itemsize
    )
    radii_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=radii
    )
    nodes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, n_nodes * Node.itemsize
    )
    bounds_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, n_nodes * 4 * 2 * coords.dtype.itemsize
    )
    flags_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, n_nodes * np.dtype('uint32').itemsize
    )

    n_collisions = 2
    collisions_buf = cl.Buffer(
        ctx, cl.mem_flags.WRITE_ONLY,
        n_collisions * 2 * np.dtype('uint32').itemsize
    )
    n_collisions_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, np.dtype('uint32').itemsize
    )

    calc_codes = kernels['calculateCodes'](
        cq, (roundUp(len(coords), 32),), None,
        codes_buf, coords_buf, range_buf, len(coords),
    )

    # Would use radix sort
    (codes_map, _) = cl.enqueue_map_buffer(
        cq, codes_buf, cl.map_flags.READ | cl.map_flags.WRITE, 0,
        (len(coords),), np.dtype('uint32'),
        wait_for=[calc_codes], is_blocking=True
    )
    order = np.argsort(codes_map, kind='mergesort').astype('uint32')
    codes_map[...] = codes_map[order]
    del codes_map

    ids_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=order
    )
    fill_internal = kernels['fillInternal'](
        cq, (roundUp(len(coords), 32),), None,
        nodes_buf, ids_buf, len(coords),
    )
    generate_bvh = kernels['generateBVH'](
        cq, (roundUp(len(coords)-1, 32),), None,
        codes_buf, nodes_buf, len(coords),
        wait_for=[calc_codes, fill_internal]
    )
    clear_flags = cl.enqueue_fill_buffer(
        cq, flags_buf, np.zeros(1, dtype='uint32'), 0,
        n_nodes * np.dtype('uint32').itemsize
    )
    calc_bounds = kernels['leafBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, coords_buf, radii_buf, nodes_buf, len(coords),
        wait_for=[generate_bvh]
    )
    calc_bounds = kernels['internalBounds'](
        cq, (roundUp(len(coords), 32),), None,
        bounds_buf, flags_buf, nodes_buf, len(coords),
        wait_for=[clear_flags, calc_bounds]
    )
    clear_collisions = cl.enqueue_fill_buffer(
        cq, collisions_buf, np.array([-1], dtype='uint32'), 0,
        n_collisions * 2 * np.dtype('uint32').itemsize
    )
    clear_n_collisions = cl.enqueue_fill_buffer(
        cq, n_collisions_buf, np.zeros(1, dtype='uint32'), 0,
        np.dtype('uint32').itemsize
    )
    find_collisions = kernels['traverse'](
        cq, (roundUp(len(coords), 32),), None,
        collisions_buf, n_collisions_buf, n_collisions,
        nodes_buf, bounds_buf, len(coords),
        wait_for=[clear_collisions, clear_n_collisions, calc_bounds],
    )
    (n_collisions_map, _) = cl.enqueue_map_buffer(
        cq, n_collisions_buf, cl.map_flags.READ, 0,
        1, np.dtype('uint32'),
        wait_for=[find_collisions], is_blocking=True
    )
    assert n_collisions_map[0] == n_collisions
    (collisions_map, _) = cl.enqueue_map_buffer(
        cq, collisions_buf, cl.map_flags.READ, 0,
        (n_collisions, 2), np.dtype('uint32'),
        wait_for=[find_collisions], is_blocking=True
    )
    expected = {(0, 1), (4, 5)}
    assert set(map(tuple, collisions_map)) == expected