def assemble(self, pts, obs_tris, src_tris):
    gpu_pts = gpu.to_gpu(pts, self.float_type)
    gpu_src_tris = gpu.to_gpu(src_tris, np.int32)

    n = obs_tris.shape[0]
    out = np.empty((n, 3, 3, src_tris.shape[0], 3, 3), dtype=self.float_type)

    def call_integrator(start_idx, end_idx):
        n_items = end_idx - start_idx
        gpu_result = gpu.empty_gpu(
            (n_items, 3, 3, src_tris.shape[0], 3, 3), self.float_type
        )
        gpu_obs_tris = gpu.to_gpu(obs_tris[start_idx:end_idx], np.int32)
        self.integrator(
            gpu_result,
            np.int32(self.q[0].shape[0]), self.gpu_qx, self.gpu_qw,
            gpu_pts,
            np.int32(n_items), gpu_obs_tris,
            np.int32(src_tris.shape[0]), gpu_src_tris,
            self.gpu_params,
            grid=(n_items, src_tris.shape[0], 1), block=(1, 1, 1)
        )
        out[start_idx:end_idx] = gpu_result.get()

    call_size = 1024
    for I in gpu.intervals(n, call_size):
        call_integrator(*I)
    return out
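# A minimal sketch of the chunking helper that `assemble` relies on. Based on
# the `call_integrator(*I)` loop above, gpu.intervals(n, call_size) is assumed
# to yield (start, end) index pairs covering range(n) in chunks of at most
# call_size. This is only an illustration of that assumed behavior, not the
# library's implementation.
def intervals_sketch(n, call_size):
    edges = list(range(0, n, call_size)) + [n]
    return list(zip(edges[:-1], edges[1:]))

# e.g. intervals_sketch(2500, 1024) -> [(0, 1024), (1024, 2048), (2048, 2500)]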
def __init__(self, obs_pts, obs_ns, src_mesh, K_name, nq, params, float_type):
    self.shape = (obs_pts.shape[0] * 3, src_mesh[1].shape[0] * 9)
    self.dim = obs_pts.shape[1]
    self.tensor_dim = kernels[K_name].tensor_dim
    self.n_obs = obs_pts.shape[0]
    self.n_src = src_mesh[1].shape[0]

    # persistent GPU buffers for the flattened input and output vectors
    in_size = self.n_src * self.dim * self.tensor_dim
    out_size = self.n_obs * self.tensor_dim
    self.gpu_in = gpu.empty_gpu(in_size, float_type)
    self.gpu_out = gpu.empty_gpu(out_size, float_type)

    self.q = gauss2d_tri(nq)

    self.gpu_obs_pts = gpu.to_gpu(obs_pts, float_type)
    self.gpu_obs_ns = gpu.to_gpu(obs_ns, float_type)
    self.gpu_src_pts = gpu.to_gpu(src_mesh[0], float_type)
    self.gpu_src_tris = gpu.to_gpu(src_mesh[1], np.int32)
    self.gpu_params = gpu.to_gpu(np.array(params), float_type)

    self.block_size = 128
    self.n_blocks = int(np.ceil(self.n_obs / self.block_size))
    # the quadrature rule is baked into the kernel source via template args
    self.module = gpu.load_gpu('matrix_free.cl', tmpl_args=dict(
        block_size=self.block_size,
        float_type=gpu.np_to_c_type(float_type),
        quad_pts=self.q[0],
        quad_wts=self.q[1]
    ))
    self.fnc = getattr(self.module, "farfield_tris_to_pts" + K_name)
def __init__(self, kernel, params, float_type, nq_far, nq_near, pts, tris):
    self.float_type = float_type
    self.module = get_gpu_module(kernel, float_type)
    self.gpu_params = gpu.to_gpu(np.array(params), self.float_type)
    self.gpu_near_q = self.quad_to_gpu(gauss4d_tri(nq_near, nq_near))
    self.gpu_far_q = self.quad_to_gpu(gauss4d_tri(nq_far, nq_far))
    self.gpu_pts = gpu.to_gpu(pts, self.float_type)
    self.gpu_tris = gpu.to_gpu(tris, np.int32)
def __init__(self, kernel, params, n_q, float_type):
    self.float_type = float_type
    self.integrator = getattr(get_gpu_module(kernel, float_type), "farfield_tris")
    self.q = gauss4d_tri(n_q, n_q)
    self.gpu_qx = gpu.to_gpu(self.q[0], float_type)
    self.gpu_qw = gpu.to_gpu(self.q[1], float_type)
    self.gpu_params = gpu.to_gpu(np.array(params), float_type)
def pairs_quad(self, integrator, q, pairs_list):
    gpu_pairs_list = gpu.to_gpu(pairs_list.copy(), np.int32)
    n = pairs_list.shape[0]
    if n == 0:
        return np.empty((0, 3, 3, 3, 3), dtype=self.float_type)

    call_size = 2 ** 17
    result = np.empty((n, 3, 3, 3, 3), dtype=self.float_type)

    def call_integrator(start_idx, end_idx):
        n_pairs = end_idx - start_idx
        # block_size is assumed to be a module-level constant shared with the
        # GPU kernel launch configuration
        n_threads = int(np.ceil(n_pairs / block_size))
        gpu_result = gpu.empty_gpu((n_pairs, 3, 3, 3, 3), self.float_type)
        integrator(
            gpu_result,
            np.int32(q[0].shape[0]), q[0], q[1],
            self.gpu_pts, self.gpu_tris,
            gpu_pairs_list, np.int32(start_idx), np.int32(end_idx),
            self.gpu_params,
            grid=(n_threads, 1, 1), block=(block_size, 1, 1)
        )
        result[start_idx:end_idx] = gpu_result.get()

    # process the pairs in chunks of call_size
    for I in gpu.intervals(n, call_size):
        call_integrator(*I)
    return result
def build_vertex_mat(self, pairs, quad):
    block_size = 128
    gpu_cfg = dict(block_size=block_size, float_type=gpu.np_to_c_type(self.float_type))
    module = gpu.load_gpu('interior_corners.cl', tmpl_args=gpu_cfg, no_caching=True)

    n_pairs = pairs.shape[0]
    gpu_result = gpu.zeros_gpu((n_pairs, 3, 3, 3), self.float_type)
    gpu_pairs = gpu.to_gpu(pairs.copy(), np.int32)
    n_threads = int(np.ceil(n_pairs / block_size))
    if n_pairs != 0:
        module.interior_corners(
            gpu_result,
            np.int32(quad[0].shape[0]), quad[0], quad[1],
            self.farfield.gpu_obs_pts, self.farfield.gpu_obs_ns,
            self.farfield.gpu_src_pts, self.farfield.gpu_src_tris,
            gpu_pairs, np.int32(0), np.int32(n_pairs),
            self.farfield.gpu_params,
            grid=(n_threads, 1, 1), block=(block_size, 1, 1)
        )
    return make_pairs_mat(pairs, gpu_result.get(), self.farfield.shape)
def interior_pairs_quad(K_name, pairs_list, gpu_quad, gpu_obs_pts, gpu_obs_ns,
                        gpu_src_pts, gpu_src_tris, gpu_params, float_type, finite_part):
    n_pairs = pairs_list.shape[0]
    gpu_result = gpu.zeros_gpu((n_pairs, 3, 3, 3), float_type)
    gpu_pairs_list = gpu.to_gpu(pairs_list.copy(), np.int32)
    module = get_gpu_module(K_name, float_type)
    n_threads = int(np.ceil(n_pairs / block_size))
    if n_pairs != 0:
        module.interior_pairs(
            gpu_result,
            np.int32(gpu_quad[0].shape[0]), gpu_quad[0], gpu_quad[1],
            gpu_obs_pts, gpu_obs_ns, gpu_src_pts, gpu_src_tris,
            gpu_pairs_list, np.int32(0), np.int32(n_pairs),
            gpu_params,
            np.int32(1 if finite_part else 0),
            grid=(n_threads, 1, 1), block=(block_size, 1, 1)
        )
    return gpu_result.get()
def test_async_get():
    R = np.random.rand(10)
    gpu_R = gpu.to_gpu(R, np.float32)

    async def f(w):
        return await gpu.get(w, gpu_R)

    R2 = taskloaf.run(f)
    np.testing.assert_almost_equal(R, R2)
def farfield_pts_direct(K, obs_pts, obs_ns, src_pts, src_ns, vec, params, float_type):
    module = get_gpu_module(float_type)
    fnc = getattr(module, "farfield_pts" + K)

    n_obs, dim = obs_pts.shape
    n_src = src_pts.shape[0]
    # the input vector is flat, with tensor_dim entries per source point
    tensor_dim = int(vec.shape[0] / n_src)

    gpu_result = gpu.empty_gpu(n_obs * tensor_dim, float_type)
    gpu_obs_pts = gpu.to_gpu(obs_pts, float_type)
    gpu_obs_ns = gpu.to_gpu(obs_ns, float_type)
    gpu_src_pts = gpu.to_gpu(src_pts, float_type)
    gpu_src_ns = gpu.to_gpu(src_ns, float_type)
    gpu_vec = gpu.to_gpu(vec, float_type)
    gpu_params = gpu.to_gpu(np.array(params), float_type)

    n_blocks = int(np.ceil(n_obs / block_size))
    fnc(
        gpu_result,
        gpu_obs_pts, gpu_obs_ns, gpu_src_pts, gpu_src_ns,
        gpu_vec, gpu_params,
        np.int32(n_obs), np.int32(n_src),
        grid=(n_blocks, 1, 1), block=(block_size, 1, 1)
    )
    return gpu_result.get()
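# Hedged usage sketch for farfield_pts_direct, assuming a working GPU backend.
# The kernel suffix 'U' and the params values are hypothetical placeholders,
# not names confirmed by this code; the array shapes follow from how the
# function indexes its arguments (point/normal arrays are (n, dim) and vec is
# a flat vector of length n_src * tensor_dim).
def example_farfield_pts_direct():
    obs_pts = np.random.rand(100, 3)
    obs_ns = np.random.rand(100, 3)
    src_pts = np.random.rand(200, 3)
    src_ns = np.random.rand(200, 3)
    vec = np.random.rand(200 * 3)   # flat source vector, so tensor_dim == 3
    params = [1.0, 0.25]            # hypothetical kernel parameters
    result = farfield_pts_direct(
        'U', obs_pts, obs_ns, src_pts, src_ns, vec, params, np.float32
    )
    # one tensor_dim-vector per observation point
    return result.reshape((100, 3))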
async def gpu_run():
    # gd = tsk.get_service('gpu_data')
    # if 'add' not in gd:
    #     gd['add'] = (fnc, arg, gpu_R)
    # else:
    #     fnc, arg, gpu_R = gd['add']
    module = load_module()
    fnc = module.add

    R = np.random.rand(10000000)
    gpu_R = gpu.to_gpu(R)
    gpu_out = gpu.empty_gpu(gpu_R.shape)
    fnc(gpu_out, gpu_R, grid=(gpu_R.shape[0], 1, 1), block=(1, 1, 1))
    R2 = await gpu.get(gpu_out)
    gpu.logger.debug('run')
def test_simple_module():
    n = 10
    in_arr = np.random.rand(n)
    arg = 1.0
    this_dir = os.path.dirname(os.path.realpath(__file__))
    modules = [
        gpu.load_gpu('kernel.cl', tmpl_dir=this_dir, tmpl_args=dict(arg=arg)),
        gpu.load_gpu_from_code(
            open(os.path.join(this_dir, 'kernel.cl')).read(),
            tmpl_args=dict(arg=arg)
        )
    ]
    for m in modules:
        fnc = m.add
        in_gpu = gpu.to_gpu(in_arr, np.float32)
        out_gpu = gpu.empty_gpu(n, np.float32)
        fnc(out_gpu, in_gpu, grid=(n, 1, 1), block=(1, 1, 1))
        output = out_gpu.get()
        correct = in_arr + arg
        np.testing.assert_almost_equal(correct, output)
def int_gpu(self, arr):
    return gpu.to_gpu(arr, np.int32)
def float_gpu(self, arr):
    return gpu.to_gpu(arr, self.cfg.float_type)
def quad_to_gpu(self, q):
    return [gpu.to_gpu(arr, self.float_type) for arr in q]
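# Hedged usage note: quad_to_gpu pushes every array of a quadrature rule to the
# GPU in one call. In the nearfield __init__ above it wraps gauss4d_tri output,
# which (judging from how q[0]/q[1] are used as points and weights elsewhere in
# this code) is assumed to be a (points, weights) pair:
#
#   gpu_qx, gpu_qw = self.quad_to_gpu(gauss4d_tri(5, 5))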