def make_kernel(delays, n_thread_per_block, n_inner):
    """Build a CUDA kernel stepping a delay-coupled phase model with noise.

    The delayed coupling function is assembled from the pre-summation
    expression ``sin(xj - xi)`` and the post-summation expression
    ``rcp_n * gx`` where ``rcp_n = 1 / delays.shape[0]``.  On every inner
    step the kernel adds ``dt * (omega + a * aff) + s * Z[...]`` to the
    state, with ``s = sqrt(dt) * sqrt(2 * s_values[i_thread])``.

    Parameters
    ----------
    delays : array
        Delay array; only ``delays.max()`` and ``delays.shape[0]`` are read
        here (the kernel itself receives its own ``delays`` argument).
    n_thread_per_block : int
        Number of threads per block; sizes the shared ``aff`` array and is
        forwarded to ``cu_delay_cfun``.
    n_inner : int
        Number of steps performed by one kernel launch.

    Returns
    -------
    tuple
        ``(horizon, kernel)`` where ``horizon`` is
        ``next_pow_of_2(delays.max() + 1)`` and ``kernel`` is the
        ``cuda.jit``-compiled function.
    """
    horizon = next_pow_of_2(delays.max() + 1)
    cfpre = cu_expr('sin(xj - xi)', ('xi', 'xj'), {})
    cfpost = cu_expr('rcp_n * gx', ('gx', ), {'rcp_n': 1.0 / delays.shape[0]})
    # Cast to int32 so the values captured by the kernel closure have a
    # fixed integer type.
    n_thread_per_block = int32(n_thread_per_block)
    n_inner = int32(n_inner)
    dcf = cu_delay_cfun(horizon, cfpre, cfpost, 1, n_thread_per_block)

    @cuda.jit
    def kernel(step, state, update, buf, dt, omega, cvars,
               weights, delays, a_values, s_values, Z):
        # step     : one-element array; step[0] is the current step counter
        # state    : indexed as [0, i_post, 0, i_thread]
        # update   : scratch array indexed as [i_post, i_thread]
        # Z        : noise samples indexed as [i_step, i_post, i_thread]
        #            (presumably standard normal draws -- confirm with caller)
        # a_values, s_values : per-thread coupling scale and noise parameter
        i_t = cuda.threadIdx.x
        i_thread = cuda.blockIdx.x * cuda.blockDim.x + i_t
        # One afferent slot per thread of the block.
        aff = cuda.shared.array((1, 1, 1, n_thread_per_block), float32)
        a = a_values[i_thread]
        # sqrt(dt) is computed once and reused for the noise amplitude.
        sqrt_dt = math.sqrt(dt)
        s = sqrt_dt * math.sqrt(2.0 * s_values[i_thread])
        n_node = weights.shape[0]
        for i_step in range(n_inner):
            # First pass: evaluate the increment of every node from the
            # current state, without modifying the state yet.
            for i_post in range(n_node):
                dcf(aff, delays, weights, state, i_post, i_thread, step[0],
                    cvars, buf)
                update[i_post, i_thread] = (
                    dt * (omega + a * aff[0, 0, 0, i_t])
                    + s * Z[i_step, i_post, i_thread])
            # Second pass: apply the increments.
            for i_post in range(n_node):
                state[0, i_post, 0, i_thread] += update[i_post, i_thread]
            # NOTE(review): only global thread 0 advances step[0], and
            # cuda.syncthreads() synchronises the threads of one block only;
            # with several blocks (or warps still inside the loops above)
            # step[0] may be observed at different values -- confirm this is
            # acceptable for the intended launch configuration.
            if i_thread == 0:
                step[0] += 1
            cuda.syncthreads()

    return horizon, kernel
def _do_for_params(self, horizon):
    """Check the delayed coupling function against a NumPy replay.

    Random weights, delays and state are generated with a fixed seed, the
    delayed coupling function is run on the device for every step and node,
    and then two things are asserted: the history buffer holds the last
    ``horizon`` steps of the coupling variables, and the afferent output of
    each step past ``horizon + 3`` matches a weighted sum of delayed states
    computed with NumPy.
    """
    # --- test data (the order of the RNG calls must not change) ---
    numpy.random.seed(42)
    n_step = 100
    n_node = 5
    n_cvar = 2
    n_svar = 4
    n_thread = self.n_thread
    cvars = numpy.random.randint(0, n_svar, n_cvar).astype(numpy.int32)
    out = numpy.zeros((n_step, n_node, n_cvar, n_thread), numpy.float32)
    delays = numpy.random.randint(
        0, horizon - 2, (n_node, n_node)).astype(numpy.int32)
    weights = numpy.random.randn(n_node, n_node).astype(numpy.float32)
    # zero out the connections whose random draw falls below 0.25
    weights[numpy.random.rand(*weights.shape) < 0.25] = 0.0
    state = numpy.random.randn(
        n_step, n_node, n_svar, n_thread).astype(numpy.float32)
    buf = numpy.zeros((n_node, horizon, n_cvar, n_thread), numpy.float32)

    # debugging aid, only referenced by the disabled checks noted below
    delayed_step = numpy.zeros_like(delays)

    # --- device functions ---
    cf_pre = cu_linear_cfe_pre(0.0, 1.0, 0.0)
    cf_post = cu_linear_cfe_post(1.0, 0.0)
    dcf = cu_delay_cfun(horizon, cf_pre, cf_post, n_cvar, self.block_dim[0],
                        step_stride=1, aff_node_stride=1)

    # --- run on the device ---
    # (debug variant: additionally pass delayed_step to jit_and_run, to the
    #  kernel signature and to dcf)
    @self.jit_and_run(out, delays, weights, state, cvars, buf)
    def kernel(out, delays, weights, state, cvars, buf):
        i_thread = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
        for i_step in range(state.shape[0]):
            for i_post in range(state.shape[1]):
                dcf(out, delays, weights, state, i_post, i_thread, i_step,
                    cvars, buf)

    # coupling variables only: (n_step, n_node, n_cvar, n_thread)
    cvar_state = state[:, :, cvars]

    # --- the buffer must hold the last `horizon` steps, rolled in time ---
    recent = cvar_state[-horizon:].transpose((1, 0, 2, 3))
    expected_buf = numpy.roll(recent, n_step, axis=1)
    numpy.testing.assert_allclose(buf, expected_buf)

    # disabled check of the buffer time indexing:
    # numpy.testing.assert_equal(delayed_step, (n_step - 1 - delays + horizon) % horizon)

    # --- replay the coupling with NumPy ---
    src_node = numpy.tile(numpy.arange(n_node), (n_node, 1))
    w4 = weights[:, :, numpy.newaxis, numpy.newaxis]
    for t in range(horizon + 3, n_step):
        # (n_node, n_node, n_cvar, n_thread)
        delayed = cvar_state[t - delays, src_node]
        # (n_node, n_cvar, n_thread)
        expected_aff = numpy.sum(w4 * delayed, axis=1)
        numpy.testing.assert_allclose(
            expected_aff, out[t], rtol=1e-5, atol=1e-6)
def test_kuramoto(self):
    """Compare a CUDA Kuramoto kernel against reference Python simulations.

    One Python simulation is built per thread, each with a different
    coupling strength ``a``; the same network is then stepped with a CUDA
    kernel and the two trajectories are compared with a loose tolerance.
    """
    # build & run Python simulations
    numpy.random.seed(42)
    n = 5
    weights = numpy.zeros((n, n), numpy.float32)
    idelays = numpy.zeros((n, n), numpy.int32)
    # chain network: node i connects to node i + 1 with weight and delay i + 1
    for i in range(n - 1):
        idelays[i, i + 1] = i + 1
        weights[i, i + 1] = i + 1

    def gen_sim(a):
        # Build and configure one reference simulation for coupling strength a.
        dt = 0.1
        conn = connectivity.Connectivity()
        conn.weights = weights
        # with speed 1.0 the tract lengths are the delays expressed in time
        conn.tract_lengths = idelays * dt
        conn.speed = 1.0
        sim = simulator.Simulator(
            coupling=py_coupling.Kuramoto(a=a),
            connectivity=conn,
            model=models.Kuramoto(omega=100 * 2 * numpy.pi / 1e3),
            monitors=monitors.Raw(),
            integrator=integrators.EulerDeterministic(dt=dt))
        sim.configure()
        sim.history[:] = 0.1
        return sim

    a_values = numpy.r_[:self.n_thread].astype(numpy.float32)
    sims = [gen_sim(a) for a in a_values]
    py_data = []
    for sim in sims:
        # each iteration yields a 1-tuple holding the (time, data) pair
        ys = [y[0, :, 0] for ((_t, y),) in sim(simulation_length=10.0)]
        py_data.append(numpy.array(ys))
    py_data = numpy.array(py_data)

    # build CUDA kernels
    cfpre = cu_expr('sin(xj - xi)', ('xi', 'xj'), {})
    cfpost = cu_expr('rcp_n * gx', ('gx', ), {'rcp_n': 1.0 / n})
    horiz2 = next_pow_of_2(sims[0].horizon)
    dcf = cu_delay_cfun(horiz2, cfpre, cfpost, 1, self.block_dim[0],
                        aff_node_stride=1)

    # build kernel
    dt = numba.float32(sims[0].integrator.dt)
    omega = numba.float32(sims[0].model.omega[0])
    cvars = numpy.array([0], numpy.int32)
    weights = sims[0].connectivity.weights.astype(numpy.float32)
    delays = sims[0].connectivity.idelays.astype(numpy.int32)

    @cuda.jit
    def kernel(step, state, coupling, aff, buf, dt, omega, cvars,
               weights, delays, a_values):
        i_thread = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        a = a_values[i_thread]
        for i_post in range(weights.shape[0]):
            dcf(aff, delays, weights, state, i_post, i_thread, step[0],
                cvars, buf)
            coupling[i_post, i_thread] = a * aff[0, i_post, 0, i_thread]
            state[0, i_post, 0, i_thread] += dt * (
                omega + a * aff[0, i_post, 0, i_thread])

    step = numpy.array([0], numpy.int32)
    # state and history buffer start at 0.1, like sim.history above
    state = (numpy.zeros(
        (1, n, 1, self.n_thread)) + 0.1).astype(numpy.float32)
    coupling0 = numpy.zeros((n, self.n_thread), numpy.float32)
    aff = numpy.zeros((1, n, 1, self.n_thread), numpy.float32)
    buf = numpy.zeros((n, horiz2, 1, self.n_thread), numpy.float32)
    buf += 0.1
    cu_data = numpy.zeros(py_data.shape, numpy.float32)
    # per-step coupling values; recorded but not asserted on
    cu_coupling0 = numpy.zeros((cu_data.shape[1], ) + coupling0.shape)
    for step_ in range(cu_data.shape[1]):
        step[0] = step_
        # Numba's launch configuration is kernel[griddim, blockdim];
        # block_dim[0] is also the threads-per-block value dcf was built with.
        kernel[self.grid_dim, self.block_dim](
            step, state, coupling0, aff, buf, dt, omega, cvars, weights,
            delays, a_values)
        cu_data[:, step_] = state[0, :, 0].T
        cu_coupling0[step_] = coupling0

    # accept higher error because it accumulates over time
    # TODO test error proportional to time
    numpy.testing.assert_allclose(cu_data, py_data, 1e-2, 1e-2)