def _do_for_params(self, horizon):
    """Check cu_delay_cfun against a NumPy replay of the delayed, weighted state."""
    # setup test data
    numpy.random.seed(42)
    n_step, n_node, n_cvar, n_svar, n_thread = 100, 5, 2, 4, self.n_thread
    cvars = numpy.random.randint(0, n_svar, n_cvar).astype(numpy.int32)
    out = numpy.zeros((n_step, n_node, n_cvar, n_thread), numpy.float32)
    delays = numpy.random.randint(0, horizon - 2, (n_node, n_node)).astype(numpy.int32)
    weights = numpy.random.randn(n_node, n_node).astype(numpy.float32)
    weights[numpy.random.rand(*weights.shape) < 0.25] = 0.0
    state = numpy.random.randn(n_step, n_node, n_svar, n_thread).astype(numpy.float32)
    buf = numpy.zeros((n_node, horizon, n_cvar, n_thread), numpy.float32)
    # debugging
    delayed_step = numpy.zeros_like(delays)

    # setup cu functions
    pre = cu_linear_cfe_pre(0.0, 1.0, 0.0)
    post = cu_linear_cfe_post(1.0, 0.0)
    dcf = cu_delay_cfun(horizon, pre, post, n_cvar, self.block_dim[0],
                        step_stride=1, aff_node_stride=1)

    # run it
    @self.jit_and_run(out, delays, weights, state, cvars, buf)  # , delayed_step)
    def kernel(out, delays, weights, state, cvars, buf):  # , delayed_step):
        i_thread = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
        for step in range(state.shape[0]):
            for i_post in range(state.shape[1]):
                dcf(out, delays, weights, state, i_post, i_thread, step,
                    cvars, buf)  # , delayed_step)

    # ensure buffer is updating correctly
    buf_state = numpy.roll(
        state[:, :, cvars][-horizon:].transpose((1, 0, 2, 3)), n_step, axis=1)
    numpy.testing.assert_allclose(buf, buf_state)

    # ensure buffer time indexing is correct
    # numpy.testing.assert_equal(delayed_step, (n_step - 1 - delays + horizon) % horizon)

    # replay
    nodes = numpy.tile(numpy.r_[:n_node], (n_node, 1))
    for step in range(horizon + 3, n_step):
        delayed_state = state[:, :, cvars][step - delays, nodes]  # (n_node, n_node, n_cvar, n_thread)
        afferent = (weights.reshape((n_node, n_node, 1, 1))
                    * delayed_state).sum(axis=1)  # (n_node, n_cvar, n_thread)
        numpy.testing.assert_allclose(afferent, out[step], 1e-5, 1e-6)
def test_linear_pre(self):
    """Check the linear pre expression: out = ai * xi + aj * xj + intercept."""
    ai, aj, intercept = -0.2, 0.3, 0.25
    out = numpy.zeros((self.n_thread,), 'f')
    xj, xi = numpy.random.rand(2, self.n_thread).astype('f')
    pre = cu_linear_cfe_pre(ai, aj, intercept)

    @self.jit_and_run(out, xi, xj)
    def kernel(out, xi, xj):
        t = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
        out[t] = pre(xi[t], xj[t])

    numpy.testing.assert_allclose(out, ai * xi + aj * xj + intercept, 1e-4, 1e-5)