Пример #1
0
def make_kernel(delays, n_thread_per_block, n_inner):
    """Build a CUDA kernel that advances a delay-coupled phase network.

    The coupling is assembled from the pre-summation expression
    ``sin(xj - xi)`` and the post-summation expression ``rcp_n * gx`` with
    ``rcp_n = 1 / delays.shape[0]``, and handed to ``cu_delay_cfun`` together
    with the history horizon.

    Parameters
    ----------
    delays : array
        Only ``delays.max()`` (for the horizon) and ``delays.shape[0]`` (for
        ``rcp_n``) are read here.  Presumably integer step delays between
        nodes -- confirm against the caller.
    n_thread_per_block : int
        Converted with ``int32``; passed to ``cu_delay_cfun`` and used as the
        last dimension of the shared afferent array inside the kernel.
    n_inner : int
        Converted with ``int32``; number of integration steps performed per
        kernel launch.

    Returns
    -------
    tuple
        ``(horizon, kernel)`` where ``horizon`` is
        ``next_pow_of_2(delays.max() + 1)`` and ``kernel`` is the
        ``cuda.jit``-compiled function.
    """
    horizon = next_pow_of_2(delays.max() + 1)
    cfpre = cu_expr('sin(xj - xi)', ('xi', 'xj'), {})
    cfpost = cu_expr('rcp_n * gx', ('gx', ), {'rcp_n': 1.0 / delays.shape[0]})
    n_thread_per_block = int32(n_thread_per_block)
    n_inner = int32(n_inner)
    dcf = cu_delay_cfun(horizon, cfpre, cfpost, 1, n_thread_per_block)

    @cuda.jit
    def kernel(step, state, update, buf, dt, omega, cvars, weights, delays,
               a_values, s_values, Z):
        # Each thread uses its own coupling strength ``a`` and noise scale
        # ``s`` and reads/writes only the ``i_thread`` column of the arrays.
        i_t = cuda.threadIdx.x
        i_thread = cuda.blockIdx.x * cuda.blockDim.x + i_t
        aff = cuda.shared.array((1, 1, 1, n_thread_per_block), float32)
        a = a_values[i_thread]
        # sqrt(dt) is computed once and reused for the stochastic term.
        sqrt_dt = math.sqrt(dt)
        s = sqrt_dt * math.sqrt(2.0 * s_values[i_thread])
        for i_step in range(n_inner):
            # First pass: evaluate the increment of every node from the
            # current state, so no node sees a partially updated state.
            for i_post in range(weights.shape[0]):
                dcf(aff, delays, weights, state, i_post, i_thread, step[0],
                    cvars, buf)
                update[i_post, i_thread] = (
                    dt * (omega + a * aff[0, 0, 0, i_t])
                    + s * Z[i_step, i_post, i_thread])
            # Second pass: apply the increments.
            for i_post in range(weights.shape[0]):
                state[0, i_post, 0, i_thread] += update[i_post, i_thread]
            # NOTE(review): only global thread 0 advances the shared step
            # counter, and cuda.syncthreads() is a block-level barrier --
            # confirm this is safe when launched with more than one block or
            # more than one warp per block.
            if i_thread == 0:
                step[0] += 1
            cuda.syncthreads()

    return horizon, kernel
Пример #2
0
    def _do_for_params(self, horizon):
        """Check the delayed coupling device function for a given horizon.

        Random delays, weights and a random state trajectory are generated,
        a kernel feeds the trajectory step by step through the device
        function built by ``cu_delay_cfun``, and two things are asserted:

        * the history buffer ``buf`` ends up holding the last ``horizon``
          steps of the coupling variables, rolled by ``n_step``;
        * for steps after ``horizon + 3`` the afferent values written to
          ``out`` match a NumPy replay computing the weighted sum of the
          delayed coupling variables.

        Parameters
        ----------
        horizon : int
            Length of the history buffer; delays are drawn from
            ``[0, horizon - 2)``.
        """
        # Seed first and keep the order of the random draws below unchanged
        # so the generated test data stays reproducible.
        numpy.random.seed(42)
        n_step, n_node, n_cvar, n_svar, n_thread = 100, 5, 2, 4, self.n_thread
        cvars = numpy.random.randint(0, n_svar, n_cvar).astype(numpy.int32)
        out = numpy.zeros((n_step, n_node, n_cvar, n_thread), numpy.float32)
        delays = numpy.random.randint(
            0, horizon - 2, (n_node, n_node)).astype(numpy.int32)
        weights = numpy.random.randn(n_node, n_node).astype(numpy.float32)
        # Zero out roughly a quarter of the connections.
        weights[numpy.random.rand(*weights.shape) < 0.25] = 0.0
        state = numpy.random.randn(
            n_step, n_node, n_svar, n_thread).astype(numpy.float32)
        buf = numpy.zeros((n_node, horizon, n_cvar, n_thread), numpy.float32)

        # Build the device function under test.  The replay below is a plain
        # weighted sum, so these pre/post parameters are presumably an
        # identity mapping -- confirm against cu_linear_cfe_pre/post.
        pre = cu_linear_cfe_pre(0.0, 1.0, 0.0)
        post = cu_linear_cfe_post(1.0, 0.0)
        dcf = cu_delay_cfun(horizon,
                            pre,
                            post,
                            n_cvar,
                            self.block_dim[0],
                            step_stride=1,
                            aff_node_stride=1)

        # Run every step and every target node through the device function.
        @self.jit_and_run(out, delays, weights, state, cvars, buf)
        def kernel(out, delays, weights, state, cvars, buf):
            i_thread = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
            for step in range(state.shape[0]):
                for i_post in range(state.shape[1]):
                    dcf(out, delays, weights, state, i_post, i_thread, step,
                        cvars, buf)

        # Coupling variables only: (n_step, n_node, n_cvar, n_thread).
        # Fancy indexing copies, so take it once instead of once per step.
        cvar_state = state[:, :, cvars]

        # Ensure the buffer is updating correctly.
        expected_buf = numpy.roll(
            cvar_state[-horizon:].transpose((1, 0, 2, 3)), n_step, axis=1)
        numpy.testing.assert_allclose(buf, expected_buf)

        # NOTE: buffer time indexing is not checked directly; that would need
        # the device function to expose the delayed step it reads, which is
        # expected to equal (n_step - 1 - delays + horizon) % horizon.

        # Replay the coupling in NumPy and compare with the kernel output.
        nodes = numpy.tile(numpy.r_[:n_node], (n_node, 1))
        pair_weights = weights.reshape((n_node, n_node, 1, 1))
        for step in range(horizon + 3, n_step):
            # (n_node, n_node, n_cvar, n_thread)
            delayed_state = cvar_state[step - delays, nodes]
            # (n_node, n_cvar, n_thread)
            afferent = (pair_weights * delayed_state).sum(axis=1)
            numpy.testing.assert_allclose(
                afferent, out[step], rtol=1e-5, atol=1e-6)
Пример #3
0
    def test_kuramoto(self):
        """Compare a CUDA Kuramoto integration with the Python simulator.

        One Python simulator is built per thread, each with a different
        coupling strength ``a``; the same network is then integrated on the
        device with one ``a`` value per thread, launching the kernel once per
        time step.  The two trajectories must agree within a loose tolerance.
        """
        # build & run Python simulations
        numpy.random.seed(42)
        n = 5

        # Chain network: node i receives from node i + 1 with weight and
        # delay both equal to i + 1.
        weights = numpy.zeros((n, n), numpy.float32)
        idelays = numpy.zeros((n, n), numpy.int32)
        for i in range(n - 1):
            idelays[i, i + 1] = i + 1
            weights[i, i + 1] = i + 1

        def gen_sim(a):
            """Return a configured simulator with coupling strength ``a``."""
            dt = 0.1
            conn = connectivity.Connectivity()
            conn.weights = weights
            # With speed 1.0 these tract lengths correspond to idelays steps.
            conn.tract_lengths = idelays * dt
            conn.speed = 1.0
            sim = simulator.Simulator(
                coupling=py_coupling.Kuramoto(a=a),
                connectivity=conn,
                model=models.Kuramoto(omega=100 * 2 * numpy.pi / 1e3),
                monitors=monitors.Raw(),
                integrator=integrators.EulerDeterministic(dt=dt))
            sim.configure()
            # Same constant history as the device buffer below.
            sim.history[:] = 0.1
            return sim

        a_values = numpy.r_[:self.n_thread].astype(numpy.float32)
        sims = [gen_sim(a) for a in a_values]

        # One trajectory per simulator, stacked along the first axis.
        py_data = numpy.array([
            numpy.array([y[0, :, 0]
                         for ((_t, y),) in sim(simulation_length=10.0)])
            for sim in sims
        ])

        # build CUDA kernels
        cfpre = cu_expr('sin(xj - xi)', ('xi', 'xj'), {})
        cfpost = cu_expr('rcp_n * gx', ('gx', ), {'rcp_n': 1.0 / n})
        horiz2 = next_pow_of_2(sims[0].horizon)
        dcf = cu_delay_cfun(horiz2,
                            cfpre,
                            cfpost,
                            1,
                            self.block_dim[0],
                            aff_node_stride=1)

        # build kernel
        dt = numba.float32(sims[0].integrator.dt)
        omega = numba.float32(sims[0].model.omega[0])
        cvars = numpy.array([0], numpy.int32)
        weights = sims[0].connectivity.weights.astype(numpy.float32)
        delays = sims[0].connectivity.idelays.astype(numpy.int32)

        @cuda.jit
        def kernel(step, state, coupling, aff, buf, dt, omega, cvars, weights,
                   delays, a_values):
            i_thread = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
            a = a_values[i_thread]
            for i_post in range(weights.shape[0]):
                dcf(aff, delays, weights, state, i_post, i_thread, step[0],
                    cvars, buf)
                scaled = a * aff[0, i_post, 0, i_thread]
                coupling[i_post, i_thread] = scaled
                state[0, i_post, 0, i_thread] += dt * (omega + scaled)

        step = numpy.array([0], numpy.int32)
        state = (numpy.zeros(
            (1, n, 1, self.n_thread)) + 0.1).astype(numpy.float32)
        coupling0 = numpy.zeros((n, self.n_thread), numpy.float32)
        aff = numpy.zeros((1, n, 1, self.n_thread), numpy.float32)
        buf = numpy.zeros((n, horiz2, 1, self.n_thread), numpy.float32)
        buf += 0.1

        cu_data = numpy.zeros(py_data.shape, numpy.float32)
        for step_ in range(cu_data.shape[1]):
            step[0] = step_
            # Numba's launch configuration is kernel[griddim, blockdim]; the
            # block size must match the self.block_dim[0] given to dcf above.
            kernel[self.grid_dim,
                   self.block_dim](step, state, coupling0, aff, buf, dt, omega,
                                   cvars, weights, delays, a_values)
            cu_data[:, step_] = state[0, :, 0].T

        # accept higher error because it accumulates over time
        # TODO test error proportional to time
        numpy.testing.assert_allclose(cu_data, py_data, rtol=1e-2, atol=1e-2)