def test_pinv_complex128(self):
    """Compare the GPU pseudoinverse of a random 8x4 complex128 matrix
    with the result of `np.linalg.pinv`."""
    real_part = np.random.rand(8, 4)
    imag_part = np.random.rand(8, 4)
    host_matrix = np.asarray(real_part + 1j*imag_part, np.complex128)

    # Compute the pseudoinverse on the device:
    device_pinv = linalg.pinv(gpuarray.to_gpu(host_matrix))

    # Compare against the host reference:
    reference = np.linalg.pinv(host_matrix)
    assert np.allclose(reference, device_pinv.get(), atol=atol_float64)
def test_pinv_float64(self):
    """Compare the GPU pseudoinverse of a random 8x4 float64 matrix
    with the result of `np.linalg.pinv`."""
    host_matrix = np.asarray(np.random.rand(8, 4), np.float64)

    # Compute the pseudoinverse on the device:
    device_pinv = linalg.pinv(gpuarray.to_gpu(host_matrix))

    # Compare against the host reference:
    reference = np.linalg.pinv(host_matrix)
    assert np.allclose(reference, device_pinv.get(), atol=atol_float64)
def test_pinv_float32(self):
    """Compare the GPU pseudoinverse of a random 8x4 float32 matrix
    with the result of `np.linalg.pinv`."""
    host_matrix = np.asarray(np.random.rand(8, 4), np.float32)

    # Compute the pseudoinverse on the device:
    device_pinv = linalg.pinv(gpuarray.to_gpu(host_matrix))

    # Compare against the host reference:
    reference = np.linalg.pinv(host_matrix)
    assert np.allclose(reference, device_pinv.get(), atol=atol_float32)
def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0):
    """
    IAF time decoding machine.

    Decode a finite length signal encoded with an Integrate-and-Fire
    neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in
        s). Must have dtype float32 or float64 and contain at least two
        interspike intervals.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance. An infinite value selects the ideal kernels;
        any other value selects the leaky kernels.
    C : float
        Neuron capacitance.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal.

    Raises
    ------
    ValueError
        If the dtype of `s` is neither float32 nor float64, or if `s`
        contains fewer than two interspike intervals.
    """

    N = len(s)
    float_type = s.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        pinv_rcond = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        pinv_rcond = 1e-8
    else:
        raise ValueError('unsupported data type')

    # The system solved below has N-1 rows; fail early with a clear
    # message rather than attempting zero-sized (or negative-sized) GPU
    # allocations and kernel launches:
    if N < 2:
        raise ValueError('at least two interspike intervals are required')

    # Prepare kernels:
    compute_ts_mod = \
        SourceModule(compute_ts_template.substitute(use_double=use_double))
    compute_ts = \
        compute_ts_mod.get_function('compute_ts')

    compute_tsh_mod = \
        SourceModule(compute_tsh_template.substitute(use_double=use_double))
    compute_tsh = \
        compute_tsh_mod.get_function('compute_tsh')

    compute_q_mod = \
        SourceModule(compute_q_template.substitute(use_double=use_double))
    compute_q_ideal = \
        compute_q_mod.get_function('compute_q_ideal')
    compute_q_leaky = \
        compute_q_mod.get_function('compute_q_leaky')

    # The number of columns of G is baked into the kernel source:
    compute_G_mod = \
        SourceModule(compute_G_template.substitute(use_double=use_double,
                                                   cols=(N-1)),
                     options=['-I', install_headers])
    compute_G_ideal = compute_G_mod.get_function('compute_G_ideal')
    compute_G_leaky = compute_G_mod.get_function('compute_G_leaky')

    compute_u_mod = \
        SourceModule(compute_u_template.substitute(use_double=use_double),
                     options=["-I", install_headers])
    compute_u = compute_u_mod.get_function('compute_u')

    # Load data into device memory:
    s_gpu = gpuarray.to_gpu(s)

    # Set up GPUArrays for intermediary data:
    ts_gpu = gpuarray.empty(N, float_type)
    tsh_gpu = gpuarray.empty(N - 1, float_type)
    q_gpu = gpuarray.empty((N - 1, 1), complex_type)
    G_gpu = gpuarray.empty((N - 1, N - 1), complex_type)

    # Get required block/grid sizes for constructing ts, tsh, and q;
    # use a smaller block size than the maximum to prevent the kernels
    # from using too many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128
    block_dim_s, grid_dim_s = \
        cumisc.select_block_grid_sizes(dev, s_gpu.shape,
                                       max_threads_per_block)

    # Get required block/grid sizes for constructing G:
    block_dim_G, grid_dim_G = \
        cumisc.select_block_grid_sizes(dev, G_gpu.shape,
                                       max_threads_per_block)

    # Run the kernels:
    compute_ts(s_gpu, ts_gpu, np.uint32(N),
               block=block_dim_s, grid=grid_dim_s)
    compute_tsh(ts_gpu, tsh_gpu, np.uint32(N - 1),
                block=block_dim_s, grid=grid_dim_s)
    if np.isinf(R):
        compute_q_ideal(s_gpu, q_gpu,
                        float_type(b), float_type(d), float_type(C),
                        np.uint32(N - 1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_G_ideal(ts_gpu, tsh_gpu, G_gpu,
                        float_type(bw), np.uint32((N - 1)**2),
                        block=block_dim_G, grid=grid_dim_G)
    else:
        compute_q_leaky(s_gpu, q_gpu,
                        float_type(b), float_type(d),
                        float_type(R), float_type(C),
                        np.uint32(N - 1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_G_leaky(ts_gpu, tsh_gpu, G_gpu,
                        float_type(bw), float_type(R), float_type(C),
                        np.uint32((N - 1)**2),
                        block=block_dim_G, grid=grid_dim_G)

    # Free unneeded s and ts to provide more memory to the pinv computation:
    del s_gpu, ts_gpu

    # Compute the reconstruction coefficients:
    c_gpu = culinalg.dot(culinalg.pinv(G_gpu, pinv_rcond), q_gpu)

    # Free unneeded G, G_inv and q:
    del G_gpu, q_gpu

    # Allocate array for reconstructed signal. The zeros are created on
    # the host and copied over because of a bug in gpuarray.zeros in
    # pycuda 2011.1.2; switch to gpuarray.zeros(Nt, complex_type) once
    # that bug is fixed:
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Get required block/grid sizes for constructing u:
    block_dim_t, grid_dim_t = \
        cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u(u_rec_gpu, c_gpu, tsh_gpu,
              float_type(bw), float_type(dt),
              np.uint32(Nt), np.uint32(N - 1),
              block=block_dim_t, grid=grid_dim_t)
    u_rec = u_rec_gpu.get()

    # The reconstruction is carried out with complex arrays; only the
    # real part is returned:
    return np.real(u_rec)
def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu, R_gpu, C_gpu):
    """
    Multiple-input single-output IAF time decoding machine.

    Reconstruct a signal from the spike trains of an ensemble of
    Integrate-and-Fire neurons, assuming that the encoded signal is
    representable in terms of sinc kernels.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders; the first dimension
        indexes the spike trains. The nonzero values represent the
        time between spikes (in s). Must be float32 or float64.
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The ideal kernels are used only
        when every entry is infinite.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal (real part of the reconstruction, copied back
        to the host).

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is unsupported, if no spike trains are
        given, or if the parameter arrays do not all hold one entry per
        spike train.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.
    """

    # Kernel precision flag, complex dtype and pinv cutoff for each
    # supported element type:
    precision = {np.float32: (0, np.complex64, 1e-4),
                 np.float64: (1, np.complex128, 1e-8)}
    float_type = s_gpu.dtype.type
    if float_type not in precision:
        raise ValueError('unsupported data type')
    use_double, complex_type, rcond = precision[float_type]

    # Number of spike trains:
    N = s_gpu.shape[0]
    if N == 0:
        raise ValueError('no spike data given')
    per_neuron_arrays = (ns_gpu, b_gpu, d_gpu, R_gpu, C_gpu)
    if any(arr.size != N for arr in per_neuron_arrays):
        raise ValueError('parameter arrays must be of same length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    # A block size smaller than the maximum keeps the kernels from
    # using too many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128

    # Compile the kernels:
    cache_dir = None
    header_opts = ['-I', install_headers]

    q_source = compute_q_pop_template.substitute(use_double=use_double)
    q_mod = SourceModule(q_source, cache_dir=cache_dir)
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    ts_source = compute_ts_pop_template.substitute(use_double=use_double)
    ts_mod = SourceModule(ts_source, cache_dir=cache_dir)
    ts_kernel = ts_mod.get_function('compute_ts')

    tsh_source = compute_tsh_pop_template.substitute(use_double=use_double)
    tsh_mod = SourceModule(tsh_source, cache_dir=cache_dir)
    tsh_kernel = tsh_mod.get_function('compute_tsh')

    G_source = compute_G_pop_template.substitute(use_double=use_double)
    G_mod = SourceModule(G_source, options=header_opts)
    G_ideal_kernel = G_mod.get_function('compute_G_ideal')
    G_leaky_kernel = G_mod.get_function('compute_G_leaky')

    u_source = compute_u_pop_template.substitute(use_double=use_double)
    u_mod = SourceModule(u_source, options=['-I', install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Total number of interspike intervals per neuron less 1 for each
    # spike train with more than 1 interspike interval:
    Nq = int(np.sum(ns) - np.sum(ns > 1))

    # Intermediary data; q and G are complex to enable use of CUBLAS
    # matrix multiplication functions:
    ts_gpu = gpuarray.zeros_like(s_gpu)
    tsh_gpu = gpuarray.zeros_like(s_gpu)
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    G_gpu = gpuarray.empty((Nq, Nq), complex_type)

    # Launch configurations:
    block_dim_ts, grid_dim_ts = \
        cumisc.select_block_grid_sizes(dev, N, max_threads_per_block)
    block_dim_q, grid_dim_q = \
        cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                       max_threads_per_block)
    block_dim_G, grid_dim_G = \
        cumisc.select_block_grid_sizes(dev, G_gpu.shape,
                                       max_threads_per_block)

    # Scalar kernel arguments that are reused by several launches:
    n_cols = np.uint32(s_gpu.shape[1])
    n_trains = np.uint32(N)

    # Launch kernels:
    ts_kernel(s_gpu, ns_gpu, ts_gpu, n_cols, n_trains,
              block=block_dim_ts, grid=grid_dim_ts)
    tsh_kernel(ts_gpu, ns_gpu, tsh_gpu, n_cols, n_trains,
               block=block_dim_q, grid=grid_dim_q)

    all_ideal = np.all(np.isinf(R_gpu.get()))
    if all_ideal:
        q_ideal_kernel(s_gpu, q_gpu, b_gpu, d_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=block_dim_q, grid=grid_dim_q)
        G_ideal_kernel(ts_gpu, tsh_gpu, G_gpu, float_type(bw),
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.uint32(Nq), n_cols, np.uint32(G_gpu.size),
                       block=block_dim_G, grid=grid_dim_G)
    else:
        q_leaky_kernel(s_gpu, q_gpu, b_gpu, d_gpu, R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=block_dim_q, grid=grid_dim_q)
        G_leaky_kernel(ts_gpu, tsh_gpu, G_gpu, float_type(bw),
                       R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.uint32(Nq), n_cols, np.uint32(G_gpu.size),
                       block=block_dim_G, grid=grid_dim_G)

    # Free unneeded variables:
    del ts_gpu, idx_to_k_gpu

    # Compute the reconstruction coefficients:
    c_gpu = culinalg.dot(culinalg.pinv(G_gpu, rcond), q_gpu)

    # Free G and q (the temporary pseudoinverse is already released):
    del G_gpu, q_gpu

    # Allocate the output array. The zeros are created on the host and
    # copied over because of a bug in gpuarray.zeros; switch to
    # gpuarray.zeros(Nt, complex_type) once that bug is fixed:
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Launch configuration for constructing u:
    block_dim_t, grid_dim_t = \
        cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    u_kernel(u_rec_gpu, c_gpu, tsh_gpu, ns_gpu,
             float_type(bw), float_type(dt),
             n_cols, n_trains, np.uint32(Nt),
             block=block_dim_t, grid=grid_dim_t)

    return np.real(u_rec_gpu.get())
# Demo script: compute pseudoinverses of random 50x50 matrices on the GPU
# with culinalg.pinv and compare them against np.linalg.pinv.
# NOTE(review): `pycuda.autoinit` is referenced below but is not imported
# in the visible part of this script - confirm it is imported earlier.
import pycuda.gpuarray as gpuarray
import numpy as np

import scikits.cuda.linalg as culinalg
import scikits.cuda.misc as cumisc
# Initialize the linear algebra module before any of its routines are used:
culinalg.init()

# Double precision is only supported by devices with compute
# capability >= 1.3:
import string
import scikits.cuda.cula as cula
demo_types = [np.float32, np.complex64]
# The double precision types are only added when the CULA toolkit is
# reported as 'premium' and the device is capable enough:
if cula._libcula_toolkit == 'premium' and \
       cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])

for t in demo_types:
    print 'Testing pinv for type ' + str(np.dtype(t))
    # Random matrix with entries in [-0.05, 0.05):
    a = np.asarray((np.random.rand(50, 50) - 0.5) / 10, t)
    a_gpu = gpuarray.to_gpu(a)
    a_inv_gpu = culinalg.pinv(a_gpu)

    # Compare the GPU result with the NumPy reference:
    print 'Success status: ', np.allclose(np.linalg.pinv(a), a_inv_gpu.get(),
                                          atol=1e-2)
    print 'Maximum error: ', np.max(
        np.abs(np.linalg.pinv(a) - a_inv_gpu.get()))
    print ''
def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu, R_gpu, C_gpu,
                   M=5, smoothing=0.0):
    """
    Population IAF time decoding machine.

    Reconstruct a signal from the spike trains of an ensemble of
    Integrate-and-Fire neurons, assuming that the encoded signal is
    representable in terms of trigonometric polynomials.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders; the first dimension
        indexes the encoders. The nonzero values represent the time
        between spikes (in s). Must be float32 or float64.
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The ideal kernels are used only
        when every entry is infinite.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal (real part of the reconstruction, copied back
        to the host).

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is unsupported, if no spike data is
        given, if the parameter arrays do not all hold one entry per
        encoder, or if 2*pi*M/bw is smaller than `dur`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.
    """

    # Kernel precision flag, complex dtype and pinv cutoff for each
    # supported element type:
    precision = {np.float32: (0, np.complex64, 1e-4),
                 np.float64: (1, np.complex128, 1e-8)}
    float_type = s_gpu.dtype.type
    if float_type not in precision:
        raise ValueError('unsupported data type')
    use_double, complex_type, rcond = precision[float_type]

    # Sanity checks on the ensemble parameters:
    N = s_gpu.shape[0]
    if N == 0:
        raise ValueError('no spike data given')
    per_neuron_arrays = (ns_gpu, b_gpu, d_gpu, R_gpu, C_gpu)
    if any(arr.size != N for arr in per_neuron_arrays):
        raise ValueError('parameter arrays must be of same length')

    period = 2*np.pi*M/bw
    if period < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    dev = cumisc.get_current_device()

    # A block size smaller than the maximum keeps the kernels from
    # using too many registers:
    max_threads_per_block = 256

    # Compile the kernels:
    cache_dir = None

    ts_source = compute_ts_pop_template.substitute(use_double=use_double)
    ts_mod = SourceModule(ts_source, cache_dir=cache_dir)
    ts_kernel = ts_mod.get_function('compute_ts')

    q_source = compute_q_pop_template.substitute(use_double=use_double)
    q_mod = SourceModule(q_source, cache_dir=cache_dir)
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    F_source = compute_F_pop_template.substitute(use_double=use_double)
    F_mod = SourceModule(F_source, cache_dir=cache_dir,
                         options=['-I', install_headers])
    F_ideal_kernel = F_mod.get_function('compute_F_ideal')
    F_leaky_kernel = F_mod.get_function('compute_F_leaky')

    u_source = compute_u_pop_template.substitute(use_double=use_double)
    u_mod = SourceModule(u_source, cache_dir=cache_dir,
                         options=['-I', install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Total number of interspike intervals, less 1 for each spike
    # train with more than 1 interspike interval:
    Nq = int(np.sum(ns)-np.sum(ns>1))

    # Intermediary data; q and F are complex to enable use of CUBLAS
    # matrix multiplication functions:
    ts_gpu = gpuarray.zeros_like(s_gpu)
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    F_gpu = gpuarray.empty((Nq, 2*M+1), complex_type)

    # Launch configurations:
    block_dim_ts, grid_dim_ts = \
        cumisc.select_block_grid_sizes(dev, N, max_threads_per_block)
    block_dim_q, grid_dim_q = \
        cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                       max_threads_per_block)
    block_dim_F, grid_dim_F = \
        cumisc.select_block_grid_sizes(dev, F_gpu.shape,
                                       max_threads_per_block)

    # Scalar kernel argument reused by several launches:
    n_cols = np.uint32(s_gpu.shape[1])

    # Launch kernels:
    ts_kernel(s_gpu, ns_gpu, ts_gpu, n_cols, np.uint32(N),
              block=block_dim_ts, grid=grid_dim_ts)

    all_ideal = np.all(np.isinf(R_gpu.get()))
    if all_ideal:
        q_ideal_kernel(s_gpu, q_gpu, b_gpu, d_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=block_dim_q, grid=grid_dim_q)
        F_ideal_kernel(s_gpu, ts_gpu, F_gpu, float_type(bw),
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.int32(M), n_cols, np.uint32(F_gpu.size),
                       block=block_dim_F, grid=grid_dim_F)
    else:
        q_leaky_kernel(s_gpu, q_gpu, b_gpu, d_gpu, R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=block_dim_q, grid=grid_dim_q)
        F_leaky_kernel(s_gpu, ts_gpu, F_gpu, float_type(bw),
                       R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.int32(M), n_cols, np.uint32(F_gpu.size),
                       block=block_dim_F, grid=grid_dim_F)

    # Free unneeded variables:
    del s_gpu, ts_gpu, idx_to_ni_gpu, idx_to_k_gpu

    # Compute the product of F^H and q first so that both F^H and q
    # can be dropped from memory:
    FH_gpu = culinalg.hermitian(F_gpu)
    FHq_gpu = culinalg.dot(FH_gpu, q_gpu)
    del FH_gpu, q_gpu

    # Build the (optionally regularized) matrix to be inverted:
    gram_gpu = culinalg.dot(F_gpu, F_gpu, 'c')
    if smoothing != 0:
        gram_gpu = gram_gpu + \
            np.sum(ns)*smoothing*culinalg.eye(2*M+1, float_type)

    # Solve for the reconstruction coefficients, releasing each
    # temporary as soon as it is no longer needed:
    gram_inv_gpu = culinalg.pinv(gram_gpu, rcond)
    del gram_gpu
    c_gpu = culinalg.dot(gram_inv_gpu, FHq_gpu)
    del gram_inv_gpu

    # Allocate the output array. The zeros are created on the host and
    # copied over because of a bug in gpuarray.zeros in pycuda
    # 2011.1.2; switch to gpuarray.zeros(Nt, complex_type) once that
    # bug is fixed:
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Launch configuration for the reconstruction:
    block_dim_t, grid_dim_t = \
        cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    u_kernel(u_rec_gpu, c_gpu, float_type(bw), float_type(dt),
             np.int32(M), np.uint32(Nt),
             block=block_dim_t, grid=grid_dim_t)

    return np.real(u_rec_gpu.get())
def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0, M=5, smoothing=0.0):
    """
    IAF time decoding machine.

    Decode a finite length signal encoded with an Integrate-and-Fire
    neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in
        s). Must have dtype float32 or float64 and contain at least two
        interspike intervals.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance. An infinite value selects the ideal kernels;
        any other value selects the leaky kernels.
    C : float
        Neuron capacitance.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal.

    Raises
    ------
    ValueError
        If the dtype of `s` is neither float32 nor float64, if
        2*pi*M/bw is smaller than `dur`, or if `s` contains fewer than
        two interspike intervals.
    """

    N = len(s)
    float_type = s.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        pinv_rcond = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        pinv_rcond = 1e-8
    else:
        raise ValueError('unsupported data type')

    T = 2*np.pi*M/bw
    if T < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # q and F below have N-1 rows; fail early with a clear message
    # rather than attempting zero-sized (or negative-sized) GPU
    # allocations and kernel launches:
    if N < 2:
        raise ValueError('at least two interspike intervals are required')

    dev = cumisc.get_current_device()

    # Prepare kernels:
    cache_dir = None
    compute_q_mod = \
        SourceModule(compute_q_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_q_ideal = compute_q_mod.get_function('compute_q_ideal')
    compute_q_leaky = compute_q_mod.get_function('compute_q_leaky')

    compute_F_mod = \
        SourceModule(compute_F_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_F_ideal = compute_F_mod.get_function('compute_F_ideal')
    compute_F_leaky = compute_F_mod.get_function('compute_F_leaky')

    compute_u_mod = \
        SourceModule(compute_u_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_u = compute_u_mod.get_function('compute_u')

    # Load data into GPU memory:
    s_gpu = gpuarray.to_gpu(s)

    # XXX: Eventually replace this with a PyCUDA equivalent
    ts = np.cumsum(s)
    ts_gpu = gpuarray.to_gpu(ts)

    # Set up GPUArrays for intermediary data. Note that all of the
    # arrays are complex to facilitate use of CUBLAS matrix
    # multiplication functions:
    q_gpu = gpuarray.empty((N-1, 1), complex_type)
    F_gpu = gpuarray.empty((N-1, 2*M+1), complex_type)

    # Get required block/grid sizes; use a smaller block size than the
    # maximum to prevent the kernels from using too many registers:
    max_threads_per_block = 256
    block_dim_s, grid_dim_s = \
        cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                       max_threads_per_block)
    block_dim_F, grid_dim_F = \
        cumisc.select_block_grid_sizes(dev, F_gpu.shape,
                                       max_threads_per_block)

    if np.isinf(R):
        compute_q_ideal(s_gpu, q_gpu,
                        float_type(b), float_type(d), float_type(C),
                        np.uint32(N-1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_F_ideal(s_gpu, ts_gpu, F_gpu,
                        float_type(bw), np.int32(M),
                        np.uint32((N-1)*(2*M+1)),
                        block=block_dim_F, grid=grid_dim_F)
    else:
        compute_q_leaky(s_gpu, q_gpu,
                        float_type(b), float_type(d),
                        float_type(R), float_type(C),
                        np.uint32(N-1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_F_leaky(s_gpu, ts_gpu, F_gpu,
                        float_type(bw), float_type(R), float_type(C),
                        np.int32(M),
                        np.uint32((N-1)*(2*M+1)),
                        block=block_dim_F, grid=grid_dim_F)

    # Compute the product of F^H and q first so that q
    # can be dropped from memory:
    FHq_gpu = culinalg.dot(F_gpu, q_gpu, 'c')
    del q_gpu

    if smoothing == 0:
        c_gpu = culinalg.dot(
            culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c'), pinv_rcond),
            FHq_gpu)
    else:
        # Add the smoothing term, scaled by the number of rows of F,
        # to the diagonal before inverting:
        c_gpu = culinalg.dot(
            culinalg.pinv(
                culinalg.dot(F_gpu, F_gpu, 'c') +
                (N-1)*smoothing*culinalg.eye(2*M+1, float_type),
                pinv_rcond),
            FHq_gpu)

    # Allocate array for reconstructed signal. The zeros are created on
    # the host and copied over because of a bug in gpuarray.zeros in
    # pycuda 2011.1.2; switch to gpuarray.zeros(Nt, complex_type) once
    # that bug is fixed:
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Get required block/grid sizes:
    block_dim_t, grid_dim_t = \
        cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u(u_rec_gpu, c_gpu,
              float_type(bw), float_type(dt),
              np.int32(M), np.uint32(Nt),
              block=block_dim_t, grid=grid_dim_t)

    # The reconstruction is carried out with complex arrays; only the
    # real part is returned:
    return np.real(u_rec_gpu.get())
""" import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np import scikits.cuda.linalg as culinalg import scikits.cuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string import scikits.cuda.cula as cula demo_types = [np.float32, np.complex64] if cula._libcula_toolkit == "premium" and cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print "Testing pinv for type " + str(np.dtype(t)) a = np.asarray((np.random.rand(50, 50) - 0.5) / 10, t) a_gpu = gpuarray.to_gpu(a) a_inv_gpu = culinalg.pinv(a_gpu) print "Success status: ", np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), atol=1e-2) print "Maximum error: ", np.max(np.abs(np.linalg.pinv(a) - a_inv_gpu.get())) print ""
def pinv(a, rcond=1e-15):
    """Copy `a` to the GPU with `gpuarray.to_gpu`, apply `linalg.pinv`
    with the cutoff `rcond`, and return the result of its `get()`."""
    a_gpu = gpuarray.to_gpu(a)
    a_pinv_gpu = linalg.pinv(a_gpu, rcond)
    return a_pinv_gpu.get()
def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu, R_gpu, C_gpu,
                   M=5, smoothing=0.0):
    """
    Population IAF time decoding machine.

    Reconstruct a signal from the spike trains of an ensemble of
    Integrate-and-Fire neurons, assuming that the encoded signal is
    representable in terms of trigonometric polynomials.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders; the first dimension
        indexes the encoders. The nonzero values represent the time
        between spikes (in s). Must be float32 or float64.
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The ideal kernels are used only
        when every entry is infinite.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal (real part of the reconstruction, copied back
        to the host).

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is unsupported, if no spike data is
        given, if the parameter arrays do not all hold one entry per
        encoder, or if 2*pi*M/bw is smaller than `dur`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.
    """

    # Kernel precision flag, complex dtype and pinv cutoff for each
    # supported element type:
    precision = {np.float32: (0, np.complex64, 1e-4),
                 np.float64: (1, np.complex128, 1e-8)}
    float_type = s_gpu.dtype.type
    if float_type not in precision:
        raise ValueError('unsupported data type')
    use_double, complex_type, rcond = precision[float_type]

    # Sanity checks on the ensemble parameters:
    N = s_gpu.shape[0]
    if N == 0:
        raise ValueError('no spike data given')
    per_neuron_arrays = (ns_gpu, b_gpu, d_gpu, R_gpu, C_gpu)
    if any(arr.size != N for arr in per_neuron_arrays):
        raise ValueError('parameter arrays must be of same length')

    period = 2 * np.pi * M / bw
    if period < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    dev = cumisc.get_current_device()

    # A block size smaller than the maximum keeps the kernels from
    # using too many registers:
    max_threads_per_block = 256

    # Compile the kernels:
    cache_dir = None

    ts_source = compute_ts_pop_template.substitute(use_double=use_double)
    ts_mod = SourceModule(ts_source, cache_dir=cache_dir)
    ts_kernel = ts_mod.get_function('compute_ts')

    q_source = compute_q_pop_template.substitute(use_double=use_double)
    q_mod = SourceModule(q_source, cache_dir=cache_dir)
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    F_source = compute_F_pop_template.substitute(use_double=use_double)
    F_mod = SourceModule(F_source, cache_dir=cache_dir,
                         options=['-I', install_headers])
    F_ideal_kernel = F_mod.get_function('compute_F_ideal')
    F_leaky_kernel = F_mod.get_function('compute_F_leaky')

    u_source = compute_u_pop_template.substitute(use_double=use_double)
    u_mod = SourceModule(u_source, cache_dir=cache_dir,
                         options=['-I', install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Total number of interspike intervals, less 1 for each spike
    # train with more than 1 interspike interval:
    Nq = int(np.sum(ns) - np.sum(ns > 1))

    # Intermediary data; q and F are complex to enable use of CUBLAS
    # matrix multiplication functions:
    ts_gpu = gpuarray.zeros_like(s_gpu)
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    F_gpu = gpuarray.empty((Nq, 2 * M + 1), complex_type)

    # Launch configurations:
    block_dim_ts, grid_dim_ts = \
        cumisc.select_block_grid_sizes(dev, N, max_threads_per_block)
    block_dim_q, grid_dim_q = \
        cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                       max_threads_per_block)
    block_dim_F, grid_dim_F = \
        cumisc.select_block_grid_sizes(dev, F_gpu.shape,
                                       max_threads_per_block)

    # Scalar kernel argument reused by several launches:
    n_cols = np.uint32(s_gpu.shape[1])

    # Launch kernels:
    ts_kernel(s_gpu, ns_gpu, ts_gpu, n_cols, np.uint32(N),
              block=block_dim_ts, grid=grid_dim_ts)

    all_ideal = np.all(np.isinf(R_gpu.get()))
    if all_ideal:
        q_ideal_kernel(s_gpu, q_gpu, b_gpu, d_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=block_dim_q, grid=grid_dim_q)
        F_ideal_kernel(s_gpu, ts_gpu, F_gpu, float_type(bw),
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.int32(M), n_cols, np.uint32(F_gpu.size),
                       block=block_dim_F, grid=grid_dim_F)
    else:
        q_leaky_kernel(s_gpu, q_gpu, b_gpu, d_gpu, R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=block_dim_q, grid=grid_dim_q)
        F_leaky_kernel(s_gpu, ts_gpu, F_gpu, float_type(bw),
                       R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.int32(M), n_cols, np.uint32(F_gpu.size),
                       block=block_dim_F, grid=grid_dim_F)

    # Free unneeded variables:
    del s_gpu, ts_gpu, idx_to_ni_gpu, idx_to_k_gpu

    # Compute the product of F^H and q first so that both F^H and q
    # can be dropped from memory:
    FH_gpu = culinalg.hermitian(F_gpu)
    FHq_gpu = culinalg.dot(FH_gpu, q_gpu)
    del FH_gpu, q_gpu

    # Build the (optionally regularized) matrix to be inverted:
    gram_gpu = culinalg.dot(F_gpu, F_gpu, 'c')
    if smoothing != 0:
        gram_gpu = gram_gpu + \
            np.sum(ns) * smoothing * culinalg.eye(2 * M + 1, float_type)

    # Solve for the reconstruction coefficients, releasing each
    # temporary as soon as it is no longer needed:
    gram_inv_gpu = culinalg.pinv(gram_gpu, rcond)
    del gram_gpu
    c_gpu = culinalg.dot(gram_inv_gpu, FHq_gpu)
    del gram_inv_gpu

    # Allocate the output array. The zeros are created on the host and
    # copied over because of a bug in gpuarray.zeros in pycuda
    # 2011.1.2; switch to gpuarray.zeros(Nt, complex_type) once that
    # bug is fixed:
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Launch configuration for the reconstruction:
    block_dim_t, grid_dim_t = \
        cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    u_kernel(u_rec_gpu, c_gpu, float_type(bw), float_type(dt),
             np.int32(M), np.uint32(Nt),
             block=block_dim_t, grid=grid_dim_t)

    return np.real(u_rec_gpu.get())
def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0, M=5, smoothing=0.0):
    """
    IAF time decoding machine.

    Decode a finite length signal encoded with an Integrate-and-Fire
    neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in
        s). Must have dtype float32 or float64 and contain at least two
        interspike intervals.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance. An infinite value selects the ideal kernels;
        any other value selects the leaky kernels.
    C : float
        Neuron capacitance.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal.

    Raises
    ------
    ValueError
        If the dtype of `s` is neither float32 nor float64, if
        2*pi*M/bw is smaller than `dur`, or if `s` contains fewer than
        two interspike intervals.
    """

    N = len(s)
    float_type = s.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        pinv_rcond = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        pinv_rcond = 1e-8
    else:
        raise ValueError('unsupported data type')

    T = 2 * np.pi * M / bw
    if T < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # q and F below have N - 1 rows; fail early with a clear message
    # rather than attempting zero-sized (or negative-sized) GPU
    # allocations and kernel launches:
    if N < 2:
        raise ValueError('at least two interspike intervals are required')

    dev = cumisc.get_current_device()

    # Prepare kernels:
    cache_dir = None
    compute_q_mod = \
        SourceModule(compute_q_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_q_ideal = compute_q_mod.get_function('compute_q_ideal')
    compute_q_leaky = compute_q_mod.get_function('compute_q_leaky')

    compute_F_mod = \
        SourceModule(compute_F_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_F_ideal = compute_F_mod.get_function('compute_F_ideal')
    compute_F_leaky = compute_F_mod.get_function('compute_F_leaky')

    compute_u_mod = \
        SourceModule(compute_u_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_u = compute_u_mod.get_function('compute_u')

    # Load data into GPU memory:
    s_gpu = gpuarray.to_gpu(s)

    # XXX: Eventually replace this with a PyCUDA equivalent
    ts = np.cumsum(s)
    ts_gpu = gpuarray.to_gpu(ts)

    # Set up GPUArrays for intermediary data. Note that all of the
    # arrays are complex to facilitate use of CUBLAS matrix
    # multiplication functions:
    q_gpu = gpuarray.empty((N - 1, 1), complex_type)
    F_gpu = gpuarray.empty((N - 1, 2 * M + 1), complex_type)

    # Get required block/grid sizes; use a smaller block size than the
    # maximum to prevent the kernels from using too many registers:
    max_threads_per_block = 256
    block_dim_s, grid_dim_s = cumisc.select_block_grid_sizes(
        dev, q_gpu.shape, max_threads_per_block)
    block_dim_F, grid_dim_F = cumisc.select_block_grid_sizes(
        dev, F_gpu.shape, max_threads_per_block)

    if np.isinf(R):
        compute_q_ideal(s_gpu, q_gpu,
                        float_type(b), float_type(d), float_type(C),
                        np.uint32(N - 1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_F_ideal(s_gpu, ts_gpu, F_gpu,
                        float_type(bw), np.int32(M),
                        np.uint32((N - 1) * (2 * M + 1)),
                        block=block_dim_F, grid=grid_dim_F)
    else:
        compute_q_leaky(s_gpu, q_gpu,
                        float_type(b), float_type(d),
                        float_type(R), float_type(C),
                        np.uint32(N - 1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_F_leaky(s_gpu, ts_gpu, F_gpu,
                        float_type(bw), float_type(R), float_type(C),
                        np.int32(M),
                        np.uint32((N - 1) * (2 * M + 1)),
                        block=block_dim_F, grid=grid_dim_F)

    # Compute the product of F^H and q first so that q
    # can be dropped from memory:
    FHq_gpu = culinalg.dot(F_gpu, q_gpu, 'c')
    del q_gpu

    if smoothing == 0:
        c_gpu = culinalg.dot(
            culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c'), pinv_rcond),
            FHq_gpu)
    else:
        # Add the smoothing term, scaled by the number of rows of F,
        # to the diagonal before inverting:
        c_gpu = culinalg.dot(
            culinalg.pinv(
                culinalg.dot(F_gpu, F_gpu, 'c') +
                (N - 1) * smoothing * culinalg.eye(2 * M + 1, float_type),
                pinv_rcond),
            FHq_gpu)

    # Allocate array for reconstructed signal. The zeros are created on
    # the host and copied over because of a bug in gpuarray.zeros in
    # pycuda 2011.1.2; switch to gpuarray.zeros(Nt, complex_type) once
    # that bug is fixed:
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Get required block/grid sizes:
    block_dim_t, grid_dim_t = \
        cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u(u_rec_gpu, c_gpu,
              float_type(bw), float_type(dt),
              np.int32(M), np.uint32(Nt),
              block=block_dim_t, grid=grid_dim_t)

    # The reconstruction is carried out with complex arrays; only the
    # real part is returned:
    return np.real(u_rec_gpu.get())
def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0):
    """
    IAF time decoding machine.

    Reconstruct a finite length signal from the interspike intervals
    produced by an Integrate-and-Fire neuron. All of the heavy
    computation is performed on the current CUDA device; only the
    final result is copied back to host memory.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes
        (in s). Its dtype must be float32 or float64; the same
        precision is used for the GPU computation.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance. An infinite value selects the ideal
        (non-leaky) kernels.
    C : float
        Neuron capacitance.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal.

    Raises
    ------
    ValueError
        If the dtype of `s` is neither float32 nor float64.
    """

    N = len(s)
    float_type = s.dtype.type

    # Settings that depend on the working precision: the flag handed to
    # the kernel templates, the matching complex type, and the cutoff
    # passed to pinv.
    precision = {
        np.float32: (0, np.complex64, 1e-4),
        np.float64: (1, np.complex128, 1e-8),
    }.get(float_type)
    if precision is None:
        raise ValueError('unsupported data type')
    use_double, complex_type, pinv_rcond = precision

    # Compile the kernels for the selected precision:
    ts_module = SourceModule(
        compute_ts_template.substitute(use_double=use_double))
    compute_ts = ts_module.get_function('compute_ts')

    tsh_module = SourceModule(
        compute_tsh_template.substitute(use_double=use_double))
    compute_tsh = tsh_module.get_function('compute_tsh')

    q_module = SourceModule(
        compute_q_template.substitute(use_double=use_double))
    compute_q_ideal = q_module.get_function('compute_q_ideal')
    compute_q_leaky = q_module.get_function('compute_q_leaky')

    G_module = SourceModule(
        compute_G_template.substitute(use_double=use_double, cols=(N-1)),
        options=['-I', install_headers])
    compute_G_ideal = G_module.get_function('compute_G_ideal')
    compute_G_leaky = G_module.get_function('compute_G_leaky')

    u_module = SourceModule(
        compute_u_template.substitute(use_double=use_double),
        options=['-I', install_headers])
    compute_u = u_module.get_function('compute_u')

    # Copy the encoded signal to the device and allocate the
    # intermediary arrays; q and G are complex because they are fed to
    # the linear algebra routines below together with complex data:
    s_gpu = gpuarray.to_gpu(s)
    ts_gpu = gpuarray.empty(N, float_type)
    tsh_gpu = gpuarray.empty(N-1, float_type)
    q_gpu = gpuarray.empty((N-1, 1), complex_type)
    G_gpu = gpuarray.empty((N-1, N-1), complex_type)

    # Launch configurations; the block size is capped below the device
    # maximum to prevent the kernels from using too many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128
    block_dim_s, grid_dim_s = cumisc.select_block_grid_sizes(
        dev, s_gpu.shape, max_threads_per_block)
    block_dim_G, grid_dim_G = cumisc.select_block_grid_sizes(
        dev, G_gpu.shape, max_threads_per_block)
    launch_s = {'block': block_dim_s, 'grid': grid_dim_s}
    launch_G = {'block': block_dim_G, 'grid': grid_dim_G}

    # Construct ts and tsh:
    compute_ts(s_gpu, ts_gpu, np.uint32(N), **launch_s)
    compute_tsh(ts_gpu, tsh_gpu, np.uint32(N-1), **launch_s)

    # Construct q and G with the leaky kernels unless R is infinite:
    if not np.isinf(R):
        compute_q_leaky(s_gpu, q_gpu,
                        float_type(b), float_type(d),
                        float_type(R), float_type(C),
                        np.uint32(N-1), **launch_s)
        compute_G_leaky(ts_gpu, tsh_gpu, G_gpu,
                        float_type(bw), float_type(R), float_type(C),
                        np.uint32((N-1)**2), **launch_G)
    else:
        compute_q_ideal(s_gpu, q_gpu,
                        float_type(b), float_type(d), float_type(C),
                        np.uint32(N-1), **launch_s)
        compute_G_ideal(ts_gpu, tsh_gpu, G_gpu,
                        float_type(bw),
                        np.uint32((N-1)**2), **launch_G)

    # s and ts are no longer needed; drop them so that more memory is
    # available to the pinv computation:
    del s_gpu, ts_gpu

    # Reconstruction coefficients. The pseudoinverse is left as an
    # unnamed temporary so that it does not outlive this statement:
    c_gpu = culinalg.dot(culinalg.pinv(G_gpu, pinv_rcond), q_gpu)

    # G and q are no longer needed either:
    del G_gpu, q_gpu

    # Allocate the zero-initialized output array. It is built on the
    # host and copied over rather than created with gpuarray.zeros
    # because of a bug in gpuarray.zeros in pycuda 2011.1.2:
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Launch configuration for constructing u:
    block_dim_t, grid_dim_t = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct the signal and return its real part:
    compute_u(u_rec_gpu, c_gpu, tsh_gpu,
              float_type(bw), float_type(dt),
              np.uint32(Nt), np.uint32(N-1),
              block=block_dim_t, grid=grid_dim_t)
    return np.real(u_rec_gpu.get())
def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu, R_gpu, C_gpu):
    """
    Multiple-input single-output IAF time decoding machine.

    Decode a signal encoded with an ensemble of Integrate-and-Fire
    neurons assuming that the encoded signal is representable in terms
    of sinc kernels.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders. The nonzero
        values represent the time between spikes (in s). Each row
        holds the output of one encoder, so the number of rows
        corresponds to the number of encoders in the ensemble. The
        dtype must be float32 or float64.
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The ideal (non-leaky) kernels are
        used only when every entry is infinite.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal, copied back to host memory.

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is neither float32 nor float64, if
        `s_gpu` contains no rows, or if the parameter arrays do not
        each contain one entry per row of `s_gpu`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.
    """

    # Sanity checks:
    float_type = s_gpu.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        pinv_rcond = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        pinv_rcond = 1e-8
    else:
        raise ValueError('unsupported data type')

    # Number of spike trains:
    N = s_gpu.shape[0]
    if not N:
        raise ValueError('no spike data given')
    if any(param.size != N
           for param in (ns_gpu, b_gpu, d_gpu, R_gpu, C_gpu)):
        raise ValueError('parameter arrays must be of same length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    # Use a smaller block size than the maximum to prevent the kernels
    # from using too many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128

    # Prepare kernels:
    cache_dir = None
    compute_q_pop_mod = \
        SourceModule(compute_q_pop_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_q_ideal_pop = \
        compute_q_pop_mod.get_function('compute_q_ideal')
    compute_q_leaky_pop = \
        compute_q_pop_mod.get_function('compute_q_leaky')

    compute_ts_pop_mod = \
        SourceModule(compute_ts_pop_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_ts_pop = \
        compute_ts_pop_mod.get_function('compute_ts')

    compute_tsh_pop_mod = \
        SourceModule(compute_tsh_pop_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_tsh_pop = \
        compute_tsh_pop_mod.get_function('compute_tsh')

    compute_G_pop_mod = \
        SourceModule(compute_G_pop_template.substitute(use_double=use_double),
                     options=['-I', install_headers])
    compute_G_ideal_pop = \
        compute_G_pop_mod.get_function('compute_G_ideal')
    compute_G_leaky_pop = \
        compute_G_pop_mod.get_function('compute_G_leaky')

    compute_u_pop_mod = \
        SourceModule(compute_u_pop_template.substitute(use_double=use_double),
                     options=['-I', install_headers])
    compute_u_pop = \
        compute_u_pop_mod.get_function('compute_u')

    # Total number of interspike intervals per neuron less 1 for each
    # spike train with more than 1 interspike interval:
    Nq = int(np.sum(ns)-np.sum(ns > 1))

    # Set up GPUArrays for intermediary data:
    ts_gpu = gpuarray.zeros_like(s_gpu)
    tsh_gpu = gpuarray.zeros_like(s_gpu)

    # Note that these arrays are complex to enable use of CUBLAS
    # matrix multiplication functions:
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    G_gpu = gpuarray.empty((Nq, Nq), complex_type)

    # Get required block/grid sizes:
    block_dim_ts, grid_dim_ts = \
        cumisc.select_block_grid_sizes(dev, N, max_threads_per_block)
    block_dim_q, grid_dim_q = \
        cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                       max_threads_per_block)
    block_dim_G, grid_dim_G = \
        cumisc.select_block_grid_sizes(dev, G_gpu.shape,
                                       max_threads_per_block)

    # Launch kernels. compute_tsh is handed N as its bound, exactly like
    # compute_ts, so it is launched with the configuration sized for N
    # rather than the one sized for q (whose length Nq is unrelated to
    # the number of spike trains):
    compute_ts_pop(s_gpu, ns_gpu, ts_gpu,
                   np.uint32(s_gpu.shape[1]), np.uint32(N),
                   block=block_dim_ts, grid=grid_dim_ts)
    compute_tsh_pop(ts_gpu, ns_gpu, tsh_gpu,
                    np.uint32(s_gpu.shape[1]), np.uint32(N),
                    block=block_dim_ts, grid=grid_dim_ts)
    if np.all(np.isinf(R_gpu.get())):
        compute_q_ideal_pop(s_gpu, q_gpu, b_gpu, d_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q, grid=grid_dim_q)
        compute_G_ideal_pop(ts_gpu, tsh_gpu, G_gpu, float_type(bw),
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(Nq),
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(G_gpu.size),
                            block=block_dim_G, grid=grid_dim_G)
    else:
        compute_q_leaky_pop(s_gpu, q_gpu, b_gpu, d_gpu, R_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q, grid=grid_dim_q)
        compute_G_leaky_pop(ts_gpu, tsh_gpu, G_gpu, float_type(bw),
                            R_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(Nq),
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(G_gpu.size),
                            block=block_dim_G, grid=grid_dim_G)

    # Free intermediates that are not used past this point (neither
    # index map is passed to the reconstruction kernel) to provide more
    # memory to the pinv computation:
    del ts_gpu, idx_to_ni_gpu, idx_to_k_gpu

    # Compute the reconstruction coefficients:
    c_gpu = culinalg.dot(culinalg.pinv(G_gpu, pinv_rcond), q_gpu)

    # Free G, G_inv, and q:
    del G_gpu, q_gpu

    # Allocate the zero-initialized output array. It is built on the
    # host and copied over rather than created with gpuarray.zeros
    # because of a bug in gpuarray.zeros:
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Get required block/grid sizes for constructing u:
    block_dim_t, grid_dim_t = \
        cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u_pop(u_rec_gpu, c_gpu, tsh_gpu, ns_gpu,
                  float_type(bw), float_type(dt),
                  np.uint32(s_gpu.shape[1]),
                  np.uint32(N),
                  np.uint32(Nt),
                  block=block_dim_t, grid=grid_dim_t)

    # Copy the result back to the host; only the real part is returned:
    u_rec = u_rec_gpu.get()
    return np.real(u_rec)