Python get_current_device 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: scikits.cuda.misc

메소드/함수: get_current_device

hotexamples.com에서의 예제들: 12

Python get_current_device - 12개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python scikits.cuda.misc.get_current_device의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: iaf_cuda.py 프로젝트: era1erushe/ted.python

def iaf_encode_pop(u_gpu,
                   dt,
                   b_gpu,
                   d_gpu,
                   R_gpu,
                   C_gpu,
                   y_gpu=None,
                   interval_gpu=None,
                   quad_method='trapz',
                   full_output=False):
    """
    Population IAF time encoding machine.

    Encode a finite length signal with a population of Integrate-and-Fire
    neurons.

    Parameters
    ----------
    u_gpu : pycuda.gpuarray.GPUArray
        Signal to encode. Its dtype (float32 or float64) selects the
        precision used for the kernel and for all arrays allocated here.
    dt : float
        Sampling resolution of input signal; the sampling frequency is
        1/dt Hz.
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    y_gpu : pycuda.gpuarray.GPUArray, optional
        Initial values of integrators. A zero-filled array is allocated
        when omitted.
    interval_gpu : pycuda.gpuarray.GPUArray, optional
        Times since last spike (in s) for each neuron. A zero-filled
        array is allocated when omitted.
    quad_method : {'rect', 'trapz'}
        Quadrature method to use (rectangular or trapezoidal) when the
        neuron is ideal; exponential Euler integration is used
        when the neuron is leaky. Any value other than 'trapz' is passed
        to the kernel as 0.
    full_output : bool
        If true, the function returns the updated arrays `y_gpu` and
        `interval_gpu` in addition to the encoded data block.

    Returns
    -------
    [s_gpu, ns_gpu] : list of pycuda.gpuarray.GPUArray
        If `full_output` is false, returns the encoded signal as a
        matrix `s_gpu` whose rows contain the spike times generated by each
        neuron. The number of spike times in each row is returned in
        `ns_gpu`; all other values in `s_gpu` are set to 0.
    [s_gpu, ns_gpu, y_gpu, interval_gpu] : list of pycuda.gpuarray.GPUArray
        If `full_output` is true, returns the encoded signal
        followed by updated encoder parameters.

    Raises
    ------
    ValueError
        If the dtype of `u_gpu` is neither float32 nor float64, or if the
        parameter arrays do not all have the same number of entries.

    """

    float_type = u_gpu.dtype.type
    if float_type == np.float32:
        use_double = 0
    elif float_type == np.float64:
        use_double = 1
    else:
        raise ValueError('unsupported data type')

    # Get the length of the signal:
    Nu = u_gpu.size

    # The number of encoders is defined by the bias array; every other
    # per-neuron array has to agree with it:
    N = b_gpu.size
    if (d_gpu.size != N) or (R_gpu.size != N) or (C_gpu.size != N):
        raise ValueError('parameter arrays must be of same length')

    # The optional arrays must be tested by identity: `!= None` on an
    # array object is an elementwise comparison, whose truth value is not
    # a reliable "was an argument supplied" test.
    if ((y_gpu is not None) and (y_gpu.size != N)) or \
       ((interval_gpu is not None) and (interval_gpu.size != N)):
        raise ValueError('parameter arrays must be of same length')

    dev = cumisc.get_current_device()

    # Use a smaller block size than the maximum to prevent the kernels
    # from using too many registers:
    max_threads_per_block = 256

    # Get required block/grid sizes for running N encoders to process
    # the N signals:
    block_dim, grid_dim = cumisc.select_block_grid_sizes(
        dev, N, max_threads_per_block)

    # Configure kernel:
    cache_dir = None
    iaf_encode_pop_mod = SourceModule(
        iaf_encode_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    iaf_encode_pop = iaf_encode_pop_mod.get_function("iaf_encode_pop")

    # Initialize integrator variables if necessary:
    if y_gpu is None:
        y_gpu = gpuarray.zeros(N, float_type)
    if interval_gpu is None:
        interval_gpu = gpuarray.zeros(N, float_type)

    # XXX: A very long s array might cause memory problems:
    s_gpu = gpuarray.zeros((N, Nu), float_type)
    ns_gpu = gpuarray.zeros(N, np.uint32)
    iaf_encode_pop(u_gpu,
                   s_gpu,
                   ns_gpu,
                   float_type(dt),
                   b_gpu,
                   d_gpu,
                   R_gpu,
                   C_gpu,
                   y_gpu,
                   interval_gpu,
                   np.uint32(quad_method == 'trapz'),
                   np.uint32(Nu),
                   np.uint32(N),
                   block=block_dim,
                   grid=grid_dim)

    if full_output:
        return [s_gpu, ns_gpu, y_gpu, interval_gpu]
    return [s_gpu, ns_gpu]

예제 #2

파일 보기

파일: iaf_cuda.py 프로젝트: era1erushe/ted.python

def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu, R_gpu, C_gpu):
    """
    Multiple-input single-output IAF time decoding machine.

    Reconstruct a signal that was encoded by an ensemble of
    Integrate-and-Fire neurons, under the assumption that the signal can
    be represented in terms of sinc kernels.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders, one row per encoder.
        The nonzero values represent the time between spikes (in s).
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The ideal-neuron kernels are used
        only when every entry is infinite.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.

    Returns
    -------
    u_rec : ndarray
        Real part of the recovered signal, copied back from the device.

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is neither float32 nor float64, if
        `s_gpu` has no rows, or if the parameter arrays do not each hold
        one entry per row of `s_gpu`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.
    """

    # Precision-dependent settings follow the dtype of the spike data:
    float_type = s_gpu.dtype.type
    if float_type not in (np.float32, np.float64):
        raise ValueError('unsupported data type')
    double_precision = float_type == np.float64
    use_double = 1 if double_precision else 0
    complex_type = np.complex128 if double_precision else np.complex64
    rcond = 1e-8 if double_precision else 1e-4

    # One spike train per row of s_gpu:
    N = s_gpu.shape[0]
    if not N:
        raise ValueError('no spike data given')
    per_neuron = (ns_gpu, b_gpu, d_gpu, R_gpu, C_gpu)
    if any(arr.size != N for arr in per_neuron):
        raise ValueError('parameter arrays must be of same length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    # A block size below the maximum keeps the kernels from using too
    # many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128

    # Compile the kernels for the selected precision:
    cache_dir = None
    q_mod = SourceModule(
        compute_q_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    ts_mod = SourceModule(
        compute_ts_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    ts_kernel = ts_mod.get_function('compute_ts')

    tsh_mod = SourceModule(
        compute_tsh_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    tsh_kernel = tsh_mod.get_function('compute_tsh')

    G_mod = SourceModule(
        compute_G_pop_template.substitute(use_double=use_double),
        options=['-I', install_headers])
    G_ideal_kernel = G_mod.get_function('compute_G_ideal')
    G_leaky_kernel = G_mod.get_function('compute_G_leaky')

    u_mod = SourceModule(
        compute_u_pop_template.substitute(use_double=use_double),
        options=['-I', install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Total number of interspike intervals per neuron less 1 for each
    # spike train with more than 1 interspike interval:
    Nq = int(np.sum(ns) - np.sum(ns > 1))

    # Scratch arrays shaped like the spike data:
    ts_gpu = gpuarray.zeros_like(s_gpu)
    tsh_gpu = gpuarray.zeros_like(s_gpu)

    # q and G are complex so that the CUBLAS matrix multiplication
    # functions can be applied to them:
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    G_gpu = gpuarray.empty((Nq, Nq), complex_type)

    # Launch configurations for each stage:
    ts_block, ts_grid = cumisc.select_block_grid_sizes(
        dev, N, max_threads_per_block)
    q_block, q_grid = cumisc.select_block_grid_sizes(
        dev, q_gpu.shape, max_threads_per_block)
    G_block, G_grid = cumisc.select_block_grid_sizes(
        dev, G_gpu.shape, max_threads_per_block)

    n_cols = np.uint32(s_gpu.shape[1])
    n_rows = np.uint32(N)

    # Launch kernels:
    ts_kernel(s_gpu, ns_gpu, ts_gpu, n_cols, n_rows,
              block=ts_block, grid=ts_grid)
    # NOTE(review): compute_tsh is launched with the q configuration
    # rather than the ts one -- confirm against the kernel source.
    tsh_kernel(ts_gpu, ns_gpu, tsh_gpu, n_cols, n_rows,
               block=q_block, grid=q_grid)
    if np.all(np.isinf(R_gpu.get())):
        q_ideal_kernel(s_gpu, q_gpu,
                       b_gpu, d_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=q_block, grid=q_grid)
        G_ideal_kernel(ts_gpu, tsh_gpu, G_gpu,
                       float_type(bw),
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.uint32(Nq), n_cols, np.uint32(G_gpu.size),
                       block=G_block, grid=G_grid)
    else:
        q_leaky_kernel(s_gpu, q_gpu,
                       b_gpu, d_gpu, R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=q_block, grid=q_grid)
        G_leaky_kernel(ts_gpu, tsh_gpu, G_gpu,
                       float_type(bw), R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.uint32(Nq), n_cols, np.uint32(G_gpu.size),
                       block=G_block, grid=G_grid)

    # Drop references that are no longer needed:
    del ts_gpu, idx_to_k_gpu

    # Reconstruction coefficients c = pinv(G) q:
    G_inv_gpu = culinalg.pinv(G_gpu, rcond)
    c_gpu = culinalg.dot(G_inv_gpu, q_gpu)

    # Drop G, G_inv, and q:
    del G_gpu, G_inv_gpu, q_gpu

    # Number of samples in the reconstructed signal:
    Nt = int(np.ceil(dur / dt))

    # The zero-filled output is built on the host and uploaded instead of
    # calling gpuarray.zeros(Nt, complex_type), as a workaround for a bug
    # in gpuarray.zeros:
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Launch configuration for constructing u:
    t_block, t_grid = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    u_kernel(u_rec_gpu, c_gpu, tsh_gpu, ns_gpu,
             float_type(bw), float_type(dt),
             n_cols, n_rows, np.uint32(Nt),
             block=t_block, grid=t_grid)

    return np.real(u_rec_gpu.get())

예제 #3

파일 보기

파일: iaf_cuda.py 프로젝트: era1erushe/ted.python

def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0):
    """
    IAF time decoding machine.

    Reconstruct a finite length signal that was encoded with a single
    Integrate-and-Fire neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in s).
        Its dtype (float32 or float64) selects the precision of the
        computation.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance. An infinite value selects the ideal-neuron
        kernels; any other value selects the leaky ones.
    C : float
        Neuron capacitance.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal.

    Raises
    ------
    ValueError
        If the dtype of `s` is neither float32 nor float64.
    """

    N = len(s)

    # Precision-dependent settings follow the dtype of the spike data:
    float_type = s.dtype.type
    if float_type not in (np.float32, np.float64):
        raise ValueError('unsupported data type')
    double_precision = float_type == np.float64
    use_double = 1 if double_precision else 0
    complex_type = np.complex128 if double_precision else np.complex64
    rcond = 1e-8 if double_precision else 1e-4

    # Most intermediate quantities have one entry fewer than `s`:
    n_q = N - 1

    # Compile the kernels for the selected precision:
    ts_mod = SourceModule(
        compute_ts_template.substitute(use_double=use_double))
    ts_kernel = ts_mod.get_function('compute_ts')

    tsh_mod = SourceModule(
        compute_tsh_template.substitute(use_double=use_double))
    tsh_kernel = tsh_mod.get_function('compute_tsh')

    q_mod = SourceModule(
        compute_q_template.substitute(use_double=use_double))
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    G_mod = SourceModule(
        compute_G_template.substitute(use_double=use_double, cols=n_q),
        options=['-I', install_headers])
    G_ideal_kernel = G_mod.get_function('compute_G_ideal')
    G_leaky_kernel = G_mod.get_function('compute_G_leaky')

    u_mod = SourceModule(
        compute_u_template.substitute(use_double=use_double),
        options=["-I", install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Load data into device memory:
    s_gpu = gpuarray.to_gpu(s)

    # Device arrays for intermediary data:
    ts_gpu = gpuarray.empty(N, float_type)
    tsh_gpu = gpuarray.empty(n_q, float_type)
    q_gpu = gpuarray.empty((n_q, 1), complex_type)
    G_gpu = gpuarray.empty((n_q, n_q), complex_type)

    # A block size below the maximum keeps the kernels from using too
    # many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128

    # Launch configuration shared by the ts, tsh, and q kernels:
    s_block, s_grid = cumisc.select_block_grid_sizes(
        dev, s_gpu.shape, max_threads_per_block)

    # Launch configuration for constructing G:
    G_block, G_grid = cumisc.select_block_grid_sizes(
        dev, G_gpu.shape, max_threads_per_block)

    # Run the kernels:
    ts_kernel(s_gpu, ts_gpu, np.uint32(N), block=s_block, grid=s_grid)
    tsh_kernel(ts_gpu, tsh_gpu, np.uint32(n_q),
               block=s_block, grid=s_grid)
    if np.isinf(R):
        q_ideal_kernel(s_gpu, q_gpu,
                       float_type(b), float_type(d), float_type(C),
                       np.uint32(n_q),
                       block=s_block, grid=s_grid)
        G_ideal_kernel(ts_gpu, tsh_gpu, G_gpu,
                       float_type(bw),
                       np.uint32(n_q**2),
                       block=G_block, grid=G_grid)
    else:
        q_leaky_kernel(s_gpu, q_gpu,
                       float_type(b), float_type(d),
                       float_type(R), float_type(C),
                       np.uint32(n_q),
                       block=s_block, grid=s_grid)
        G_leaky_kernel(ts_gpu, tsh_gpu, G_gpu,
                       float_type(bw), float_type(R), float_type(C),
                       np.uint32(n_q**2),
                       block=G_block, grid=G_grid)

    # Drop s and ts to leave more memory for the pinv computation:
    del s_gpu, ts_gpu

    # Reconstruction coefficients c = pinv(G) q:
    G_inv_gpu = culinalg.pinv(G_gpu, rcond)
    c_gpu = culinalg.dot(G_inv_gpu, q_gpu)

    # Drop G, G_inv and q:
    del G_gpu, G_inv_gpu, q_gpu

    # Number of samples in the reconstructed signal:
    Nt = int(np.ceil(dur / dt))

    # The zero-filled output is built on the host and uploaded instead of
    # calling gpuarray.zeros(Nt, complex_type), as a workaround for a bug
    # in gpuarray.zeros in pycuda 2011.1.2:
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Launch configuration for constructing u:
    t_block, t_grid = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    u_kernel(u_rec_gpu, c_gpu, tsh_gpu,
             float_type(bw), float_type(dt),
             np.uint32(Nt), np.uint32(n_q),
             block=t_block, grid=t_grid)

    return np.real(u_rec_gpu.get())

예제 #4

파일 보기

파일: iaf_trig_cuda.py 프로젝트: bionet/ted.python

def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu, R_gpu,
                   C_gpu, M=5, smoothing=0.0):
    """
    Population IAF time decoding machine.

    Reconstruct a signal that was encoded by an ensemble of
    Integrate-and-Fire neurons, under the assumption that the signal can
    be represented in terms of trigonometric polynomials.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders, one row per encoder.
        The nonzero values represent the time between spikes (in s).
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The ideal-neuron kernels are used
        only when every entry is infinite.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter. When nonzero, the identity matrix scaled by
        `smoothing` times the total interval count is added to F^H F
        before the pseudoinverse is taken.

    Returns
    -------
    u_rec : ndarray
        Real part of the recovered signal, copied back from the device.

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is neither float32 nor float64, if
        `s_gpu` has no rows, if the parameter arrays do not each hold one
        entry per row of `s_gpu`, or if 2*pi*M/bw is less than `dur`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.

    """

    # Precision-dependent settings follow the dtype of the spike data:
    float_type = s_gpu.dtype.type
    if float_type not in (np.float32, np.float64):
        raise ValueError('unsupported data type')
    double_precision = float_type == np.float64
    use_double = 1 if double_precision else 0
    complex_type = np.complex128 if double_precision else np.complex64
    rcond = 1e-8 if double_precision else 1e-4

    # One spike train per row of s_gpu:
    N = s_gpu.shape[0]
    if not N:
        raise ValueError('no spike data given')
    per_neuron = (ns_gpu, b_gpu, d_gpu, R_gpu, C_gpu)
    if any(arr.size != N for arr in per_neuron):
        raise ValueError('parameter arrays must be of same length')

    # 2*pi*M/bw has to cover the signal duration:
    if 2*np.pi*M/bw < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    dev = cumisc.get_current_device()

    # A block size below the maximum keeps the kernels from using too
    # many registers:
    max_threads_per_block = 256

    # Compile the kernels for the selected precision:
    cache_dir = None
    ts_mod = SourceModule(
        compute_ts_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    ts_kernel = ts_mod.get_function('compute_ts')

    q_mod = SourceModule(
        compute_q_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    F_mod = SourceModule(
        compute_F_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir,
        options=['-I', install_headers])
    F_ideal_kernel = F_mod.get_function('compute_F_ideal')
    F_leaky_kernel = F_mod.get_function('compute_F_leaky')

    u_mod = SourceModule(
        compute_u_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir,
        options=['-I', install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Total number of interspike intervals per neuron less 1 for each
    # spike train with more than 1 interspike interval:
    Nq = int(np.sum(ns) - np.sum(ns > 1))

    # Number of trigonometric coefficients:
    n_coeffs = 2*M + 1

    # Scratch array shaped like the spike data:
    ts_gpu = gpuarray.zeros_like(s_gpu)

    # q and F are complex so that the CUBLAS matrix multiplication
    # functions can be applied to them:
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    F_gpu = gpuarray.empty((Nq, n_coeffs), complex_type)

    # Launch configurations for each stage:
    ts_block, ts_grid = cumisc.select_block_grid_sizes(
        dev, N, max_threads_per_block)
    q_block, q_grid = cumisc.select_block_grid_sizes(
        dev, q_gpu.shape, max_threads_per_block)
    F_block, F_grid = cumisc.select_block_grid_sizes(
        dev, F_gpu.shape, max_threads_per_block)

    n_cols = np.uint32(s_gpu.shape[1])

    # Launch kernels:
    ts_kernel(s_gpu, ns_gpu, ts_gpu, n_cols, np.uint32(N),
              block=ts_block, grid=ts_grid)
    if np.all(np.isinf(R_gpu.get())):
        q_ideal_kernel(s_gpu, q_gpu,
                       b_gpu, d_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=q_block, grid=q_grid)
        F_ideal_kernel(s_gpu, ts_gpu, F_gpu,
                       float_type(bw),
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.int32(M), n_cols, np.uint32(F_gpu.size),
                       block=F_block, grid=F_grid)
    else:
        q_leaky_kernel(s_gpu, q_gpu,
                       b_gpu, d_gpu, R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       n_cols, np.uint32(Nq),
                       block=q_block, grid=q_grid)
        F_leaky_kernel(s_gpu, ts_gpu, F_gpu,
                       float_type(bw), R_gpu, C_gpu,
                       idx_to_ni_gpu, idx_to_k_gpu,
                       np.int32(M), n_cols, np.uint32(F_gpu.size),
                       block=F_block, grid=F_grid)

    # Drop references that are no longer needed:
    del s_gpu, ts_gpu, idx_to_ni_gpu, idx_to_k_gpu

    # Form F^H q first so that both F^H and q can be dropped from
    # memory before the pseudoinverse is computed:
    FH_gpu = culinalg.hermitian(F_gpu)
    FHq_gpu = culinalg.dot(FH_gpu, q_gpu)
    del FH_gpu, q_gpu

    # Normal matrix F^H F, regularized when smoothing is requested:
    FHF_gpu = culinalg.dot(F_gpu, F_gpu, 'c')
    if smoothing != 0:
        FHF_gpu = FHF_gpu + \
            np.sum(ns)*smoothing*culinalg.eye(n_coeffs, float_type)
    c_gpu = culinalg.dot(culinalg.pinv(FHF_gpu, rcond), FHq_gpu)

    # Number of samples in the reconstructed signal:
    Nt = int(np.ceil(dur/dt))

    # The zero-filled output is built on the host and uploaded instead of
    # calling gpuarray.zeros(Nt, complex_type), as a workaround for a bug
    # in gpuarray.zeros in pycuda 2011.1.2:
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Launch configuration for constructing u:
    t_block, t_grid = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    u_kernel(u_rec_gpu, c_gpu,
             float_type(bw), float_type(dt),
             np.int32(M), np.uint32(Nt),
             block=t_block, grid=t_grid)

    return np.real(u_rec_gpu.get())

예제 #5

파일 보기

파일: iaf_cuda.py 프로젝트: era1erushe/ted.python

def iaf_encode(u,
               dt,
               b,
               d,
               R=np.inf,
               C=1.0,
               dte=0.0,
               y=0.0,
               interval=0.0,
               quad_method='trapz',
               full_output=False):
    """
    IAF time encoding machine.

    Encode a finite length signal with an Integrate-and-Fire neuron.

    Parameters
    ----------
    u : array_like of floats
        Signal to encode. Must expose a float32 or float64 `dtype`, which
        selects the precision of the kernel.
    dt : float
        Sampling resolution of input signal; the sampling frequency
        is 1/dt Hz.
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance.
    C : float
        Neuron capacitance.
    dte : float
        Sampling resolution assumed by the encoder (s).
        This may not exceed `dt`. When it is nonzero and differs from
        `dt`, the signal is resampled by a factor of int(dt/dte) and
        `dte` replaces `dt` for the rest of the computation.
    y : float
        Initial value of integrator.
    interval : float
        Time since last spike (in s).
    quad_method : {'rect', 'trapz'}
        Quadrature method to use (rectangular or trapezoidal) when the
        neuron is ideal; exponential Euler integration is used
        when the neuron is leaky. Any value other than 'trapz' is passed
        to the kernel as 0.
    full_output : bool
        If set, the function returns the encoded data block followed
        by the given parameters (with updated values for `y` and `interval`).
        This is useful when the function is called repeatedly to
        encode a long signal.

    Returns
    -------
    s : ndarray of floats
        If `full_output` is false, returns the signal encoded as an
        array of interspike intervals.
    (s, dt, b, d, R, C, dte, y, interval, quad_method, full_output) : tuple
        If `full_output` is true, returns the encoded signal
        followed by updated encoder parameters, in the same order as the
        function's own arguments so the tail can be passed straight back
        in on the next call.

    Raises
    ------
    ValueError
        If the dtype of `u` is neither float32 nor float64, if `dte`
        exceeds `dt`, or if `dte` is negative.

    Notes
    -----
    When trapezoidal integration is used, the value of the integral
    will not be computed for the very last entry in `u`.

    """

    # Input sanity check:
    float_type = u.dtype.type
    if float_type == np.float32:
        use_double = 0
    elif float_type == np.float64:
        use_double = 1
    else:
        raise ValueError('unsupported data type')

    # Handle empty input without touching the GPU; the parameters are
    # handed back unchanged:
    Nu = len(u)
    if Nu == 0:
        empty = np.array((), float)
        if full_output:
            return (empty, dt, b, d, R, C, dte, y, interval,
                    quad_method, full_output)
        return empty

    # Check whether the encoding resolution is finer than that of the
    # original sampled signal:
    if dte > dt:
        raise ValueError(
            'encoding time resolution must not exceeed original signal resolution'
        )
    if dte < 0:
        raise ValueError('encoding time resolution must be nonnegative')
    if dte != 0 and dte != dt:

        # Resample signal and adjust signal length accordingly:
        M = int(dt / dte)
        u = resample(u, len(u) * M)
        Nu *= M
        dt = dte

    # Configure kernel:
    iaf_encode_mod = SourceModule(
        iaf_encode_template.substitute(use_double=use_double))
    iaf_encode = iaf_encode_mod.get_function("iaf_encode")

    # XXX: A very long s array might cause memory problems:
    s = np.zeros(Nu, float_type)

    # One-element host arrays let the kernel hand back the spike count
    # and the updated integrator state through drv.InOut:
    i_s_0 = np.zeros(1, np.uint32)
    y_0 = np.asarray([y], float_type)
    interval_0 = np.asarray([interval], float_type)
    iaf_encode(drv.In(u),
               drv.Out(s),
               drv.InOut(i_s_0),
               float_type(dt),
               float_type(b),
               float_type(d),
               float_type(R),
               float_type(C),
               drv.InOut(y_0),
               drv.InOut(interval_0),
               np.uint32(quad_method == 'trapz'),
               np.uint32(Nu),
               block=(1, 1, 1))

    # Only the first i_s_0[0] entries of s are returned:
    encoded = s[0:i_s_0[0]]
    if full_output:
        # `dte` is included so the tuple lines up with the documented
        # layout and with the empty-input return above.
        return (encoded, dt, b, d, R, C, dte, y_0[0], interval_0[0],
                quad_method, full_output)
    return encoded

예제 #6

파일 보기

def iaf_decode_pop(s_gpu,
                   ns_gpu,
                   dur,
                   dt,
                   bw,
                   b_gpu,
                   d_gpu,
                   R_gpu,
                   C_gpu,
                   M=5,
                   smoothing=0.0):
    """
    Population IAF time decoding machine (trigonometric polynomials).

    Reconstruct a signal that was encoded by an ensemble of
    Integrate-and-Fire neurons, assuming that the signal can be
    represented in terms of trigonometric polynomials.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Encoded signal with one row per encoder. The nonzero values
        represent the time between spikes (in s).
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The ideal kernels are used only
        when every entry is infinite.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter; a value of 0 skips the smoothing term.

    Returns
    -------
    u_rec : ndarray
        Real part of the recovered signal, returned as a host array.

    Raises
    ------
    ValueError
        If the data type of `s_gpu` is neither float32 nor float64, if
        `s_gpu` has no rows, if the parameter arrays do not all have
        one entry per row of `s_gpu`, or if 2*pi*M/bw is smaller than
        `dur`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.

    """

    # The element type of the spike data selects the kernel precision,
    # the complex type of the intermediate arrays, and the cutoff that
    # is handed to the pseudoinverse:
    float_type = s_gpu.dtype.type
    precision_table = {
        np.float32: (0, np.complex64, 1e-4),
        np.float64: (1, np.complex128, 1e-8),
    }
    if float_type not in precision_table:
        raise ValueError('unsupported data type')
    use_double, complex_type, pinv_rcond = precision_table[float_type]

    # Every per-neuron array needs one entry per row of s_gpu:
    N = s_gpu.shape[0]
    if N == 0:
        raise ValueError('no spike data given')
    if any(arr.size != N
           for arr in (ns_gpu, b_gpu, d_gpu, R_gpu, C_gpu)):
        raise ValueError('parameter arrays must be of same length')

    period = 2 * np.pi * M / bw
    if dur > period:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # Lookup tables that take a flat CUDA index to the neuron index and
    # to the interspike interval index:
    ns = ns_gpu.get()
    neuron_idx, interval_idx = _compute_idx_map(ns)
    neuron_idx_gpu = gpuarray.to_gpu(neuron_idx)
    interval_idx_gpu = gpuarray.to_gpu(interval_idx)

    dev = cumisc.get_current_device()

    # A block size below the maximum keeps the kernels from using too
    # many registers:
    max_threads_per_block = 256

    # Compile the kernels for the selected precision:
    cache_dir = None
    ts_mod = SourceModule(
        compute_ts_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    ts_kernel = ts_mod.get_function('compute_ts')

    q_mod = SourceModule(
        compute_q_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    F_mod = SourceModule(
        compute_F_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir,
        options=['-I', install_headers])
    F_ideal_kernel = F_mod.get_function('compute_F_ideal')
    F_leaky_kernel = F_mod.get_function('compute_F_leaky')

    u_mod = SourceModule(
        compute_u_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir,
        options=['-I', install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Total number of interspike intervals, less one for every spike
    # train that contains more than one interval:
    Nq = int(np.sum(ns) - np.sum(ns > 1))
    n_coeffs = 2 * M + 1

    # Intermediate device arrays; q and F are complex so that the
    # CUBLAS matrix multiplication functions can be used on them:
    ts_gpu = gpuarray.zeros_like(s_gpu)
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    F_gpu = gpuarray.empty((Nq, n_coeffs), complex_type)

    # Launch geometry for each kernel:
    ts_block, ts_grid = cumisc.select_block_grid_sizes(
        dev, N, max_threads_per_block)
    q_block, q_grid = cumisc.select_block_grid_sizes(
        dev, q_gpu.shape, max_threads_per_block)
    F_block, F_grid = cumisc.select_block_grid_sizes(
        dev, F_gpu.shape, max_threads_per_block)

    # Run the kernels:
    row_len = np.uint32(s_gpu.shape[1])
    ts_kernel(s_gpu, ns_gpu, ts_gpu, row_len, np.uint32(N),
              block=ts_block, grid=ts_grid)
    if np.all(np.isinf(R_gpu.get())):
        q_ideal_kernel(s_gpu, q_gpu, b_gpu, d_gpu, C_gpu,
                       neuron_idx_gpu, interval_idx_gpu,
                       row_len, np.uint32(Nq),
                       block=q_block, grid=q_grid)
        F_ideal_kernel(s_gpu, ts_gpu, F_gpu, float_type(bw),
                       neuron_idx_gpu, interval_idx_gpu,
                       np.int32(M), row_len, np.uint32(F_gpu.size),
                       block=F_block, grid=F_grid)
    else:
        q_leaky_kernel(s_gpu, q_gpu, b_gpu, d_gpu, R_gpu, C_gpu,
                       neuron_idx_gpu, interval_idx_gpu,
                       row_len, np.uint32(Nq),
                       block=q_block, grid=q_grid)
        F_leaky_kernel(s_gpu, ts_gpu, F_gpu, float_type(bw),
                       R_gpu, C_gpu,
                       neuron_idx_gpu, interval_idx_gpu,
                       np.int32(M), row_len, np.uint32(F_gpu.size),
                       block=F_block, grid=F_grid)

    # Drop the local references that are no longer needed:
    del s_gpu, ts_gpu, neuron_idx_gpu, interval_idx_gpu

    # Form F^H q first so that both F^H and q can be released:
    FH_gpu = culinalg.hermitian(F_gpu)
    FHq_gpu = culinalg.dot(FH_gpu, q_gpu)
    del FH_gpu, q_gpu

    # Solve for the coefficients through the pseudoinverse of F^H F,
    # adding the scaled identity only when smoothing is requested:
    gram_gpu = culinalg.dot(F_gpu, F_gpu, 'c')
    if smoothing != 0:
        gram_gpu = gram_gpu + \
            np.sum(ns) * smoothing * culinalg.eye(n_coeffs, float_type)
    c_gpu = culinalg.dot(culinalg.pinv(gram_gpu, pinv_rcond), FHq_gpu)

    # Buffer for the reconstructed signal. The zeros are created on
    # the host and uploaded; per the original author's note this works
    # around a bug in gpuarray.zeros in pycuda 2011.1.2 and can become
    # gpuarray.zeros(Nt, complex_type) once that bug is fixed.
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    t_block, t_grid = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct the signal:
    u_kernel(u_rec_gpu, c_gpu, float_type(bw), float_type(dt),
             np.int32(M), np.uint32(Nt),
             block=t_block, grid=t_grid)

    return u_rec_gpu.get().real

예제 #7

파일 보기

파일: iaf_trig_cuda.py 프로젝트: bionet/ted.python

def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0, M=5, smoothing=0.0):
    """
    IAF time decoding machine (trigonometric polynomials).

    Reconstruct a finite length signal from the interspike intervals
    produced by a single Integrate-and-Fire neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in s).
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance; an infinite value selects the ideal kernels.
    C : float
        Neuron capacitance.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter; a value of 0 skips the smoothing term.

    Returns
    -------
    u_rec : ndarray of floats
        Real part of the recovered signal.

    Raises
    ------
    ValueError
        If the data type of `s` is neither float32 nor float64, or if
        2*pi*M/bw is smaller than `dur`.

    """

    N = len(s)

    # The element type of s selects the kernel precision, the complex
    # type of the intermediate arrays, and the pseudoinverse cutoff:
    float_type = s.dtype.type
    precision_table = {
        np.float32: (0, np.complex64, 1e-4),
        np.float64: (1, np.complex128, 1e-8),
    }
    if float_type not in precision_table:
        raise ValueError('unsupported data type')
    use_double, complex_type, pinv_rcond = precision_table[float_type]

    period = 2*np.pi*M/bw
    if dur > period:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    dev = cumisc.get_current_device()

    # Compile the kernels for the selected precision:
    cache_dir = None
    q_mod = SourceModule(
        compute_q_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    F_mod = SourceModule(
        compute_F_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    F_ideal_kernel = F_mod.get_function('compute_F_ideal')
    F_leaky_kernel = F_mod.get_function('compute_F_leaky')

    u_mod = SourceModule(
        compute_u_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    u_kernel = u_mod.get_function('compute_u')

    # Upload the intervals and their running sum (computed on the
    # host for now):
    s_gpu = gpuarray.to_gpu(s)
    ts_gpu = gpuarray.to_gpu(np.cumsum(s))

    # Intermediate arrays are complex so that the CUBLAS matrix
    # multiplication functions can be used on them:
    n_rows = N - 1
    n_coeffs = 2*M + 1
    q_gpu = gpuarray.empty((n_rows, 1), complex_type)
    F_gpu = gpuarray.empty((n_rows, n_coeffs), complex_type)

    # A block size below the maximum keeps the kernels from using too
    # many registers:
    max_threads_per_block = 256
    q_block, q_grid = cumisc.select_block_grid_sizes(
        dev, q_gpu.shape, max_threads_per_block)
    F_block, F_grid = cumisc.select_block_grid_sizes(
        dev, F_gpu.shape, max_threads_per_block)

    if np.isinf(R):
        q_ideal_kernel(s_gpu, q_gpu,
                       float_type(b), float_type(d), float_type(C),
                       np.uint32(n_rows),
                       block=q_block, grid=q_grid)
        F_ideal_kernel(s_gpu, ts_gpu, F_gpu,
                       float_type(bw), np.int32(M),
                       np.uint32(n_rows*n_coeffs),
                       block=F_block, grid=F_grid)
    else:
        q_leaky_kernel(s_gpu, q_gpu,
                       float_type(b), float_type(d),
                       float_type(R), float_type(C),
                       np.uint32(n_rows),
                       block=q_block, grid=q_grid)
        F_leaky_kernel(s_gpu, ts_gpu, F_gpu,
                       float_type(bw), float_type(R), float_type(C),
                       np.int32(M),
                       np.uint32(n_rows*n_coeffs),
                       block=F_block, grid=F_grid)

    # Form F^H q first so that q can be released:
    FHq_gpu = culinalg.dot(F_gpu, q_gpu, 'c')
    del q_gpu

    # Solve for the coefficients through the pseudoinverse of F^H F,
    # adding the scaled identity only when smoothing is requested:
    gram_gpu = culinalg.dot(F_gpu, F_gpu, 'c')
    if smoothing != 0:
        gram_gpu = gram_gpu + \
            n_rows*smoothing*culinalg.eye(n_coeffs, float_type)
    c_gpu = culinalg.dot(culinalg.pinv(gram_gpu, pinv_rcond), FHq_gpu)

    # Buffer for the reconstructed signal. The zeros are created on
    # the host and uploaded; per the original author's note this works
    # around a bug in gpuarray.zeros in pycuda 2011.1.2 and can become
    # gpuarray.zeros(Nt, complex_type) once that bug is fixed.
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    t_block, t_grid = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct the signal:
    u_kernel(u_rec_gpu, c_gpu,
             float_type(bw), float_type(dt),
             np.int32(M), np.uint32(Nt),
             block=t_block, grid=t_grid)

    return u_rec_gpu.get().real

예제 #8

파일 보기

def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0, M=5, smoothing=0.0):
    """
    IAF time decoding machine (trigonometric polynomials).

    Reconstruct a finite length signal from the interspike intervals
    produced by a single Integrate-and-Fire neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in s).
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance; an infinite value selects the ideal kernels.
    C : float
        Neuron capacitance.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter; a value of 0 skips the smoothing term.

    Returns
    -------
    u_rec : ndarray of floats
        Real part of the recovered signal.

    Raises
    ------
    ValueError
        If the data type of `s` is neither float32 nor float64, or if
        2*pi*M/bw is smaller than `dur`.

    """

    N = len(s)

    # Kernel precision flag, complex type of the intermediate arrays
    # and pseudoinverse cutoff, keyed by the element type of s:
    float_type = s.dtype.type
    settings_by_type = {
        np.float32: (0, np.complex64, 1e-4),
        np.float64: (1, np.complex128, 1e-8),
    }
    if float_type not in settings_by_type:
        raise ValueError('unsupported data type')
    use_double, complex_type, rcond = settings_by_type[float_type]

    period = 2 * np.pi * M / bw
    if dur > period:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    dev = cumisc.get_current_device()

    # Build the kernels for the selected precision:
    cache_dir = None
    module_q = SourceModule(
        compute_q_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    kernel_q_ideal = module_q.get_function('compute_q_ideal')
    kernel_q_leaky = module_q.get_function('compute_q_leaky')

    module_F = SourceModule(
        compute_F_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    kernel_F_ideal = module_F.get_function('compute_F_ideal')
    kernel_F_leaky = module_F.get_function('compute_F_leaky')

    module_u = SourceModule(
        compute_u_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    kernel_u = module_u.get_function('compute_u')

    # Upload the intervals and their running sum (the cumulative sum
    # is still computed on the host):
    s_gpu = gpuarray.to_gpu(s)
    ts_gpu = gpuarray.to_gpu(np.cumsum(s))

    # The intermediate arrays are complex so that the CUBLAS matrix
    # multiplication functions can be applied to them:
    rows = N - 1
    cols = 2 * M + 1
    q_gpu = gpuarray.empty((rows, 1), complex_type)
    F_gpu = gpuarray.empty((rows, cols), complex_type)

    # Use fewer threads per block than the maximum so that the kernels
    # do not run out of registers:
    max_threads_per_block = 256
    block_q, grid_q = cumisc.select_block_grid_sizes(
        dev, q_gpu.shape, max_threads_per_block)
    block_F, grid_F = cumisc.select_block_grid_sizes(
        dev, F_gpu.shape, max_threads_per_block)

    if np.isinf(R):
        kernel_q_ideal(s_gpu, q_gpu,
                       float_type(b), float_type(d), float_type(C),
                       np.uint32(rows),
                       block=block_q, grid=grid_q)
        kernel_F_ideal(s_gpu, ts_gpu, F_gpu,
                       float_type(bw), np.int32(M),
                       np.uint32(rows * cols),
                       block=block_F, grid=grid_F)
    else:
        kernel_q_leaky(s_gpu, q_gpu,
                       float_type(b), float_type(d),
                       float_type(R), float_type(C),
                       np.uint32(rows),
                       block=block_q, grid=grid_q)
        kernel_F_leaky(s_gpu, ts_gpu, F_gpu,
                       float_type(bw), float_type(R), float_type(C),
                       np.int32(M),
                       np.uint32(rows * cols),
                       block=block_F, grid=grid_F)

    # F^H q is formed first so that q can be released:
    FHq_gpu = culinalg.dot(F_gpu, q_gpu, 'c')
    del q_gpu

    # Coefficients from the pseudoinverse of F^H F; the scaled identity
    # is added only when smoothing is requested:
    FHF_gpu = culinalg.dot(F_gpu, F_gpu, 'c')
    if smoothing != 0:
        FHF_gpu = FHF_gpu + \
            rows * smoothing * culinalg.eye(cols, float_type)
    c_gpu = culinalg.dot(culinalg.pinv(FHF_gpu, rcond), FHq_gpu)

    # Output buffer. It is zero-filled on the host and uploaded; the
    # original author's note says this avoids a bug in gpuarray.zeros
    # in pycuda 2011.1.2 and may be replaced with
    # gpuarray.zeros(Nt, complex_type) once that is fixed.
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    block_t, grid_t = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct the signal:
    kernel_u(u_rec_gpu, c_gpu,
             float_type(bw), float_type(dt),
             np.int32(M), np.uint32(Nt),
             block=block_t, grid=grid_t)

    return u_rec_gpu.get().real

예제 #9

파일 보기

파일: iaf_cuda.py 프로젝트: bionet/ted.python

def iaf_encode_pop(u_gpu, dt, b_gpu, d_gpu, R_gpu, C_gpu,
                   y_gpu=None, interval_gpu=None,
                   quad_method='trapz', full_output=False):
    """
    Population IAF time encoding machine.

    Encode a finite length signal with a population of Integrate-and-Fire
    Neurons.

    Parameters
    ----------
    u_gpu : pycuda.gpuarray.GPUArray
        Signal to encode.
    dt : float
        Sampling resolution of input signal; the sampling frequency is
        1/dt Hz.
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases. Its size determines the number of
        encoders.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    y_gpu : pycuda.gpuarray.GPUArray, optional
        Initial values of integrators. A zero-filled array is
        allocated when omitted.
    interval_gpu : pycuda.gpuarray.GPUArray, optional
        Times since last spike (in s) for each neuron. A zero-filled
        array is allocated when omitted.
    quad_method : {'rect', 'trapz'}
        Quadrature method to use (rectangular or trapezoidal) when the
        neuron is ideal; exponential Euler integration is used
        when the neuron is leaky.
    full_output : bool
        If true, the function returns the updated arrays `y_gpu` and
        `interval_gpu` in addition to the encoded data block.

    Returns
    -------
    [s_gpu, ns_gpu] : list of pycuda.gpuarray.GPUArray
        If `full_output` is false, returns the encoded signal as a
        matrix `s_gpu` whose rows contain the spike times generated by each
        neuron. The number of spike times in each row is returned in
        `ns_gpu`; all other values in `s_gpu` are set to 0.
    [s_gpu, ns_gpu, y_gpu, interval_gpu] : list of pycuda.gpuarray.GPUArray
        If `full_output` is true, returns the encoded signal
        followed by updated encoder parameters.

    Raises
    ------
    ValueError
        If the data type of `u_gpu` is neither float32 nor float64, or
        if the parameter arrays (including `y_gpu` and `interval_gpu`
        when given) do not all have the same size as `b_gpu`.

    """

    float_type = u_gpu.dtype.type
    if float_type == np.float32:
        use_double = 0
    elif float_type == np.float64:
        use_double = 1
    else:
        raise ValueError('unsupported data type')

    # Get the length of the signal:
    Nu = u_gpu.size

    N = b_gpu.size
    if (d_gpu.size != N) or \
           (R_gpu.size != N) or (C_gpu.size != N):
        raise ValueError('parameter arrays must be of same length')

    # Test for None by identity: `!=` / `==` would invoke the array's
    # own comparison operators instead of answering "was an argument
    # supplied?".
    if ((y_gpu is not None) and (y_gpu.size != N)) or \
       ((interval_gpu is not None) and (interval_gpu.size != N)):
        raise ValueError('parameter arrays must be of same length')

    dev = cumisc.get_current_device()

    # Use a smaller block size than the maximum to prevent the kernels
    # from using too many registers:
    max_threads_per_block = 256

    # Get required block/grid sizes for running N encoders to process
    # the N signals:
    block_dim, grid_dim = cumisc.select_block_grid_sizes(dev, N,
                          max_threads_per_block)

    # Configure kernel:
    cache_dir = None
    iaf_encode_pop_mod = \
                   SourceModule(iaf_encode_pop_template.substitute(use_double=use_double),
                                cache_dir=cache_dir)
    iaf_encode_pop = iaf_encode_pop_mod.get_function("iaf_encode_pop")

    # Initialize integrator variables if necessary:
    if y_gpu is None:
        y_gpu = gpuarray.zeros(N, float_type)
    if interval_gpu is None:
        interval_gpu = gpuarray.zeros(N, float_type)

    # XXX: A very long s array might cause memory problems:
    s_gpu = gpuarray.zeros((N, Nu), float_type)
    ns_gpu = gpuarray.zeros(N, np.uint32)
    iaf_encode_pop(u_gpu, s_gpu, ns_gpu,
                   float_type(dt), b_gpu, d_gpu,
                   R_gpu, C_gpu,
                   y_gpu, interval_gpu,
                   np.uint32(quad_method == 'trapz'),
                   np.uint32(Nu),
                   np.uint32(N),
                   block=block_dim, grid=grid_dim)

    if full_output:
        return [s_gpu, ns_gpu, y_gpu, interval_gpu]
    else:
        return [s_gpu, ns_gpu]

예제 #10

파일 보기

파일: iaf_cuda.py 프로젝트: bionet/ted.python

def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0):
    """
    IAF time decoding machine.

    Reconstruct a finite length signal from the interspike intervals
    produced by a single Integrate-and-Fire neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in s).
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance; an infinite value selects the ideal kernels.
    C : float
        Neuron capacitance.

    Returns
    -------
    u_rec : ndarray of floats
        Real part of the recovered signal.

    Raises
    ------
    ValueError
        If the data type of `s` is neither float32 nor float64.
    """

    N = len(s)
    n_q = N - 1

    # The element type of s selects the kernel precision, the complex
    # type of the intermediate arrays, and the pseudoinverse cutoff:
    float_type = s.dtype.type
    precision_table = {
        np.float32: (0, np.complex64, 1e-4),
        np.float64: (1, np.complex128, 1e-8),
    }
    if float_type not in precision_table:
        raise ValueError('unsupported data type')
    use_double, complex_type, pinv_rcond = precision_table[float_type]

    # Compile the kernels for the selected precision:
    ts_mod = SourceModule(
        compute_ts_template.substitute(use_double=use_double))
    ts_kernel = ts_mod.get_function('compute_ts')

    tsh_mod = SourceModule(
        compute_tsh_template.substitute(use_double=use_double))
    tsh_kernel = tsh_mod.get_function('compute_tsh')

    q_mod = SourceModule(
        compute_q_template.substitute(use_double=use_double))
    q_ideal_kernel = q_mod.get_function('compute_q_ideal')
    q_leaky_kernel = q_mod.get_function('compute_q_leaky')

    # The G template is additionally specialised on the column count:
    G_mod = SourceModule(
        compute_G_template.substitute(use_double=use_double, cols=n_q),
        options=['-I', install_headers])
    G_ideal_kernel = G_mod.get_function('compute_G_ideal')
    G_leaky_kernel = G_mod.get_function('compute_G_leaky')

    u_mod = SourceModule(
        compute_u_template.substitute(use_double=use_double),
        options=["-I", install_headers])
    u_kernel = u_mod.get_function('compute_u')

    # Upload the intervals:
    s_gpu = gpuarray.to_gpu(s)

    # Device arrays for the intermediate results:
    ts_gpu = gpuarray.empty(N, float_type)
    tsh_gpu = gpuarray.empty(n_q, float_type)
    q_gpu = gpuarray.empty((n_q, 1), complex_type)
    G_gpu = gpuarray.empty((n_q, n_q), complex_type)

    # Launch geometry; a block size below the maximum keeps the
    # kernels from using too many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128
    s_block, s_grid = cumisc.select_block_grid_sizes(
        dev, s_gpu.shape, max_threads_per_block)
    G_block, G_grid = cumisc.select_block_grid_sizes(
        dev, G_gpu.shape, max_threads_per_block)

    # Run the kernels that build ts, tsh, q and G:
    ts_kernel(s_gpu, ts_gpu, np.uint32(N),
              block=s_block, grid=s_grid)
    tsh_kernel(ts_gpu, tsh_gpu, np.uint32(n_q),
               block=s_block, grid=s_grid)
    if np.isinf(R):
        q_ideal_kernel(s_gpu, q_gpu,
                       float_type(b), float_type(d), float_type(C),
                       np.uint32(n_q),
                       block=s_block, grid=s_grid)
        G_ideal_kernel(ts_gpu, tsh_gpu, G_gpu,
                       float_type(bw),
                       np.uint32(n_q**2),
                       block=G_block, grid=G_grid)
    else:
        q_leaky_kernel(s_gpu, q_gpu,
                       float_type(b), float_type(d),
                       float_type(R), float_type(C),
                       np.uint32(n_q),
                       block=s_block, grid=s_grid)
        G_leaky_kernel(ts_gpu, tsh_gpu, G_gpu,
                       float_type(bw), float_type(R), float_type(C),
                       np.uint32(n_q**2),
                       block=G_block, grid=G_grid)

    # Release s and ts to leave more memory for the pinv computation:
    del s_gpu, ts_gpu

    # Reconstruction coefficients from the pseudoinverse of G:
    G_inv_gpu = culinalg.pinv(G_gpu, pinv_rcond)
    c_gpu = culinalg.dot(G_inv_gpu, q_gpu)

    # G, its pseudoinverse and q are no longer needed:
    del G_gpu, G_inv_gpu, q_gpu

    # Buffer for the reconstructed signal. The zeros are created on
    # the host and uploaded; per the original author's note this works
    # around a bug in gpuarray.zeros in pycuda 2011.1.2 and can become
    # gpuarray.zeros(Nt, complex_type) once that bug is fixed.
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    t_block, t_grid = cumisc.select_block_grid_sizes(
        dev, Nt, max_threads_per_block)

    # Reconstruct the signal:
    u_kernel(u_rec_gpu, c_gpu,
             tsh_gpu, float_type(bw), float_type(dt),
             np.uint32(Nt), np.uint32(n_q),
             block=t_block, grid=t_grid)

    return u_rec_gpu.get().real

예제 #11

파일 보기

파일: iaf_cuda.py 프로젝트: bionet/ted.python

def iaf_encode(u, dt, b, d, R=np.inf, C=1.0, dte=0.0, y=0.0, interval=0.0,
               quad_method='trapz', full_output=False):
    """
    IAF time encoding machine.

    Encode a finite length signal with an Integrate-and-Fire neuron.

    Parameters
    ----------
    u : array_like of floats
        Signal to encode. Its element type must be float32 or float64.
    dt : float
        Sampling resolution of input signal; the sampling frequency
        is 1/dt Hz.
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance.
    C : float
        Neuron capacitance.
    dte : float
        Sampling resolution assumed by the encoder (s).
        This may not exceed `dt`. When it is nonzero and differs from
        `dt`, the signal is resampled and `dt` is replaced by `dte`.
    y : float
        Initial value of integrator.
    interval : float
        Time since last spike (in s).
    quad_method : {'rect', 'trapz'}
        Quadrature method to use (rectangular or trapezoidal) when the
        neuron is ideal; exponential Euler integration is used
        when the neuron is leaky.
    full_output : bool
        If set, the function returns the encoded data block followed
        by the given parameters (with updated values for `y` and `interval`).
        This is useful when the function is called repeatedly to
        encode a long signal.

    Returns
    -------
    s : ndarray of floats
        If `full_output` is false, returns the signal encoded as an
        array of interspike intervals.
    (s, dt, b, d, R, C, dte, y, interval, quad_method, full_output) : tuple
        If `full_output` is true, returns the encoded signal
        followed by updated encoder parameters.

    Raises
    ------
    ValueError
        If the data type of `u` is neither float32 nor float64, if
        `dte` exceeds `dt`, or if `dte` is negative.

    Notes
    -----
    When trapezoidal integration is used, the value of the integral
    will not be computed for the very last entry in `u`.

    """

    # Accept any array_like input, as documented:
    u = np.asarray(u)

    # Input sanity check:
    float_type = u.dtype.type
    if float_type == np.float32:
        use_double = 0
    elif float_type == np.float64:
        use_double = 1
    else:
        raise ValueError('unsupported data type')

    # Handle empty input; the result has the same layout and element
    # type as in the non-empty case:
    Nu = len(u)
    if Nu == 0:
        empty = np.zeros(0, float_type)
        if full_output:
            return empty, dt, b, d, R, C, dte, y, interval, \
                   quad_method, full_output
        else:
            return empty

    # Check whether the encoding resolution is finer than that of the
    # original sampled signal:
    if dte > dt:
        raise ValueError('encoding time resolution must not exceeed original signal resolution')
    if dte < 0:
        raise ValueError('encoding time resolution must be nonnegative')
    if dte != 0 and dte != dt:

        # Resample signal and adjust signal length accordingly. The
        # result is cast back to float_type so the buffer handed to
        # the kernel matches the precision it was compiled for, in
        # case resample returns a different precision:
        M = int(dt/dte)
        u = np.asarray(resample(u, len(u)*M), float_type)
        Nu *= M
        dt = dte

    # Configure kernel:
    iaf_encode_mod = \
                   SourceModule(iaf_encode_template.substitute(use_double=use_double))
    iaf_encode = iaf_encode_mod.get_function("iaf_encode")

    # Host buffers that the kernel reads and updates: the spike
    # intervals, the number of intervals written, and the integrator
    # state.
    # XXX: A very long s array might cause memory problems:
    s = np.zeros(Nu, float_type)
    i_s_0 = np.zeros(1, np.uint32)
    y_0 = np.asarray([y], float_type)
    interval_0 = np.asarray([interval], float_type)

    # The kernel is launched with a single-thread block:
    iaf_encode(drv.In(u), drv.Out(s), drv.InOut(i_s_0),
               float_type(dt), float_type(b),
               float_type(d), float_type(R), float_type(C),
               drv.InOut(y_0), drv.InOut(interval_0),
               np.uint32(quad_method == 'trapz'),
               np.uint32(Nu),
               block=(1, 1, 1))

    # Only the first i_s_0[0] entries of s were filled in:
    if full_output:
        # dte is included so the tuple lines up with the parameter
        # list and with the empty-input return above:
        return s[0:i_s_0[0]], dt, b, d, R, C, dte, y_0[0], interval_0[0], \
               quad_method, full_output
    else:
        return s[0:i_s_0[0]]

예제 #12

파일 보기

파일: iaf_cuda.py 프로젝트: bionet/ted.python

def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu,
                   R_gpu, C_gpu):
    """
    Multiple-input single-output IAF time decoding machine.

    Decode a signal encoded with an ensemble of Integrate-and-Fire
    neurons assuming that the encoded signal is representable in terms
    of sinc kernels.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders, stored as a 2D
        array with one row per encoder. The nonzero values represent
        the time between spikes (in s). Its dtype must be float32 or
        float64 and selects the precision used by the kernels.
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. If every entry is infinite the
        ideal-neuron kernels are used; otherwise the leaky-neuron
        kernels are used.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.

    Returns
    -------
    u_rec : numpy.ndarray
        Real part of the recovered signal, copied back to host memory.
        It contains ``ceil(dur/dt)`` samples.

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is neither float32 nor float64, if
        `s_gpu` has no rows, or if `ns_gpu`, `b_gpu`, `d_gpu`, `R_gpu`
        and `C_gpu` do not each hold exactly one entry per row of
        `s_gpu`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.
    """

    # Sanity checks; the floating point type of the input selects the
    # kernel precision, the matching complex type, and the cutoff
    # passed to the pseudoinverse:
    float_type = s_gpu.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        pinv_rcond = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        pinv_rcond = 1e-8
    else:
        raise ValueError('unsupported data type')

    # Number of spike trains:
    N = s_gpu.shape[0]
    if not N:
        raise ValueError('no spike data given')
    if (ns_gpu.size != N) or (b_gpu.size != N) or (d_gpu.size != N) or \
       (R_gpu.size != N) or (C_gpu.size != N):
        raise ValueError('parameter arrays must be of same length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    # Get required block/grid sizes; use a smaller block size than the
    # maximum to prevent the kernels from using too many registers:
    dev = cumisc.get_current_device()
    max_threads_per_block = 128

    # Prepare kernels:
    cache_dir = None
    compute_q_pop_mod = \
        SourceModule(compute_q_pop_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_q_ideal_pop = \
                        compute_q_pop_mod.get_function('compute_q_ideal')
    compute_q_leaky_pop = \
                        compute_q_pop_mod.get_function('compute_q_leaky')

    compute_ts_pop_mod = \
        SourceModule(compute_ts_pop_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_ts_pop = \
                   compute_ts_pop_mod.get_function('compute_ts')

    compute_tsh_pop_mod = \
        SourceModule(compute_tsh_pop_template.substitute(use_double=use_double),
                     cache_dir=cache_dir)
    compute_tsh_pop = \
                    compute_tsh_pop_mod.get_function('compute_tsh')

    compute_G_pop_mod = \
        SourceModule(compute_G_pop_template.substitute(use_double=use_double),
                     options=['-I', install_headers])
    compute_G_ideal_pop = \
                        compute_G_pop_mod.get_function('compute_G_ideal')
    compute_G_leaky_pop = \
                        compute_G_pop_mod.get_function('compute_G_leaky')

    compute_u_pop_mod = \
        SourceModule(compute_u_pop_template.substitute(use_double=use_double),
                     options=['-I', install_headers])
    compute_u_pop = \
                  compute_u_pop_mod.get_function('compute_u')

    # Total number of interspike intervals per neuron less 1 for each
    # spike train with more than 1 interspike interval:
    Nq = int(np.sum(ns)-np.sum(ns>1))

    # Set up GPUArrays for intermediary data:
    ts_gpu = gpuarray.zeros_like(s_gpu)
    tsh_gpu = gpuarray.zeros_like(s_gpu)

    # Note that these arrays are complex to enable use of CUBLAS
    # matrix multiplication functions:
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    G_gpu = gpuarray.empty((Nq, Nq), complex_type)

    # Get required block/grid sizes; the `ts` configuration is sized
    # for the N spike trains, the `q` and `G` configurations for the
    # shapes of the corresponding arrays:
    block_dim_ts, grid_dim_ts = \
                  cumisc.select_block_grid_sizes(dev, N,
                                                 max_threads_per_block)
    block_dim_q, grid_dim_q = \
                 cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                                max_threads_per_block)
    block_dim_G, grid_dim_G = \
                 cumisc.select_block_grid_sizes(dev, G_gpu.shape,
                                                max_threads_per_block)

    # Launch kernels:
    compute_ts_pop(s_gpu, ns_gpu, ts_gpu,
                   np.uint32(s_gpu.shape[1]), np.uint32(N),
                   block=block_dim_ts, grid=grid_dim_ts)

    # compute_tsh is given the same (columns, N) bounds as compute_ts,
    # so it is launched with the configuration sized for N rather than
    # the one sized for q_gpu; the latter provides fewer than N threads
    # whenever Nq < N (e.g., when some spike trains are empty).
    # NOTE(review): assumes compute_tsh uses one thread per row of
    # s_gpu, as its arguments suggest - confirm against the kernel
    # template.
    compute_tsh_pop(ts_gpu, ns_gpu, tsh_gpu,
                    np.uint32(s_gpu.shape[1]), np.uint32(N),
                    block=block_dim_ts, grid=grid_dim_ts)
    if np.all(np.isinf(R_gpu.get())):
        compute_q_ideal_pop(s_gpu, q_gpu, b_gpu, d_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q, grid=grid_dim_q)
        compute_G_ideal_pop(ts_gpu, tsh_gpu, G_gpu, float_type(bw),
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(Nq),
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(G_gpu.size),
                            block=block_dim_G, grid=grid_dim_G)
    else:
        compute_q_leaky_pop(s_gpu, q_gpu, b_gpu, d_gpu, R_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q, grid=grid_dim_q)
        compute_G_leaky_pop(ts_gpu, tsh_gpu, G_gpu, float_type(bw),
                            R_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(Nq),
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(G_gpu.size),
                            block=block_dim_G, grid=grid_dim_G)

    # Drop references to arrays that are no longer needed (tsh_gpu is
    # still required for the reconstruction below):
    del ts_gpu, idx_to_ni_gpu, idx_to_k_gpu

    # Compute the reconstruction coefficients:
    c_gpu = culinalg.dot(culinalg.pinv(G_gpu, pinv_rcond), q_gpu)

    # Drop references to G and q:
    del G_gpu, q_gpu

    # Allocate arrays needed for reconstruction:
    Nt = int(np.ceil(dur/dt))

    # NOTE: the output buffer is zero-filled on the host and then
    # copied to the device because of a bug in gpuarray.zeros; once
    # that bug is fixed this can become gpuarray.zeros(Nt, complex_type).
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))

    # Get required block/grid sizes for constructing u:
    block_dim_t, grid_dim_t = \
                 cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u_pop(u_rec_gpu, c_gpu, tsh_gpu, ns_gpu,
                  float_type(bw), float_type(dt),
                  np.uint32(s_gpu.shape[1]),
                  np.uint32(N),
                  np.uint32(Nt),
                  block=block_dim_t, grid=grid_dim_t)

    # Copy the result back to the host; only the real part is returned
    # because the buffer is complex solely for the CUBLAS-based steps:
    u_rec = u_rec_gpu.get()

    return np.real(u_rec)