def compute_Gb(self, Dsfilename, lamb=0.0):
    """
    Compute the G matrix from Dirichlet coefficients stored in an HDF5 file.

    The file must expose two nodes, ``root.real`` and ``root.imag``.  The
    real part is flattened to one row per leading index and uploaded to
    the GPU; the imaginary part is then loaded into the same device
    array.  Each part is multiplied with itself through a cuBLAS gemm and
    the two products are accumulated into a square weighting matrix.
    That matrix goes through the put-norm kernel together with
    ``self.d_norm`` and is finally consumed by the G kernel.

    The result is stored in ``self.d_G``; nothing is returned.

    Parameters
    ----------
    Dsfilename : str
        file generated by VTDM_prepb
    lamb : float, optional
        smoothing parameter lambda.  When non-zero, the diag-add kernel
        is launched on ``self.d_G`` with this value (presumably adding
        it to the diagonal -- confirm against the kernel source).
    """
    handle = la.cublashandle()
    import tables

    # PyTables 3 renamed openFile to open_file; use whichever is present.
    open_h5 = getattr(tables, 'open_file', None) or tables.openFile
    h5file = open_h5(Dsfilename)
    try:
        Ds = h5file.root.real.read()
        d_Ds = parray.to_gpu(Ds.reshape((Ds.shape[0], -1)))
        del Ds

        d_Dsw = parray.empty((d_Ds.shape[0], d_Ds.shape[0]), d_Ds.dtype)
        if d_Ds.dtype == np.float64:
            from scikits.cuda.cublas import cublasDgemm
            gemm = cublasDgemm
        else:
            from scikits.cuda.cublas import cublasSgemm
            gemm = cublasSgemm

        # beta = 0.0: d_Dsw is overwritten with the real-part product
        gemm(handle.handle, 't', 'n',
             d_Dsw.shape[0], d_Dsw.shape[0], d_Ds.shape[1],
             1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
             0.0, d_Dsw.gpudata, d_Dsw.ld)

        Ds = h5file.root.imag.read()
        d_Ds.set(Ds)
        # beta = 1.0: the imaginary-part product is added to d_Dsw
        gemm(handle.handle, 't', 'n',
             d_Dsw.shape[0], d_Dsw.shape[0], d_Ds.shape[1],
             1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
             1.0, d_Dsw.gpudata, d_Dsw.ld)
        del Ds
    finally:
        # close the file even when a read or a gemm call raises
        h5file.close()

    norm_func = get_put_norm_kernel(d_Dsw.dtype)
    launch_kernel(norm_func, (256, 1, 1), (d_Dsw.shape[0], 1),
                  [d_Dsw, self.d_norm, d_Dsw.ld])

    self.d_G = parray.empty((self.size, self.size), self.dtype)
    G_func = get_G_kernel(self.dtype, d_Dsw.dtype)
    launch_kernel(G_func, (256, 1, 1), (self.d_G.shape[0], 1),
                  [self.d_G, self.d_G.ld, self.d_tk1, self.d_tk2,
                   self.Wt, self.Mt, d_Dsw, d_Dsw.ld, self.d_neuron_ind],
                  timed="G matrix")

    if lamb != 0:
        lamb_func = get_diag_add_kernel(self.dtype)
        launch_kernel(
            lamb_func, (256, 1, 1),
            (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1),
            [self.d_G, self.d_G.ld, self.d_G.shape[0],
             self.dtype.type(lamb)])
def compute_G(self, Dswfilename, lamb=0.0):
    """
    Build the G matrix from a precomputed weighting matrix between RFs.

    The weighting matrix is read with ``read_file``, uploaded to the GPU
    and handed to the G kernel as loaded (no put-norm kernel is launched
    in this method).  The result is stored in ``self.d_G``; nothing is
    returned.

    Parameters
    ----------
    Dswfilename : str
        file generated by VTDM_prep
    lamb : float, optional
        smoothing parameter lambda; when non-zero the diag-add kernel is
        launched on ``self.d_G`` with this value
    """
    d_weights = parray.to_gpu(read_file(Dswfilename))

    self.d_G = parray.empty((self.size, self.size), self.dtype)
    g_kernel = get_G_kernel(self.dtype, d_weights.dtype)
    g_args = [
        self.d_G, self.d_G.ld,
        self.d_tk1, self.d_tk2,
        self.Wt, self.Mt,
        d_weights, d_weights.ld,
        self.d_neuron_ind,
    ]
    launch_kernel(g_kernel, (256, 1, 1), (self.d_G.shape[0], 1), g_args,
                  timed="G matrix")

    if lamb == 0:
        return

    diag_kernel = get_diag_add_kernel(self.dtype)
    n_blocks = 6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT
    launch_kernel(diag_kernel, (256, 1, 1), (n_blocks, 1),
                  [self.d_G, self.d_G.ld, self.d_G.shape[0],
                   self.dtype.type(lamb)])
def filter(self, video_input):
    """
    Apply all receptive-field filters to an input video.

    ``video_input`` is resized IN PLACE to ``(num_frames, self.size)``
    before it is uploaded.  Filters are generated in batches sized from
    the free GPU memory reported by ``cuda.mem_get_info`` (three quarters
    of it, rounded down to an even count and capped at
    ``self.num_neurons``); each batch is multiplied with the video via
    ``la.dot`` into the matching rows of the output.

    Parameters
    ----------
    video_input : ndarray
        2D input whose second dimension equals ``self.size``, or 3D
        input whose last two dimensions multiply to ``self.size``

    Returns
    -------
    The result of ``T()`` on the ``(num_neurons, num_frames)`` output
    array.
    """
    dims = video_input.shape
    if len(dims) == 2:
        # already rasterized: one row per frame
        assert dims[1] == self.size
    else:
        # frames are 2D images
        assert (dims[1] * dims[2] == self.size)

    # rasterize the frames (mutates the caller's array)
    video_input.resize((dims[0], self.size))

    d_video = parray.to_gpu(video_input)
    d_output = parray.empty((self.num_neurons, video_input.shape[0]),
                            self.dtype)

    free_bytes, _ = cuda.mem_get_info()
    batch = free_bytes // self.dtype.itemsize * 3 // 4 // self.size
    batch -= batch % 2
    batch = min(batch, self.num_neurons)
    self.ONE_TIME_FILTERS = batch

    handle = la.cublashandle()
    for first in np.arange(0, self.num_neurons, batch):
        count = min(batch, self.num_neurons - first)
        self.generate_filters(startbias=first, N_filters=count)
        la.dot(self.filters, d_video, opb='t',
               C=d_output[first: first + count], handle=handle)

    # release the filter buffer once every batch has been processed
    del self.filters
    return d_output.T()
def reconstruct(self, dirichfilename, time_frame, dt):
    """
    Reconstruct video from c = (G)^{+} q.

    Parameters
    ----------
    dirichfilename : str
        file generated by either VTDM_prep or VTDM_prepb; read with
        ``read_file`` and indexed as a 3D array
    time_frame : tuple or list of 2
        [start_time, end_time]
    dt : float
        interval between two consecutive frames in the reconstruction

    Returns
    -------
    PitchArray of float64 with shape
    (number of time points, dirich.shape[1], dirich.shape[2])

    Notes
    -----
    Assumes the solution c is stored in ``self.d_q``.
    """
    t = np.arange(time_frame[0], time_frame[1], dt)
    d_t = parray.to_gpu(t)

    dirich = read_file(dirichfilename)
    d_dirich = parray.to_gpu(dirich)
    del dirich

    rec_fun = get_reconstruct_kernel(d_dirich.dtype, self.d_q.dtype)
    u_rec = parray.empty(
        (d_t.size, d_dirich.shape[1], d_dirich.shape[2]), np.float64)

    n_pixels = d_dirich.shape[1] * d_dirich.shape[2]
    # ceil(n_pixels / 128) in integer arithmetic; floor division keeps
    # the grid dimension an int under true division as well
    grid = ((n_pixels - 1) // 128 + 1, d_t.size)
    launch_kernel(
        rec_fun, (128, 1, 1), grid,
        [u_rec, u_rec.ld, d_dirich, d_dirich.ld,
         self.d_tk1, self.d_tk2, self.d_q, d_t,
         self.d_neuron_ind, self.d_norm,
         self.Mt, self.Wt / self.Mt, self.size])
    return u_rec
def filter(self, V):
    """
    Filter a video V with the receptive fields configured in self.

    The parameters of the CS RF must be set up first.  Filters are
    generated in batches sized from three quarters of the free GPU
    memory and multiplied with the rasterized video through
    ``cublasDgemm``, scaled by ``self.dx * self.dy``.

    Parameters
    ----------
    V : 3D ndarray, with shape (num_frames, Px, Py)

    Returns
    -------
    The result of ``T()`` on the internal output PitchArray, which is
    allocated with shape (num_neurons, num_frames); row j of that array
    holds the output of filter j.

    Notes
    -----
    NOTE(review): the gemm is always the double-precision variant
    regardless of ``self.dtype`` -- confirm that ``self.dtype`` is
    float64 for every caller.
    """
    d_output = parray.empty((self.num_neurons, V.shape[0]), self.dtype)
    d_video = parray.to_gpu(
        V.reshape(V.shape[0], V.shape[1] * V.shape[2]))

    free, total = cuda.mem_get_info()
    # floor division: the batch size is used as a step and as a count,
    # so it must stay an integer under true division too
    self.ONE_TIME_FILTERS = ((free // self.dtype.itemsize) * 3 // 4
                             // self.Pxall // self.Pyall)

    handle = la.cublashandle()
    for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
        Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
        self.generate_visual_receptive_fields(startbias=i,
                                              N_filters=Nfilters)
        # device address of row i of d_output
        out_ptr = int(int(d_output.gpudata)
                      + int(d_output.ld * i * d_output.dtype.itemsize))
        cublasDgemm(handle.handle, 't', 'n',
                    V.shape[0], int(Nfilters), self.Pxall * self.Pyall,
                    self.dx * self.dy,
                    d_video.gpudata, d_video.ld,
                    self.filters.gpudata, self.filters.ld,
                    0, out_ptr, d_output.ld)
    return d_output.T()
def compute_Dsw(self, d_Ds, Mx, My, h_norm):
    """
    Compute the weighting matrix of the "correlation" between each two RFs.

    The coefficient array is multiplied with its own conjugate transpose
    through a complex cuBLAS gemm; the real part of the product is then
    passed through the put-norm kernel together with ``h_norm``.

    Parameters
    ----------
    d_Ds : PitchArray
        dirichlet coefficients, most possibly created by compute_Ds
    Mx : integer
        order in the x dimension
    My : integer
        order in the y dimension
    h_norm : ndarray
        host array; converted to float64 and uploaded for the put-norm
        kernel

    Returns
    -------
    PitchArray with shape (num_neurons, num_neurons)
    """
    n_neurons = self.num_neurons
    n_coeff = (2 * Mx + 1) * (2 * My + 1)

    gemm = cublasZgemm if self.dtype == np.complex128 else cublasCgemm

    d_gram = parray.empty((n_neurons, n_neurons), self.dtype)
    handle = la.cublashandle()
    gemm(handle.handle, 'c', 'n',
         n_neurons, n_neurons, n_coeff,
         1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
         0, d_gram.gpudata, d_gram.ld)

    d_Dsw = d_gram.real()
    put_norm = get_put_norm_kernel(d_Dsw.dtype)
    d_norm = parray.to_gpu(h_norm.astype(np.float64))
    launch_kernel(put_norm, (256, 1, 1), (d_Dsw.shape[0], 1),
                  [d_Dsw, d_norm, d_Dsw.ld])
    return d_Dsw
def reconstruct(self, dirichfilename, time_frame, dt):
    """
    Reconstruct video from c = (G)^{+} q.

    Parameters
    ----------
    dirichfilename : str
        file generated by either VTDM_prep or VTDM_prepb; read with
        ``read_file`` and indexed as a 3D array
    time_frame : tuple or list of 2
        [start_time, end_time]
    dt : float
        interval between two consecutive frames in the reconstruction

    Returns
    -------
    PitchArray of float64 with shape
    (number of time points, dirich.shape[1], dirich.shape[2])

    Notes
    -----
    Assumes the solution c is stored in ``self.d_q``.
    """
    t = np.arange(time_frame[0], time_frame[1], dt)
    d_t = parray.to_gpu(t)

    dirich = read_file(dirichfilename)
    d_dirich = parray.to_gpu(dirich)
    del dirich

    rec_fun = get_reconstruct_kernel(d_dirich.dtype, self.d_q.dtype)
    u_rec = parray.empty(
        (d_t.size, d_dirich.shape[1], d_dirich.shape[2]), np.float64)

    n_pixels = d_dirich.shape[1] * d_dirich.shape[2]
    # one block per 128 pixels, rounded up; floor division keeps the
    # grid dimension an int under true division as well
    grid = ((n_pixels - 1) // 128 + 1, d_t.size)
    launch_kernel(
        rec_fun, (128, 1, 1), grid,
        [
            u_rec, u_rec.ld, d_dirich, d_dirich.ld,
            self.d_tk1, self.d_tk2, self.d_q, d_t,
            self.d_neuron_ind, self.d_norm,
            self.Mt, self.Wt / self.Mt, self.size
        ])
    return u_rec
def compute_Ds(self, Mx, My):
    """
    Compute the dirichlet coefficients of all filters.

    Filters are generated in batches, transformed in place with a
    batched 2D forward cuFFT of size (self.Py, self.Px), and the fft2Ds
    kernel then writes the coefficients of each batch into the output.

    Parameters
    ----------
    Mx : integer
        Order in the x dimension
    My : integer
        Order in the y dimension

    Returns
    -------
    PitchArray of ``self.dtype`` allocated with shape
    (num_neurons, 2*My+1, 2*Mx+1)
    """
    import scikits.cuda.cufft as cufft

    d_Ds = parray.empty((self.num_neurons, 2 * My + 1, 2 * Mx + 1),
                        self.dtype)

    # Batch size derived from a fixed 1024**3-byte budget, halved.
    # Floor division keeps it an int: it is used as a range() step and
    # as the cuFFT batch count.
    ONE_TIME_FILTER = min(
        1024 ** 3 // (self.Px * self.Py * d_Ds.dtype.itemsize) // 2,
        self.num_neurons)

    n = np.asarray((self.Py, self.Px), np.int32)
    if self.dtype == np.complex128:
        fft_type = cufft.CUFFT_Z2Z
        fftfunc = cufft.cufftExecZ2Z
    else:
        fft_type = cufft.CUFFT_C2C
        fftfunc = cufft.cufftExecC2C

    def make_plan(batch):
        # batched 2D plan over `batch` filters
        return cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                   None, 1, 0, fft_type, batch)

    fft2Dsfun = get_fft2Ds_kernel(dtype=self.dtype)

    plan = make_plan(ONE_TIME_FILTER)
    try:
        for i in range(0, self.num_neurons, ONE_TIME_FILTER):
            N_filters = min(ONE_TIME_FILTER, self.num_neurons - i)
            self.generate_visual_receptive_fields(startbias=i,
                                                  N_filters=N_filters)
            if N_filters < ONE_TIME_FILTER:
                # last, smaller batch: the plan must match its size
                cufft.cufftDestroy(plan)
                plan = None
                plan = make_plan(N_filters)

            # be careful with the side-by-side constraint
            fftfunc(plan, int(self.filters.gpudata),
                    int(self.filters.gpudata), cufft.CUFFT_FORWARD)
            launch_kernel(fft2Dsfun, (256, 1, 1),
                          (Mx * 2 + 1, My * 2 + 1),
                          [[d_Ds, i * d_Ds.ld], self.filters,
                           Mx, My, self.Px, self.Py, N_filters,
                           d_Ds.ld, self.dx * self.dy])
    finally:
        # release the plan even when a batch fails
        if plan is not None:
            cufft.cufftDestroy(plan)
    return d_Ds
def compute_dirich_space(self, d_Ds, Mx, My, Px, Py, Sx, Sy, Wx, Wy,
                         x_start=None, y_start=None):
    """
    Compute the spatial reconstruction functions.

    Parameters
    ----------
    d_Ds : PitchArray
        dirichlet coefficients, most possibly created by compute_Ds
    Mx : integer
        Order in the x dimension
    My : integer
        Order in the y dimension
    Px : integer
        number of pixels in reconstruction functions in the x dimension
    Py : integer
        number of pixels in reconstruction functions in the y dimension
    Sx : float
        spatial domain in degree of reconstruction functions in x direction
    Sy : float
        spatial domain in degree of reconstruction functions in y direction
    Wx : float
        spatial bandwidth in x direction
    Wy : float
        spatial bandwidth in y direction
    x_start : float, optional
        starting degree in x direction; defaults to -Sx / 2
    y_start : float, optional
        starting degree in y direction; defaults to -Sy / 2

    Returns
    -------
    PitchArray allocated with shape (num_neurons, Py, Px); float64 when
    ``self.dtype`` is complex128, float32 otherwise
    """
    if self.dtype == np.complex128:
        typef = np.dtype(np.float64)
    else:
        typef = np.dtype(np.float32)

    dirich_fun = get_dirich_space_kernel(self.dtype, typef)
    d_dirich = parray.empty((self.num_neurons, Py, Px), typef)

    if x_start is None:
        x_start = -Sx / 2
    if y_start is None:
        y_start = -Sy / 2

    BLOCKSIZE = 16
    # number of BLOCKSIZE x BLOCKSIZE tiles covering Px x Py pixels;
    # floor division keeps the grid dimension an int under true division
    n_tiles = (((Px - 1) // BLOCKSIZE + 1)
               * ((Py - 1) // BLOCKSIZE + 1))
    launch_kernel(dirich_fun, (BLOCKSIZE, BLOCKSIZE, 1),
                  (n_tiles, self.num_neurons),
                  [d_dirich, d_dirich.ld, d_Ds, d_Ds.ld,
                   Px, Py, Mx, My, Sx, Sy, x_start, y_start,
                   Wx / Mx, Wy / My],
                  shared=d_Ds.dtype.itemsize * (2 * Mx + 1),
                  timed="dirich")
    return d_dirich
def compute_q(self):
    """
    Compute q.

    Allocates ``self.d_q`` with shape (self.size, 1) and launches the
    compute-q kernel on it.  Nothing is returned.
    """
    self.d_q = parray.empty((self.size, 1), self.dtype)

    kernel = get_compute_q_kernel(self.dtype)
    # six blocks per multiprocessor of the current device
    n_blocks = 6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT
    kernel_args = [
        self.d_q,
        self.d_tk1, self.d_tk2,
        self.d_neuron_ind,
        self.d_kappa, self.d_delta, self.d_bias,
        self.d_norm,
        self.size,
    ]
    launch_kernel(kernel, (256, 1, 1), (n_blocks, 1), kernel_args)
def generate_filters(self, N_filters=None, startbias=0):
    """
    Generate a batch of filters from the parameters set in self.

    The buffer ``self.filters`` is reused when it already holds exactly
    ``N_filters`` rows; otherwise it is dropped and reallocated with
    shape (N_filters, self.size).

    Parameters
    ----------
    N_filters : integer, optional
        number of filters to generate; defaults to all filters from
        ``startbias`` up to ``self.num_neurons``
    startbias : integer, optional
        index of the first filter to generate
    """
    assert self.gpu_loaded

    if N_filters is None:
        N_filters = self.num_neurons - startbias

    must_allocate = True
    if hasattr(self, 'filters'):
        if self.filters.shape[0] == N_filters:
            must_allocate = False
        else:
            # drop the old buffer before allocating the new one
            delattr(self, 'filters')

    if must_allocate:
        self.filters = parray.empty((N_filters, self.size), self.dtype)

    self._call_filter_func(N_filters, startbias)
def compute_Gb(self, Dsfilename, lamb=0.0):
    """
    Compute the G matrix from Dirichlet coefficients stored in an HDF5 file.

    The file must expose two nodes, ``root.real`` and ``root.imag``.  The
    real part is flattened to one row per leading index and uploaded to
    the GPU; the imaginary part is then loaded into the same device
    array.  Each part is multiplied with itself through a cuBLAS gemm and
    the two products are accumulated into a square weighting matrix.
    That matrix goes through the put-norm kernel together with
    ``self.d_norm`` and is finally consumed by the G kernel.

    The result is stored in ``self.d_G``; nothing is returned.

    Parameters
    ----------
    Dsfilename : str
        file generated by VTDM_prepb
    lamb : float, optional
        smoothing parameter lambda.  When non-zero, the diag-add kernel
        is launched on ``self.d_G`` with this value (presumably adding
        it to the diagonal -- confirm against the kernel source).
    """
    handle = la.cublashandle()
    import tables

    # PyTables 3 renamed openFile to open_file; use whichever is present.
    open_h5 = getattr(tables, 'open_file', None) or tables.openFile
    h5file = open_h5(Dsfilename)
    try:
        Ds = h5file.root.real.read()
        d_Ds = parray.to_gpu(Ds.reshape((Ds.shape[0], -1)))
        del Ds

        d_Dsw = parray.empty((d_Ds.shape[0], d_Ds.shape[0]), d_Ds.dtype)
        if d_Ds.dtype == np.float64:
            from scikits.cuda.cublas import cublasDgemm
            gemm = cublasDgemm
        else:
            from scikits.cuda.cublas import cublasSgemm
            gemm = cublasSgemm

        n_out = d_Dsw.shape[0]
        n_inner = d_Ds.shape[1]
        # First pass overwrites d_Dsw (beta = 0.0) ...
        gemm(handle.handle, 't', 'n', n_out, n_out, n_inner,
             1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
             0.0, d_Dsw.gpudata, d_Dsw.ld)

        Ds = h5file.root.imag.read()
        d_Ds.set(Ds)
        # ... second pass accumulates into it (beta = 1.0).
        gemm(handle.handle, 't', 'n', n_out, n_out, n_inner,
             1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
             1.0, d_Dsw.gpudata, d_Dsw.ld)
        del Ds
    finally:
        # close the file even when a read or a gemm call raises
        h5file.close()

    norm_func = get_put_norm_kernel(d_Dsw.dtype)
    launch_kernel(norm_func, (256, 1, 1), (d_Dsw.shape[0], 1),
                  [d_Dsw, self.d_norm, d_Dsw.ld])

    self.d_G = parray.empty((self.size, self.size), self.dtype)
    G_func = get_G_kernel(self.dtype, d_Dsw.dtype)
    launch_kernel(G_func, (256, 1, 1), (self.d_G.shape[0], 1),
                  [self.d_G, self.d_G.ld, self.d_tk1, self.d_tk2,
                   self.Wt, self.Mt, d_Dsw, d_Dsw.ld, self.d_neuron_ind],
                  timed="G matrix")

    if lamb != 0:
        lamb_func = get_diag_add_kernel(self.dtype)
        launch_kernel(
            lamb_func, (256, 1, 1),
            (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1),
            [self.d_G, self.d_G.ld, self.d_G.shape[0],
             self.dtype.type(lamb)])
def generate_visual_receptive_fields(self, startbias=0, N_filters=None,
                                     x_start=None, y_start=None):
    """
    Generate a batch of centre surround filters from parameters set in self.

    The buffer ``self.filters`` is (re)allocated with shape
    (N_filters, self.Pyall, self.Pxall) when it does not exist yet or
    has fewer than ``N_filters`` rows; a larger existing buffer is
    reused as is.

    Parameters
    ----------
    startbias : integer, optional
        index of the first filter to generate
    N_filters : integer, optional
        number of filters to generate; defaults to all filters from
        ``startbias`` up to ``self.num_neurons``
    x_start : float, optional
        starting degree in x direction; defaults to -self.Sxall / 2
    y_start : float, optional
        starting degree in y direction; defaults to -self.Syall / 2
    """
    if N_filters is None:
        N_filters = self.num_neurons - startbias

    # Only a missing (or None) buffer triggers a fresh allocation; real
    # errors such as a failed GPU allocation are no longer swallowed and
    # retried the way the former bare ``except`` did.
    have_buffer = getattr(self, 'filters', None) is not None
    if have_buffer and N_filters > self.filters.shape[0]:
        # release the too-small buffer before allocating the bigger one
        del self.filters
        have_buffer = False
    if not have_buffer:
        self.filters = parray.empty(
            (N_filters, self.Pyall, self.Pxall), self.dtype)

    if x_start is None:
        x_start = -self.Sxall / 2
    if y_start is None:
        y_start = -self.Syall / 2

    BLOCK_SIZE = 16
    # number of BLOCK_SIZE x BLOCK_SIZE tiles covering Pxall x Pyall;
    # floor division keeps the grid dimension an int under true division
    n_tiles = (((self.Pxall - 1) // BLOCK_SIZE + 1)
               * ((self.Pyall - 1) // BLOCK_SIZE + 1))
    launch_kernel(self.func, (BLOCK_SIZE, BLOCK_SIZE, 1),
                  (n_tiles, int(N_filters)),
                  [self.filters, self.filters.ld,
                   [self.d_alpha, startbias],
                   [self.d_x0, startbias],
                   [self.d_y0, startbias],
                   self.Pxall, self.Pyall, self.Sxall, self.Syall,
                   x_start, y_start,
                   self.sigma_center ** 2, self.sigma_surround ** 2])
def __init__(self, num_gpot_neurons, gpot_delay_steps, rest,
             num_spike_neurons, spike_delay_steps):
    """
    Allocate the delay buffers for graded-potential and spiking neurons.

    When there are graded-potential neurons, a double-precision buffer
    of shape (gpot_delay_steps, num_gpot_neurons) is allocated and every
    row is filled with a device-to-device copy of ``rest``.  When there
    are spiking neurons, an int32 buffer of shape
    (spike_delay_steps, num_spike_neurons) is allocated zero-filled.
    Both current-position counters start at 0.

    Parameters
    ----------
    num_gpot_neurons : integer
        number of graded-potential neurons
    gpot_delay_steps : integer
        number of rows in the graded-potential buffer
    rest : GPU array
        source of the copy into each graded-potential row; only its
        ``gpudata`` and ``nbytes`` are used
    num_spike_neurons : integer
        number of spiking neurons
    spike_delay_steps : integer
        number of rows in the spike buffer
    """
    self.num_gpot_neurons = num_gpot_neurons
    if num_gpot_neurons > 0:
        self.dtype = np.double
        self.gpot_delay_steps = gpot_delay_steps
        self.gpot_buffer = parray.empty(
            (gpot_delay_steps, num_gpot_neurons), np.double)
        self.gpot_current = 0

        buf = self.gpot_buffer
        base_addr = int(buf.gpudata)
        # byte distance between two consecutive rows of the buffer
        row_stride = buf.ld * buf.dtype.itemsize
        for row in range(gpot_delay_steps):
            cuda.memcpy_dtod(base_addr + row * row_stride,
                             rest.gpudata, rest.nbytes)

    self.num_spike_neurons = num_spike_neurons
    if num_spike_neurons > 0:
        self.spike_delay_steps = spike_delay_steps
        self.spike_buffer = parray.zeros(
            (spike_delay_steps, num_spike_neurons), np.int32)
        self.spike_current = 0
def compute_G(self, Dswfilename, lamb=0.0):
    """
    Build the G matrix from a precomputed weighting matrix between RFs.

    The weighting matrix is read with ``read_file``, uploaded to the GPU
    and handed to the G kernel as loaded (no put-norm kernel is launched
    in this method).  The result is stored in ``self.d_G``; nothing is
    returned.

    Parameters
    ----------
    Dswfilename : str
        file generated by VTDM_prep
    lamb : float, optional
        smoothing parameter lambda; when non-zero the diag-add kernel is
        launched on ``self.d_G`` with this value
    """
    host_weights = read_file(Dswfilename)
    d_weights = parray.to_gpu(host_weights)
    del host_weights

    self.d_G = parray.empty((self.size, self.size), self.dtype)
    threads = (256, 1, 1)

    g_kernel = get_G_kernel(self.dtype, d_weights.dtype)
    launch_kernel(
        g_kernel, threads, (self.d_G.shape[0], 1),
        [self.d_G, self.d_G.ld, self.d_tk1, self.d_tk2, self.Wt,
         self.Mt, d_weights, d_weights.ld, self.d_neuron_ind],
        timed="G matrix")

    if lamb == 0:
        return

    diag_kernel = get_diag_add_kernel(self.dtype)
    n_blocks = 6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT
    launch_kernel(
        diag_kernel, threads, (n_blocks, 1),
        [self.d_G, self.d_G.ld, self.d_G.shape[0],
         self.dtype.type(lamb)])
def filter_image(self, image_input):
    """
    Apply all receptive-field filters to a single input image.

    The input dimensions should match the screen dimensions: numpy's
    resize does not make any checks, so the shape is asserted first.
    ``image_input`` is then resized IN PLACE to ``(1, self.size)``.
    Filters are generated in batches sized from three quarters of the
    free GPU memory (rounded down to an even count and capped at
    ``self.num_neurons``) and multiplied with the image via ``la.dot``.

    Parameters
    ----------
    image_input : ndarray
        2D input whose second dimension equals ``self.size``, or 3D
        input whose last two dimensions multiply to ``self.size``

    Returns
    -------
    The result of ``T()`` on the ``(num_neurons, 1)`` output array.
    """
    dims = image_input.shape
    if len(dims) == 2:
        # two-dimensional input
        assert dims[1] == self.size
    else:
        # three-dimensional input
        assert (dims[1] * dims[2] == self.size)

    # rasterize into a single row (mutates the caller's array)
    image_input.resize((1, self.size))

    d_image = parray.to_gpu(image_input)
    d_output = parray.empty((self.num_neurons, image_input.shape[0]),
                            self.dtype)

    free_bytes, _ = cuda.mem_get_info()
    batch = free_bytes // self.dtype.itemsize * 3 // 4 // self.size
    batch -= batch % 2
    batch = min(batch, self.num_neurons)
    self.ONE_TIME_FILTERS = batch

    handle = la.cublashandle()
    for first in np.arange(0, self.num_neurons, batch):
        count = min(batch, self.num_neurons - first)
        self.generate_filters(startbias=first, N_filters=count)
        la.dot(self.filters, d_image, opb='t',
               C=d_output[first: first + count], handle=handle)

    # release the filter buffer once every batch has been processed
    del self.filters
    return d_output.T()
def __init__(self, num_gpot_neurons, gpot_delay_steps, rest,
             num_spike_neurons, spike_delay_steps):
    """
    Allocate the delay buffers for graded-potential and spiking neurons.

    When there are graded-potential neurons, a double-precision buffer
    of shape (gpot_delay_steps, num_gpot_neurons) is allocated and every
    row is filled with a device-to-device copy of ``rest``.  When there
    are spiking neurons, an int32 buffer of shape
    (spike_delay_steps, num_spike_neurons) is allocated zero-filled.
    Both current-position counters start at 0.

    Parameters
    ----------
    num_gpot_neurons : integer
        number of graded-potential neurons
    gpot_delay_steps : integer
        number of rows in the graded-potential buffer
    rest : GPU array
        source of the copy into each graded-potential row; only its
        ``gpudata`` and ``nbytes`` are used
    num_spike_neurons : integer
        number of spiking neurons
    spike_delay_steps : integer
        number of rows in the spike buffer
    """
    self.num_gpot_neurons = num_gpot_neurons
    if num_gpot_neurons > 0:
        self.dtype = np.double
        self.gpot_delay_steps = gpot_delay_steps
        gpot_shape = (gpot_delay_steps, num_gpot_neurons)
        self.gpot_buffer = parray.empty(gpot_shape, np.double)
        self.gpot_current = 0

        first_row = int(self.gpot_buffer.gpudata)
        # byte distance between two consecutive rows of the buffer
        pitch_bytes = (self.gpot_buffer.ld
                       * self.gpot_buffer.dtype.itemsize)
        for step in range(gpot_delay_steps):
            cuda.memcpy_dtod(first_row + step * pitch_bytes,
                             rest.gpudata, rest.nbytes)

    self.num_spike_neurons = num_spike_neurons
    if num_spike_neurons > 0:
        self.spike_delay_steps = spike_delay_steps
        spike_shape = (spike_delay_steps, num_spike_neurons)
        self.spike_buffer = parray.zeros(spike_shape, np.int32)
        self.spike_current = 0
def compute_q(self):
    """
    Compute q.

    Allocates ``self.d_q`` with shape (self.size, 1) and launches the
    compute-q kernel on it.  Nothing is returned.
    """
    self.d_q = parray.empty((self.size, 1), self.dtype)

    q_kernel = get_compute_q_kernel(self.dtype)
    threads = (256, 1, 1)
    # six blocks per multiprocessor of the current device
    blocks = (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1)
    launch_kernel(
        q_kernel, threads, blocks,
        [self.d_q, self.d_tk1, self.d_tk2, self.d_neuron_ind,
         self.d_kappa, self.d_delta, self.d_bias, self.d_norm,
         self.size])
def rnn3(G, q, dt=1e-6, alpha=5000, steps=4000, XOUTPUTSTEPS=None):
    """
    Solving the decoding problem using a recurrent neural network.

    Parameters
    ----------
    G: PitchArray
        Must be real and positive semidefinite.
    q: PitchArray
        The measurements from spikes
    dt: float (optional)
        the time step in simulating the continuous network
    alpha: float (optional)
        scaling factor
    steps: int (optional)
        the number of steps to run the network
    XOUTPUTSTEPS: int (optional)
        The number of steps that are returned.
        If using default None, only return the final result.

    Return
    ------
    c: PitchArray
        The approximate solution to the decoding problem
    output: PitchArray (optional)
        If XOUTPUTSTEPS is not None, the full output specified

    Raises
    ------
    TypeError
        if G and q differ in dtype, if G is complex, or if either of
        them is not two-dimensional

    Notes
    -----
    Progress (report index, norm of y, elapsed milliseconds) is printed
    every ``steps // XOUTPUTSTEPS`` iterations.  The report at iteration
    0 is printed but not stored: snapshot k (k >= 1) goes to row k - 1
    of ``output``.
    """
    if G.dtype != q.dtype:
        raise TypeError("matrix multiplication must have same dtype")
    if np.iscomplexobj(G):
        raise TypeError("RNN currently only solves real types")
    if (len(G.shape) != 2) | (len(q.shape) != 2):
        raise TypeError("G, q must both be matrices")

    # Floor division keeps the reporting interval an integer regardless
    # of which division semantics are in effect.
    if XOUTPUTSTEPS is None:
        XOUTPUTSTEPS = min(20, steps)
        x_steps = steps // XOUTPUTSTEPS
        fullout = False
    else:
        fullout = True
        x_steps = steps // int(XOUTPUTSTEPS)
        output = parray.empty((XOUTPUTSTEPS, q.size), q.dtype)
    n_rows = int(XOUTPUTSTEPS)

    c = parray.zeros_like(q)
    update_func = get_rnn3_update_func(G.dtype)
    dt = float(dt)
    alpha = float(alpha)
    y = parray.empty_like(q)

    if y.dtype == np.float64:
        normfunc = cublasDnrm2
    else:
        normfunc = cublasSnrm2

    grid = (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1)
    handle = la.cublashandle()
    start = time.time()

    for i in range(0, steps + 1):
        Gc = la.dot(G, c, handle=handle)
        launch_kernel(update_func, (256, 1, 1), grid,
                      [c, dt * alpha, q, Gc, y, c.size, 1],
                      prepared=True)
        if i % x_steps == 0:
            report = i // x_steps
            ynorm = normfunc(handle.handle, y.size, y.gpudata, 1)
            # single pre-formatted argument: valid both as a print
            # statement and as a print() call
            print("%d, norm = %.10f, time=%f(ms)"
                  % (report, ynorm, (time.time() - start) * 1000))
            row = report - 1
            # Only copy into rows that exist.  Without this guard the
            # report at i == 0 targets row -1, i.e. device memory in
            # front of the buffer, and a non-divisible steps count can
            # run past its end.
            if fullout and 0 <= row < n_rows:
                cuda.memcpy_dtod(
                    int(output.gpudata)
                    + output.dtype.itemsize * output.ld * row,
                    c.gpudata, c.dtype.itemsize * c.size)

    if fullout:
        return c, output
    else:
        return c
filter_func = _get_gaussian_cylinder(dtype) # Constants S1 = 128 S2 = 128 PHOTORECEPTORS = 8 M_size = S1*S2 # same as grid[0].size N_filters = PHOTORECEPTORS RAD = 1 KAPPA = 20 SIGMA = 1 # or angle NTHREADS = (128, 1, 1) NBLOCKS = ((M_size-1) // NTHREADS[0] + 1, 1) d_filters = parray.empty((N_filters, M_size), dtype) grid = np.meshgrid(np.linspace(-1, 1, num=S1), np.linspace(-np.pi, np.pi, num=S2)) d_grid = [parray.to_gpu(grid[i].flatten()) for i in range(len(grid))] dxy = np.diff(grid[0][0, :2])*np.diff(grid[1][:2, 0])[0] ref_z = 2*random.rand(PHOTORECEPTORS)-1 # -1 to 1 d_refz = parray.to_gpu(ref_z) ref_theta = np.pi*random.rand(PHOTORECEPTORS) # half cylinder d_reftheta = parray.to_gpu(ref_theta) filter_func.prepared_call( NBLOCKS, NTHREADS, d_filters.gpudata,
def compute_dirich_space_fft(self, d_Ds, Mx, My, Px, Py, Sx, Sy, Wx, Wy):
    """
    Compute the spatial reconstruction functions with an inverse FFT.

    For each batch of filters the Ds2fft kernel fills a zeroed complex
    buffer from the dirichlet coefficients, a batched 2D inverse cuFFT
    of size (Py1, Px1) is applied in place, and the filter2rec kernel
    writes the result into the output.  Px1 and Py1 are the numbers of
    pixels that ``self.Sx`` and ``self.Sy`` span at the pixel pitch
    Sx / Px and Sy / Py.

    Parameters
    ----------
    d_Ds : PitchArray
        dirichlet coefficients
    Mx : integer
        Order in the x dimension
    My : integer
        Order in the y dimension
    Px : integer
        number of pixels of the output in the x dimension
    Py : integer
        number of pixels of the output in the y dimension
    Sx : float
        spatial extent of the output in x direction
    Sy : float
        spatial extent of the output in y direction
    Wx, Wy : float
        accepted for signature compatibility; not used by this method

    Returns
    -------
    PitchArray allocated with shape (num_neurons, Py, Px); float64 when
    ``self.dtype`` is complex128, float32 otherwise
    """
    import scikits.cuda.cufft as cufft

    dx = Sx / Px
    dy = Sy / Py
    Px1 = int(np.round(self.Sx / dx))
    Py1 = int(np.round(self.Sy / dy))

    if self.dtype == np.complex128:
        typef = np.dtype(np.float64)
        fft_type = cufft.CUFFT_Z2Z
        fftfunc = cufft.cufftExecZ2Z
    else:
        typef = np.dtype(np.float32)
        fft_type = cufft.CUFFT_C2C
        fftfunc = cufft.cufftExecC2C

    d_dirich = parray.empty((self.num_neurons, Py, Px), typef)

    freemem, totalmem = cuda.mem_get_info()
    # a quarter of the free memory, expressed in filters; floor division
    # keeps the arithmetic exact for large byte counts
    ONE_TIME_FILTER = int(min(
        freemem // (Px1 * Py1 * d_Ds.dtype.itemsize) // 4,
        self.num_neurons))

    n = np.asarray((Py1, Px1), np.int32)

    def make_plan(batch):
        # batched 2D plan over `batch` filters
        return cufft.cufftPlanMany(2, n.ctypes.data, None, 1, 0,
                                   None, 1, 0, fft_type, batch)

    plan = make_plan(ONE_TIME_FILTER)
    try:
        Ds2fftfun = get_Ds2fft_kernel(self.dtype)
        d_filter_complex = parray.empty((ONE_TIME_FILTER, Px1 * Py1),
                                        self.dtype)
        filter2recfun = get_filter2rec_kernel(self.dtype)

        BLOCK_SIZE = 16
        # number of BLOCK_SIZE x BLOCK_SIZE tiles covering Px x Py
        n_tiles = (((Px - 1) // BLOCK_SIZE + 1)
                   * ((Py - 1) // BLOCK_SIZE + 1))

        for i in range(0, self.num_neurons, ONE_TIME_FILTER):
            N_filters = min(ONE_TIME_FILTER, self.num_neurons - i)
            d_filter_complex.fill(0)
            launch_kernel(Ds2fftfun, (256, 1, 1),
                          (Mx * 2 + 1, My * 2 + 1),
                          [[d_Ds, i * d_Ds.ld], d_Ds.ld,
                           d_filter_complex, d_filter_complex.ld,
                           Mx, My, Px1, Py1, N_filters])

            if N_filters < ONE_TIME_FILTER:
                # last, smaller batch: the plan must match its size
                cufft.cufftDestroy(plan)
                plan = None
                plan = make_plan(N_filters)

            # be careful with the side-by-side constraint
            fftfunc(plan, int(d_filter_complex.gpudata),
                    int(d_filter_complex.gpudata), cufft.CUFFT_INVERSE)

            launch_kernel(filter2recfun, (BLOCK_SIZE, BLOCK_SIZE, 1),
                          (n_tiles, N_filters),
                          [[d_dirich, i * d_dirich.ld], d_dirich.ld,
                           d_filter_complex, d_filter_complex.ld,
                           N_filters, Px, Py, Px1, Py1])
    finally:
        # release the plan even when a batch fails
        if plan is not None:
            cufft.cufftDestroy(plan)
    return d_dirich
def encode(self, neural_inputs, startbias=0, avg_rate=0.1):
    """
    Encode with IAFs with random thresholds

    Parameters
    ----------
    neural_inputs : PitchArray or ndarray
        array of shape (num_samples, num_neurons) containing inputs to
        all neurons; a host ndarray is uploaded to the GPU first
    startbias : integer
        the neuron index corresponding to first column of neural_inputs
    avg_rate : float
        average spiking rate assumed for neurons; a buffer of
        ceil(num_samples / avg_rate) rows per neuron is allocated for
        storing spikes

    Returns
    -------
    spikes : ndarray of self.dtype
        stores the spikes for one neuron after another
    spike_count : ndarray of int32 of size num_neurons
        indicates the number of spikes generated by each neuron

    Raises
    ------
    ValueError
        if the number of input columns differs from ``self.num_neurons``,
        or if any neuron produced at least as many spikes as the buffer
        has rows

    Notes
    -----
    spikes for neuron j can be accessed by
    ::
        cum_count = np.concatenate((np.zeros(1,np.int32),np.cumsum(spike_count)))
        tk = spikes[cum_count[j]:cum_count[j+1]]
    """
    neuron_per_block = 64
    if self.num_neurons != neural_inputs.shape[1]:
        raise ValueError("input size should match number of neurons")

    Ntimesteps = neural_inputs.shape[0]
    # rows of the per-neuron spike buffer
    max_spikes = int(np.ceil(Ntimesteps / avg_rate))

    d_spikecount = parray.empty((1, self.num_neurons), np.int32)
    # The spike buffer is pre-filled with standard-normal samples before
    # the kernel runs (presumably the source of the random thresholds --
    # confirm against the kernel).
    randnum = np.random.normal(
        size=(max_spikes, self.num_neurons)).astype(self.dtype)
    d_spike = parray.to_gpu(randnum)

    if isinstance(neural_inputs, np.ndarray):
        d_neural_inputs = parray.to_gpu(neural_inputs)
    else:
        d_neural_inputs = neural_inputs

    # Launch with the device-side array: passing the host ndarray would
    # fail on its missing ``ld`` attribute.
    launch_kernel(
        self.func, (neuron_per_block, 1, 1),
        (int(np.ceil(np.float64(self.num_neurons) / neuron_per_block)), 1),
        [d_neural_inputs, d_neural_inputs.ld, self.num_neurons,
         Ntimesteps, d_spike, d_spike.ld,
         [self.d_v0, startbias], [self.d_kappa, startbias],
         [self.d_bias, startbias], [self.d_delta, startbias],
         [self.d_time_count, startbias],
         d_spikecount, max_spikes, self.dt,
         [self.d_delta_value, startbias], [self.d_sigma, startbias]],
        shared=self.dtype.itemsize * neuron_per_block)

    spike_count = d_spikecount.get()
    spike_count.resize((self.num_neurons,))

    if spike_count.max() >= max_spikes:
        raise ValueError("number of spikes exceeded the limit of buffer")

    spike = rearrange_spikes(d_spike, spike_count, self.num_neurons)
    return spike, spike_count
def rnn3(G, q, dt=1e-6, alpha=5000, steps=4000, XOUTPUTSTEPS=None):
    """
    Solving the decoding problem using a recurrent neural network.

    Parameters
    ----------
    G: PitchArray
        Must be real and positive semidefinite.
    q: PitchArray
        The measurements from spikes
    dt: float (optional)
        the time step in simulating the continuous network
    alpha: float (optional)
        scaling factor
    steps: int (optional)
        the number of steps to run the network
    XOUTPUTSTEPS: int (optional)
        The number of steps that are returned.
        If using default None, only return the final result.

    Return
    ------
    c: PitchArray
        The approximate solution to the decoding problem
    output: PitchArray (optional)
        If XOUTPUTSTEPS is not None, the full output specified

    Raises
    ------
    TypeError
        if G and q differ in dtype, if G is complex, or if either of
        them is not two-dimensional

    Notes
    -----
    Progress (report index, norm of y, elapsed milliseconds) is printed
    every ``steps // XOUTPUTSTEPS`` iterations.  The report at iteration
    0 is printed but not stored: snapshot k (k >= 1) goes to row k - 1
    of ``output``.
    """
    if G.dtype != q.dtype:
        raise TypeError("matrix multiplication must have same dtype")
    if np.iscomplexobj(G):
        raise TypeError("RNN currently only solves real types")
    if (len(G.shape) != 2) | (len(q.shape) != 2):
        raise TypeError("G, q must both be matrices")

    fullout = XOUTPUTSTEPS is not None
    # Floor division keeps the reporting interval an integer regardless
    # of which division semantics are in effect.
    if fullout:
        x_steps = steps // int(XOUTPUTSTEPS)
        output = parray.empty((XOUTPUTSTEPS, q.size), q.dtype)
    else:
        XOUTPUTSTEPS = min(20, steps)
        x_steps = steps // XOUTPUTSTEPS
    n_rows = int(XOUTPUTSTEPS)

    c = parray.zeros_like(q)
    update_func = get_rnn3_update_func(G.dtype)
    dt = float(dt)
    alpha = float(alpha)
    y = parray.empty_like(q)

    if y.dtype == np.float64:
        normfunc = cublasDnrm2
    else:
        normfunc = cublasSnrm2

    grid = (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1)
    handle = la.cublashandle()
    start = time.time()

    for i in range(0, steps + 1):
        Gc = la.dot(G, c, handle=handle)
        launch_kernel(update_func, (256, 1, 1), grid,
                      [c, dt * alpha, q, Gc, y, c.size, 1],
                      prepared=True)
        if i % x_steps != 0:
            continue

        report = i // x_steps
        ynorm = normfunc(handle.handle, y.size, y.gpudata, 1)
        # single pre-formatted argument: valid both as a print statement
        # and as a print() call
        print("%d, norm = %.10f, time=%f(ms)"
              % (report, ynorm, (time.time() - start) * 1000))

        row = report - 1
        # Only copy into rows that exist.  Without this guard the report
        # at i == 0 targets row -1, i.e. device memory in front of the
        # buffer, and a non-divisible steps count can run past its end.
        if fullout and 0 <= row < n_rows:
            cuda.memcpy_dtod(
                int(output.gpudata)
                + output.dtype.itemsize * output.ld * row,
                c.gpudata, c.dtype.itemsize * c.size)

    if fullout:
        return c, output
    else:
        return c