def prepare_for_train(self, data, label):
    assert len(data.shape) == 4
    if data.shape[3] != self.batchSize:
        self.batchSize = data.shape[3]
        for l in self.layers:
            l.change_batch_size(self.batchSize)
        self.inputShapes = None
        self.imgShapes = None
        self.outputs = []
        self.grads = []
        self.local_outputs = []
        self.local_grads = []

        self.imgShapes = [(self.numColor, self.imgSize / 2, self.imgSize / 2, self.batchSize)]
        self.inputShapes = [(self.numColor * (self.imgSize ** 2) / 4, self.batchSize)]
        fc = False
        for layer in self.layers:
            outputShape = layer.get_output_shape()
            row = outputShape[0] * outputShape[1] * outputShape[2]
            col = outputShape[3]
            if layer.type == 'softmax':
                row *= comm.Get_size()
                outputShape = (outputShape[0] * comm.Get_size(), 1, 1, outputShape[3])
            self.inputShapes.append((row, col))
            self.imgShapes.append(outputShape)

            area = make_area(outputShape)
            self.outputs.append(virtual_array(rank, area=area))
            self.local_outputs.append(gpuarray.zeros((row, col), dtype=np.float32))
            inputShape = self.inputShapes[-2]
            #if layer.type == 'fc':
            #    inputShape = (inputShape[0] * comm.Get_size(), inputShape[1])
            #    self.local_grads.append(gpuarray.zeros(inputShape, dtype=np.float32))
            #    area = make_plain_area(inputShape)
            #else:
            #    self.local_grads.append(gpuarray.zeros(inputShape, dtype=np.float32))
            #    area = make_area(self.imgShapes[-2])
            #self.grads.append(virtual_array(rank, area=area))

    area = make_area((self.numColor, self.imgSize / 2, self.imgSize / 2, self.batchSize))
    self.data = virtual_array(rank, local=gpuarray.to_gpu(data[area.to_slice()]), area=area)
    if not isinstance(label, GPUArray):
        self.label = gpuarray.to_gpu(label).astype(np.float32)
    else:
        self.label = label
    self.label = self.label.reshape((label.size, 1))
    self.numCase += data.shape[3]  # batch dimension is axis 3
    outputShape = self.inputShapes[-1]
    if self.output is None or self.output.shape != outputShape:
        self.output = gpuarray.zeros(outputShape, dtype=np.float32)

def riemanntheta_high_dim(X, Yinv, T, z, g, rad, max_points=10000000):
    #initialize parRiemann
    parRiemann = RiemannThetaCuda(1, 512)
    parRiemann.compile(g)
    parRiemann.cache_omega_real(X)
    parRiemann.cache_omega_imag(Yinv, T)
    #compile the box_points program
    point_finder = func1()
    R = get_rad(T, rad)
    print R
    num_int_points = (2*R + 1)**g
    num_partitions = num_int_points // max_points
    num_final_partition = num_int_points - num_partitions*max_points
    osc_part = 0 + 0*1.j
    if (num_partitions > 0):
        S = gpuarray.zeros(np.int(max_points * g), dtype=np.double)
        print "Required number of iterations"
        print num_partitions
        print
        for p in range(num_partitions):
            print p
            print
            S = box_points(point_finder, max_points*p, max_points*(p+1), g, R, S)
            parRiemann.cache_intpoints(S, gpu_already=True)
            osc_part += parRiemann.compute_v_without_derivs(np.array([z]))
    S = gpuarray.zeros(np.int((num_int_points - num_partitions*max_points)*g), dtype=np.double)
    print num_partitions*max_points, num_int_points
    S = box_points(point_finder, num_partitions*max_points, num_int_points, g, R, S)
    parRiemann.cache_intpoints(S, gpu_already=True)
    osc_part += parRiemann.compute_v_without_derivs(np.array([z]))
    print osc_part
    return osc_part

def compute_v_without_derivs(self, Xs, Yinvs, Ts):
    #Turn the parts of omega into gpuarrays
    Xs = np.require(Xs, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    Yinvs = np.require(Yinvs, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    Ts = np.require(Ts, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    Xs_d = gpuarray.to_gpu(Xs)
    Yinvs_d = gpuarray.to_gpu(Yinvs)
    Ts_d = gpuarray.to_gpu(Ts)
    #Determine N = the number of integer points to sum over and
    #          K = the number of different omegas to compute the function at
    N = self.Sd.size / self.g
    K = Xs.size / (self.g**2)
    #Create room on the gpu for the real and imaginary finite sum calculations
    fsum_reald = gpuarray.zeros(N*K, dtype=np.double)
    fsum_imagd = gpuarray.zeros(N*K, dtype=np.double)
    #Turn all scalars into numpy data types
    Nd = np.int32(N)
    Kd = np.int32(K)
    gd = np.int32(self.g)
    blocksize = (self.tilewidth, self.tileheight, 1)
    gridsize = (N // self.tilewidth + 1, K // self.tileheight + 1, 1)
    self.finite_sum_without_derivs(fsum_reald, fsum_imagd, Xs_d, Yinvs_d, Ts_d,
                                   self.Sd, gd, Nd, Kd,
                                   block=blocksize, grid=gridsize)
    cuda.Context.synchronize()
    fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd)
    fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd)
    return fsums_real + 1.0j*fsums_imag

def get_next_batch(self, batch_size):
    if self._reader is None:
        self._start_read()
    if self._gpu_batch is None:
        self._fill_reserved_data()

    height, width = self._gpu_batch.data.shape
    gpu_data = self._gpu_batch.data
    gpu_labels = self._gpu_batch.labels

    if self.index + batch_size >= width:
        width = width - self.index
        labels = gpu_labels[self.index:self.index + batch_size]
        #data = gpu_data[:, self.index:self.index + batch_size]
        data = gpuarray.zeros((height, width), dtype=np.float32)
        gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + width)
        self.index = 0
        self._fill_reserved_data()
    else:
        labels = gpu_labels[self.index:self.index + batch_size]
        #data = gpu_data[:, self.index:self.index + batch_size]
        data = gpuarray.zeros((height, batch_size), dtype=np.float32)
        gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + batch_size)
        #labels = gpu_labels[self.index:self.index + batch_size]
        self.index += batch_size
    return BatchData(data, labels, self._gpu_batch.epoch)

def _initialize_gpu_ds(self):
    """
    Setup GPU arrays.
    """
    self.synapse_state = garray.zeros(int(self.total_synapses) +
                                      len(self.input_neuron_list), np.float64)

    if self.my_num_gpot_neurons > 0:
        self.V = garray.zeros(int(self.my_num_gpot_neurons), np.float64)
    else:
        self.V = None

    if self.my_num_spike_neurons > 0:
        self.spike_state = garray.zeros(int(self.my_num_spike_neurons), np.int32)

    if len(self.public_gpot_list) > 0:
        self.public_gpot_list_g = garray.to_gpu(self.public_gpot_list)
        self.projection_gpot = garray.zeros(len(self.public_gpot_list), np.double)
        self._extract_gpot = self._extract_projection_gpot_func()

    if len(self.public_spike_list) > 0:
        self.public_spike_list_g = garray.to_gpu(
            (self.public_spike_list - self.spike_shift).astype(np.int32))
        self.projection_spike = garray.zeros(len(self.public_spike_list), np.int32)
        self._extract_spike = self._extract_projection_spike_func()

def update_ptrs(self):
    self.tps_param_ptrs = get_gpu_ptrs(self.tps_params)
    self.trans_d_ptrs = get_gpu_ptrs(self.trans_d)
    self.lin_dd_ptrs = get_gpu_ptrs(self.lin_dd)
    self.w_nd_ptrs = get_gpu_ptrs(self.w_nd)

    for b in self.bend_coefs:
        self.proj_mat_ptrs[b] = get_gpu_ptrs(self.proj_mats[b])
        self.offset_mat_ptrs[b] = get_gpu_ptrs(self.offset_mats[b])

    self.pt_ptrs = get_gpu_ptrs(self.pts)
    self.kernel_ptrs = get_gpu_ptrs(self.kernels)
    self.pt_w_ptrs = get_gpu_ptrs(self.pts_w)
    self.pt_t_ptrs = get_gpu_ptrs(self.pts_t)
    self.corr_cm_ptrs = get_gpu_ptrs(self.corr_cm)
    self.corr_rm_ptrs = get_gpu_ptrs(self.corr_rm)
    self.r_coef_ptrs = get_gpu_ptrs(self.r_coefs)
    self.c_coef_rn_ptrs = get_gpu_ptrs(self.c_coefs_rn)
    self.c_coef_cn_ptrs = get_gpu_ptrs(self.c_coefs_cn)

    # temporary space for warping cost computations
    self.warp_err = gpuarray.zeros((self.N, MAX_CLD_SIZE), np.float32)
    self.bend_res_mat = gpuarray.zeros((DATA_DIM * self.N, DATA_DIM), np.float32)
    self.bend_res = [self.bend_res_mat[i * DATA_DIM:(i + 1) * DATA_DIM]
                     for i in range(self.N)]
    self.bend_res_ptrs = get_gpu_ptrs(self.bend_res)

    self.dims_gpu = gpuarray.to_gpu(np.array(self.dims, dtype=np.int32))
    self.ptrs_valid = True

def compute_v_without_derivs(self, Z):
    #Turn the numpy set Z into gpuarrays
    x = Z.real
    y = Z.imag
    x = np.require(x, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    y = np.require(y, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    xd = gpuarray.to_gpu(x)
    yd = gpuarray.to_gpu(y)
    self.yd = yd
    #Determine N = the number of integer points to sum over and
    #          K = the number of values to compute the function at
    N = self.Sd.size / self.g
    K = Z.size / self.g
    #Create room on the gpu for the real and imaginary finite sum calculations
    fsum_reald = gpuarray.zeros(N*K, dtype=np.double)
    fsum_imagd = gpuarray.zeros(N*K, dtype=np.double)
    #Make all scalars into numpy data types
    Nd = np.int32(N)
    Kd = np.int32(K)
    gd = np.int32(self.g)
    blocksize = (self.tilewidth, self.tileheight, 1)
    gridsize = (N // self.tilewidth + 1, K // self.tileheight + 1, 1)
    self.finite_sum_without_derivs(fsum_reald, fsum_imagd, xd, yd,
                                   self.Sd, gd, Nd, Kd,
                                   block=blocksize, grid=gridsize)
    cuda.Context.synchronize()
    fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd)
    fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd)
    return fsums_real + 1.0j*fsums_imag

def logreg_cost(self, label, output):
    if self.cost.shape[0] != self.batchSize:
        self.cost = gpuarray.zeros((self.batchSize, 1), dtype=np.float32)
    maxid = gpuarray.zeros((self.batchSize, 1), dtype=np.float32)
    find_col_max_id(maxid, output)
    self.batchCorrect = same_reduce(label, maxid)
    logreg_cost_col_reduce(output, label, self.cost)

def _initialize_gpu_ds(self):
    """
    Setup GPU arrays.
    """
    self.synapse_state = garray.zeros(
        max(int(self.total_synapses) + len(self.input_neuron_list), 1),
        np.float64)

    if self.total_num_gpot_neurons > 0:
        self.V = garray.zeros(int(self.total_num_gpot_neurons), np.float64)
    else:
        self.V = None

    if self.total_num_spike_neurons > 0:
        self.spike_state = garray.zeros(int(self.total_num_spike_neurons), np.int32)

    self.block_extract = (256, 1, 1)
    if len(self.out_ports_ids_gpot) > 0:
        self.out_ports_ids_gpot_g = garray.to_gpu(self.out_ports_ids_gpot)
        self.sel_out_gpot_ids_g = garray.to_gpu(self.sel_out_gpot_ids)
        self._extract_gpot = self._extract_projection_gpot_func()

    if len(self.out_ports_ids_spk) > 0:
        self.out_ports_ids_spk_g = garray.to_gpu(
            (self.out_ports_ids_spk - self.spike_shift).astype(np.int32))
        self.sel_out_spk_ids_g = garray.to_gpu(self.sel_out_spk_ids)
        self._extract_spike = self._extract_projection_spike_func()

    if self.ports_in_gpot_mem_ind is not None:
        inds = self.sel_in_gpot_ids
        self.inds_gpot = garray.to_gpu(inds)

    if self.ports_in_spk_mem_ind is not None:
        inds = self.sel_in_spk_ids
        self.inds_spike = garray.to_gpu(inds)

def __init__(self, s_dict, synapse_state, dt, debug=False):
    self.debug = debug
    self.dt = dt
    self.num = len(s_dict['id'])

    self.pre = garray.to_gpu(np.asarray(s_dict['pre'], dtype=np.int32))
    self.ar = garray.to_gpu(np.asarray(s_dict['ar'], dtype=np.float64))
    self.ad = garray.to_gpu(np.asarray(s_dict['ad'], dtype=np.float64))
    self.gmax = garray.to_gpu(np.asarray(s_dict['gmax'], dtype=np.float64))
    self.a0 = garray.zeros((self.num,), dtype=np.float64)
    self.a1 = garray.zeros((self.num,), dtype=np.float64)
    self.a2 = garray.zeros((self.num,), dtype=np.float64)
    self.cond = synapse_state

    _num_dendrite_cond = np.asarray(
        [s_dict['num_dendrites_cond'][i] for i in s_dict['id']],
        dtype=np.int32).flatten()
    _num_dendrite = np.asarray(
        [s_dict['num_dendrites_I'][i] for i in s_dict['id']],
        dtype=np.int32).flatten()

    self._cum_num_dendrite = garray.to_gpu(_0_cumsum(_num_dendrite))
    self._cum_num_dendrite_cond = garray.to_gpu(_0_cumsum(_num_dendrite_cond))
    self._num_dendrite = garray.to_gpu(_num_dendrite)
    self._num_dendrite_cond = garray.to_gpu(_num_dendrite_cond)
    self._pre = garray.to_gpu(np.asarray(s_dict['I_pre'], dtype=np.int32))
    self._cond_pre = garray.to_gpu(np.asarray(s_dict['cond_pre'], dtype=np.int32))
    self._V_rev = garray.to_gpu(np.asarray(s_dict['reverse'], dtype=np.double))

    self.I = garray.zeros(self.num, np.double)
    #self._update_I_cond = self._get_update_I_cond_func()
    self._update_I_non_cond = self._get_update_I_non_cond_func()
    self.update = self._get_gpu_kernel()

def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
                   trange, min_qwidth, qrange, min_bin_content=10,
                   time_only=True):
    """Setup GPU arrays to compute PDF values for the given event.

    The pdf_eval calculation allows the PDF to be evaluated at a single
    point for each channel as the Monte Carlo is run.  The effective
    bin size will be as small as (`min_twidth`, `min_qwidth`) around the
    point of interest, but will be large enough to ensure that
    `min_bin_content` Monte Carlo events fall into the bin.

        event_hit: ndarray
            Hit or not-hit status for each channel in the detector.
        event_time: ndarray
            Hit time for each channel in the detector.  If channel
            not hit, the time will be ignored.
        event_charge: ndarray
            Integrated charge for each channel in the detector.
            If channel not hit, the charge will be ignored.

        min_twidth: float
            Minimum bin size in the time dimension
        trange: (float, float)
            Range of time dimension in PDF
        min_qwidth: float
            Minimum bin size in charge dimension
        qrange: (float, float)
            Range of charge dimension in PDF
        min_bin_content: int
            The bin will be expanded to include at least this many events
        time_only: bool
            If True, only the time observable will be used in the PDF.
    """
    self.event_nhit = count_nonzero(event_hit)

    # Define a mapping from an array of len(event_hit) to an array of
    # length event_nhit
    self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32)
    self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id)
    self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
    self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset)

    self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
    self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
    self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

    self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
    self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
    self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content,
                                   dtype=np.float32)
    self.nearest_mc_gpu.fill(1e9)

    self.min_twidth = min_twidth
    self.trange = trange
    self.min_qwidth = min_qwidth
    self.qrange = qrange
    self.min_bin_content = min_bin_content

    assert time_only  # Only support time right now
    self.time_only = time_only

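# The adaptive binning described in the docstring above can be sketched on the
# CPU: grow a window around the evaluation point until it holds at least
# min_bin_content samples, but never let it shrink below min_twidth. This is a
# hedged illustration of the idea (function name and shape are mine, not the
# GPU implementation used here):
import numpy as np

def adaptive_bin_width(samples, center, min_width, min_content):
    # distances of Monte Carlo samples from the point of interest
    d = np.sort(np.abs(np.asarray(samples) - center))
    if len(d) < min_content:
        # not enough samples: take everything, but respect the minimum width
        return max(2 * d[-1], min_width) if len(d) else min_width
    # full width that captures exactly the min_content nearest samples
    return max(2 * d[min_content - 1], min_width)
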
def fprop(self, input, output):
    max = gpuarray.zeros((1, self.batchSize), dtype=np.float32)
    col_max_reduce(max, input)
    add_vec_to_cols(input, max, output, alpha=-1)
    gpu_copy_to(cumath.exp(output), output)
    sum = gpuarray.zeros(max.shape, dtype=np.float32)
    add_col_sum_to_vec(sum, output, alpha=0)
    div_vec_to_cols(output, sum)

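# The kernel sequence above (and the similar fprop variant further below)
# implements the standard numerically stable softmax: subtract the per-column
# max before exponentiating so exp() never overflows. A minimal NumPy sketch
# of the same computation, illustrative only (the GPU version works
# column-wise on (features, batch) data):
import numpy as np

def softmax_columns(x):
    # x has shape (features, batch); each column is one sample
    shifted = x - x.max(axis=0, keepdims=True)   # subtract column max
    e = np.exp(shifted)
    return e / e.sum(axis=0, keepdims=True)      # normalize each column
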
def createHashTable(kd, vd, capacity):
    table_capacity_gpu, _ = mod.get_global('table_capacity')
    cuda.memcpy_htod(table_capacity_gpu, np.uint([capacity]))
    # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_capacity,
    #                                   &capacity,
    #                                   sizeof(unsigned int)));

    table_vals_gpu, table_vals_size = mod.get_global('table_values')  # pointer-2-pointer
    values_gpu = gpuarray.zeros((capacity*vd, 1), dtype=np.float32)
    cuda.memcpy_dtod(table_vals_gpu, values_gpu.gpudata, table_vals_size)
    # float *values;
    # allocateCudaMemory((void**)&values, capacity*vd*sizeof(float));
    # CUDA_SAFE_CALL(cudaMemset((void *)values, 0, capacity*vd*sizeof(float)));
    # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_values,
    #                                   &values,
    #                                   sizeof(float *)));

    table_entries, table_entries_size = mod.get_global('table_entries')
    entries_gpu = gpuarray.empty((capacity*2, 1), dtype=np.int)
    entries_gpu.fill(-1)
    cuda.memcpy_dtod(table_entries, entries_gpu.gpudata, table_entries_size)
    # int *entries;
    # allocateCudaMemory((void **)&entries, capacity*2*sizeof(int));
    # CUDA_SAFE_CALL(cudaMemset((void *)entries, -1, capacity*2*sizeof(int)));
    # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_entries,
    #                                   &entries,
    #                                   sizeof(unsigned int *)));

    ########################################
    # Assuming LINEAR_D_MEMORY not defined #
    ########################################
    # #ifdef LINEAR_D_MEMORY
    # char *ranks;
    # allocateCudaMemory((void**)&ranks, capacity*sizeof(char));
    # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_rank,
    #                                   &ranks,
    #                                   sizeof(char *)));
    #
    # signed short *zeros;
    # allocateCudaMemory((void**)&zeros, capacity*sizeof(signed short));
    # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_zeros,
    #                                   &zeros,
    #                                   sizeof(char *)));
    # #else

    table_keys_gpu, table_keys_size = mod.get_global('table_keys')
    keys_gpu = gpuarray.zeros((capacity*kd, 1), dtype=np.short)
    cuda.memcpy_dtod(table_keys_gpu, keys_gpu.gpudata, table_keys_size)

def logreg_cost_multiview(self, label, output, num_view):
    unit = self.batch_size / num_view
    if self.cost.shape[0] != unit:
        self.cost = gpuarray.zeros((unit, 1), dtype=np.float32)
    maxid = gpuarray.zeros((self.batch_size, 1), dtype=np.float32)
    find_col_max_id(maxid, output)
    self.batchCorrect = same_reduce_multiview(label, maxid, num_view)
    tmp = gpuarray.zeros((output.shape[0], unit), dtype=np.float32)
    gpu_partial_copy_to(output, tmp, 0, output.shape[0], 0, unit)
    logreg_cost_col_reduce(tmp, label, self.cost)

def fprop(self, input, output, train=TRAIN):
    max = gpuarray.zeros((1, self.batchSize), dtype=np.float32)
    col_max_reduce(max, input)
    add_vec_to_cols(input, max, output, alpha=-1)
    eltwise_exp(output)
    sum = gpuarray.zeros(max.shape, dtype=np.float32)
    add_col_sum_to_vec(sum, output, alpha=0)
    div_vec_to_cols(output, sum)

    if PFout:
        print_matrix(output, self.name)

def __init__(self, A1, A2, left, use_batch=False):
    """Creates a new LinearOperator interface to the superoperator E.

    This is a wrapper to be used with SciPy's sparse linear algebra routines.

    Parameters
    ----------
    A1 : ndarray
        Ket parameter tensor.
    A2 : ndarray
        Bra parameter tensor.
    left : bool
        Whether to multiply with a vector to the left (or to the right).
    """
    self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
    self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
    self.tmp = list(map(garr.empty_like, self.A1G[0]))
    self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

    self.use_batch = use_batch
    self.left = left
    self.D = A1[0].shape[1]
    self.shape = (self.D**2, self.D**2)
    self.dtype = sp.dtype(A1[0][0].dtype)
    self.calls = 0
    self.out = garr.empty((self.D, self.D), dtype=self.dtype)
    self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

    if use_batch:
        self.A1G_p = list(map(get_batch_ptrs, self.A1G))
        self.A2G_p = list(map(get_batch_ptrs, self.A2G))
        self.tmp_p = get_batch_ptrs(self.tmp)
        self.tmp2_p = get_batch_ptrs(self.tmp2)
        self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
        self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
    else:
        self.A1G_p = None
        self.A2G_p = None
        self.tmp_p = None
        self.tmp2_p = None
        self.xG_p = None
        self.out_p = None

    self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
    self.ones = [one.fill(1) for one in self.ones]
    self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]

    self.streams = []
    for s in range(A1[0].shape[0]):
        self.streams.append(cd.Stream())

    self.hdl = cb.cublasCreate()

def get_next_batch(self, batch_size):
    if self._reader is None:
        self._start_read()
    if self._gpu_batch is None:
        self._fill_reserved_data()

    if not self.multiview:
        height, width = self._gpu_batch.data.shape
        gpu_data = self._gpu_batch.data
        gpu_labels = self._gpu_batch.labels
        epoch = self._gpu_batch.epoch

        if self.index + batch_size >= width:
            width = width - self.index
            labels = gpu_labels[self.index:self.index + batch_size]
            data = gpuarray.zeros((height, width), dtype=np.float32)
            gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + width)
            self.index = 0
            self._fill_reserved_data()
        else:
            labels = gpu_labels[self.index:self.index + batch_size]
            data = gpuarray.zeros((height, batch_size), dtype=np.float32)
            gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + batch_size)
            self.index += batch_size
    else:
        # multiview provider
        # number of views should be 10
        # when using multiview, do not pre-move data and labels to gpu
        height, width = self._cpu_batch.data.shape
        cpu_data = self._cpu_batch.data
        cpu_labels = self._cpu_batch.labels
        epoch = self._cpu_batch.epoch

        width /= self.num_view

        if self.index + batch_size >= width:
            batch_size = width - self.index

        labels = cpu_labels[self.index:self.index + batch_size]
        data = np.zeros((height, batch_size * self.num_view), dtype=np.float32)
        for i in range(self.num_view):
            data[:, i * batch_size:(i + 1) * batch_size] = \
                cpu_data[:, self.index + width * i:self.index + width * i + batch_size]

        data = copy_to_gpu(np.require(data, requirements='C'))
        labels = copy_to_gpu(np.require(labels, requirements='C'))

        self.index = (self.index + batch_size) / width
    #util.log_info('Batch: %s %s %s', data.shape, gpu_labels.shape, labels.shape)
    return BatchData(data, labels, epoch)

def cuda_hogbom(gpu_dirty, gpu_dpsf, gpu_cpsf, thresh=0.2, damp=1, gain=0.1, prefix='test'):
    """
    Use CUDA to implement the Hogbom CLEAN algorithm

    A nice description of the algorithm is given by the NRAO, here:
    http://www.cv.nrao.edu/~abridle/deconvol/node8.html

    Parameters:
    * dirty: The dirty image (2D numpy array)
    * dpsf: The dirty beam psf (2D numpy array)
    * thresh: User-defined threshold to stop iteration, as a fraction of the max pixel intensity (float)
    * damp: The damping factor to scale the dirty beam by
    * prefix: prefix for output image file names
    """
    height, width = np.shape(gpu_dirty)

    ## Grid parameters - #improvable#
    tsize = 8
    blocksize = (int(tsize), int(tsize), 1)           # The number of threads per block (x,y,z)
    gridsize = (int(width/tsize), int(height/tsize))  # The number of thread blocks (x,y)

    ## Setup clean image and point source model
    gpu_pmodel = gpu.zeros([height, width], dtype=np.float32)
    gpu_clean = gpu.zeros([height, width], dtype=np.float32)
    ## Setup GPU constants
    gpu_max_id = gpu.to_gpu(np.int32(0))
    imax = gpu_getmax(gpu_dirty)
    thresh_val = np.float32(thresh*imax)

    ## Steps 1-3 - Iterate until threshold has been reached
    t_start = time.time()
    i = 0
    while abs(imax) > (thresh_val):
        if (np.mod(i, 100) == 0):
            print "Hogbom iteration", i
        ## Step 1 - Find max
        find_max_kernel(gpu_dirty, gpu_max_id, imax, np.int32(width), np.int32(height),
                        gpu_pmodel, block=blocksize, grid=gridsize)
        ## Step 2 - Subtract the beam (assume that it is normalized to have max 1)
        ##          This kernel simultaneously reconstructs the CLEANed image.
        if PLOTME:
            print "Subtracting dirty beam " + str(i) + ", maxval=%0.8f" % imax + \
                  ' at x=' + str(gpu_max_id.get() % width) + \
                  ', y=' + str(gpu_max_id.get() / width)
        sub_beam_kernel(gpu_dirty, gpu_dpsf, gpu_max_id, gpu_clean, gpu_cpsf,
                        np.float32(gain*imax), np.int32(width), np.int32(height),
                        block=blocksize, grid=gridsize)
        i += 1
        ## Step 3 - Find maximum value using gpuarray
        imax = gpu_getmax(gpu_dirty)
    t_end = time.time()
    t_full = t_end - t_start
    print "Hogbom execution time %0.5f" % t_full + ' s'
    print "\t%0.5f" % (t_full/i) + ' s per iteration'

    ## Step 4 - Add the residuals back in
    add_noise_kernel(gpu_dirty, gpu_clean, np.float32(width + height))
    return gpu_dirty, gpu_pmodel, gpu_clean

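# For reference, the serial Hogbom CLEAN loop that the kernels above
# parallelize fits in a few lines of NumPy. This is a sketch under the same
# assumptions (psf normalized to peak 1, loop gain `gain`, psf the same shape
# as the dirty image and centered; edge wrap-around via np.roll is a
# simplification), not the CUDA implementation:
import numpy as np

def hogbom_clean_cpu(dirty, psf, thresh=0.2, gain=0.1, max_iter=10000):
    dirty = dirty.copy()
    model = np.zeros_like(dirty)
    limit = thresh * np.abs(dirty).max()
    for _ in range(max_iter):
        # Step 1 - find the peak of the residual
        y, x = np.unravel_index(np.argmax(np.abs(dirty)), dirty.shape)
        peak = dirty[y, x]
        if abs(peak) <= limit:
            break
        # Step 2 - record a point-source component and subtract the beam
        model[y, x] += gain * peak
        dirty -= gain * peak * np.roll(np.roll(psf, y - psf.shape[0] // 2, 0),
                                       x - psf.shape[1] // 2, 1)
    return dirty, model
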
def __init__(self, **params):
    # Hack-ish way to avoid initialisation until the weights are transferred:
    should_apply = self.apply_output_fns_init
    params['apply_output_fns_init'] = False
    super(GPUSparseCFProjection, self).__init__(**params)

    # Transferring the weights:
    self.pycuda_stream = cuda.Stream()
    self.weights_gpu = cusparse.CSR.to_CSR(self.weights.toSparseArray().transpose())

    # Getting the row and column indices for the *transposed* matrix.
    # Used for Hebbian learning and normalisation:
    nzcols, nzrows = self.weights.nonzero()
    tups = sorted(zip(nzrows, nzcols))
    nzrows = [x[0] for x in tups]
    nzcols = [x[1] for x in tups]

    # Allocating a page-locked piece of memory for the activity so that the
    # GPU can transfer data to the main memory without the involvement of
    # the CPU:
    self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
    self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

    self.input_buffer_pagelocked = cuda.pagelocked_empty(
        shape=(self.weights_gpu.shape[1],), dtype=np.float32,
        mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
    self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1],), dtype=np.float32)

    self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

    # Getting them on the GPU:
    self.nzcount = self.weights.getnnz()
    self.nzrows_gpu = gpuarray.to_gpu(np.array(nzrows, np.int32))
    self.nzcols_gpu = gpuarray.to_gpu(np.array(nzcols, np.int32))

    # Helper array for normalization:
    self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))

    # Kernel that applies the normalisation:
    self.normalize_kernel = ElementwiseKernel(
        "int *nzrows, float *norm_total, float *weights",
        "weights[i] *= norm_total[nzrows[i]]",
        "divisive_normalize")

    # Kernel that calculates the learning:
    self.hebbian_kernel = ElementwiseKernel(
        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
        "hebbian_learning")

    params['apply_output_fns_init'] = should_apply
    self.apply_output_fns_init = should_apply
    if self.apply_output_fns_init:
        self.apply_learn_output_fns()

def __init__(self, p, A1, A2, l=None, r=None, left=False, pseudo=True, use_batch=False):
    assert not (pseudo and (l is None or r is None)), 'For pseudo-inverse l and r must be set!'

    self.use_batch = use_batch
    self.p = p
    self.left = left
    self.pseudo = pseudo

    self.D = A1[0].shape[1]
    self.shape = (self.D**2, self.D**2)
    self.dtype = A1[0].dtype

    self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
    self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
    self.tmp = list(map(garr.empty_like, self.A1G[0]))
    self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

    self.l = l
    self.r = r
    self.lG = garr.to_gpu(sp.asarray(l))
    self.rG = garr.to_gpu(sp.asarray(r))

    self.out = garr.empty((self.D, self.D), dtype=self.dtype)
    self.out2 = garr.empty((self.D, self.D), dtype=self.dtype)
    self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

    if use_batch:
        self.A1G_p = list(map(get_batch_ptrs, self.A1G))
        self.A2G_p = list(map(get_batch_ptrs, self.A2G))
        self.tmp_p = get_batch_ptrs(self.tmp)
        self.tmp2_p = get_batch_ptrs(self.tmp2)
        self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
        self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
        self.out2_p = get_batch_ptrs([self.out2] * len(A1[0]))
    else:
        self.A1G_p = None
        self.A2G_p = None
        self.tmp_p = None
        self.tmp2_p = None
        self.xG_p = None
        self.out_p = None
        self.out2_p = None

    self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
    self.ones = [one.fill(1) for one in self.ones]
    self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]

    self.streams = []
    for s in range(A1[0].shape[0]):
        self.streams.append(cd.Stream())

    self.hdl = cb.cublasCreate()

def append_layer(self, layer):
    self.layers.append(layer)
    if layer.type == 'conv':
        self.numConv += 1

    outputShape = layer.get_output_shape()
    row = outputShape[0] * outputShape[1] * outputShape[2]
    col = outputShape[3]
    self.inputShapes.append((row, col))
    self.imgShapes.append(outputShape)

    self.outputs.append(gpuarray.zeros((row, col), dtype=np.float32))
    self.grads.append(gpuarray.zeros(self.inputShapes[-2], dtype=np.float32))
    print >> sys.stderr, '%s[%s]:%s' % (layer.name, layer.type, outputShape)

def __init__(self, n_units, n_incoming, N, init_sd=1.0, precision=np.float32, magic_numbers=False):
    self.n_units = n_units
    self.n_incoming = n_incoming
    self.N = N

    w = np.random.normal(0, init_sd, (self.n_incoming, self.n_units))
    b = np.random.normal(0, init_sd, (1, n_units))

    self.weights = gpuarray.to_gpu(w.copy().astype(precision))
    self.gW = gpuarray.empty_like(self.weights)

    # Prior and ID must be set after creation
    self.prior = -1
    self.ID = -1

    self.biases = gpuarray.to_gpu(b.copy().astype(precision))
    self.gB = gpuarray.empty_like(self.biases)

    # Set up momentum variables for HMC sampler
    self.pW = gpuarray.to_gpu(np.random.normal(0, 1, self.gW.shape))
    self.pB = gpuarray.to_gpu(np.random.normal(0, 1, self.gB.shape))

    self.epsW = gpuarray.zeros(self.weights.shape, precision) + 1.0
    self.epsB = gpuarray.zeros(self.biases.shape, precision) + 1.0

    self.precision = precision
    self.outputs = gpuarray.zeros((self.N, self.n_units), precision)

    self.magic_numbers = magic_numbers
    # Define tanh function on GPU
    if magic_numbers:
        # LeCun's "magic number" scaled tanh; note 2.0f/3.0f so the factor
        # is not computed in integer arithmetic (2/3 would be 0 in C)
        self.tanh = ElementwiseKernel(
            "float *x", "x[i] = 1.7159 * tanh(2.0f/3.0f*x[i]);", "tan_h",
            preamble="#include <math.h>")
    else:
        self.tanh = ElementwiseKernel(
            "float *x", "x[i] = tanh(min(max(-10.0,x[i]),10.0));", "tan_h",
            preamble="#include <math.h>")

    # Compile kernels
    kernels = SourceModule(open(path + '/kernels.cu', "r").read())
    self.add_bias_kernel = kernels.get_function("add_bias")

    self.rng = curandom.XORWOWRandomNumberGenerator()

    # Initialize posterior weights
    self.posterior_weights = list()
    self.posterior_biases = list()

def __init__(self, **params):
    # Hack-ish way to avoid initialisation until the weights are transferred:
    should_apply = self.apply_output_fns_init
    params['apply_output_fns_init'] = False
    super(GPUSparseCFProjection, self).__init__(**params)

    # The sparse matrix is stored in COO format, used for Hebbian learning
    # and normalisation:
    nzcols, nzrows, values = self.weights.getTriplets()
    tups = sorted(zip(nzrows, nzcols, values))
    nzrows = np.array([x[0] for x in tups], np.int32)
    nzcols = np.array([x[1] for x in tups], np.int32)
    values = np.array([x[2] for x in tups], np.float32)

    # Getting them on the GPU:
    self.nzcount = self.weights.getnnz()
    self.nzrows_gpu = gpuarray.to_gpu(nzrows)
    self.nzcols_gpu = gpuarray.to_gpu(nzcols)

    # Setting the projection weights in CSR format for dot product calculation:
    rowPtr = cusparse.coo2csr(self.nzrows_gpu, self.weights.shape[1])
    descrA = cusparse.cusparseCreateMatDescr()
    cusparse.cusparseSetMatType(descrA, cusparse.CUSPARSE_MATRIX_TYPE_GENERAL)
    cusparse.cusparseSetMatIndexBase(descrA, cusparse.CUSPARSE_INDEX_BASE_ZERO)

    self.weights_gpu = cusparse.CSR(descrA, values, rowPtr, self.nzcols_gpu,
                                    (self.weights.shape[1], self.weights.shape[0]))

    # Allocating a page-locked piece of memory for the activity so that the
    # GPU can transfer data to the main memory without the involvement of the CPU:
    self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
    self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

    self.input_buffer_pagelocked = cuda.pagelocked_empty(
        shape=(self.weights_gpu.shape[1],), dtype=np.float32,
        mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
    self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1],), dtype=np.float32)

    self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

    # Helper array for normalization:
    self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))

    # Kernel that applies the normalisation:
    self.normalize_kernel = ElementwiseKernel(
        "int *nzrows, float *norm_total, float *weights",
        "weights[i] *= norm_total[nzrows[i]]",
        "divisive_normalize")

    # Kernel that calculates the learning:
    self.hebbian_kernel = ElementwiseKernel(
        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
        "hebbian_learning")

    self.pycuda_stream = cuda.Stream()

    # Finishing the initialisation that might have been delayed:
    params['apply_output_fns_init'] = should_apply
    self.apply_output_fns_init = should_apply
    if self.apply_output_fns_init:
        self.apply_learn_output_fns()

def reshape(self, bottom, top):
    with pu.caffe_cuda_context():
        batch_size = bottom[0].shape[0]
        if self.batch_size_ != batch_size:
            self.batch_size_ = batch_size
            self.diff_sum_ = gpuarray.zeros((batch_size, 1), dtype)
            self.diff2_sum_ = gpuarray.zeros((batch_size, 1), dtype)
            self.mask_sum_ = gpuarray.zeros((batch_size, 1), dtype)
        dim = int(np.prod(bottom[0].shape[1:]))
        if self.dim_ != dim:
            self.dim_ = dim
            self.multipier_sum_ = gpuarray.zeros((dim, 1), dtype)
            self.multipier_sum_.fill(dtype(1.0))
    top[0].reshape()

def append_layer(self, layer):
    self.layers.append(layer)
    if layer.type == 'conv':
        self.numConv += 1

    outputShape = layer.get_output_shape()
    row = outputShape[1] * outputShape[2] * outputShape[3]
    col = outputShape[0]
    self.inputShapes.append((row, col))
    self.imgShapes.append(outputShape)

    self.outputs.append(gpuarray.zeros((row, col), dtype=np.float32))
    self.grads.append(gpuarray.zeros(self.inputShapes[-2], dtype=np.float32))
    print >> sys.stderr, 'append a', layer.type, 'layer', layer.name, 'to network'
    print >> sys.stderr, 'the output of the layer is', outputShape

def __init__(self, s_dict, synapse_state, dt, debug=False):
    self.debug = debug
    self.dt = dt
    self.num = len(s_dict['id'])

    self.pre = garray.to_gpu(np.asarray(s_dict['pre'], dtype=np.int32))
    self.ar = garray.to_gpu(np.asarray(s_dict['ar'], dtype=np.float64))
    self.ad = garray.to_gpu(np.asarray(s_dict['ad'], dtype=np.float64))
    self.gmax = garray.to_gpu(np.asarray(s_dict['gmax'], dtype=np.float64))
    self.a0 = garray.zeros((self.num,), dtype=np.float64)
    self.a1 = garray.zeros((self.num,), dtype=np.float64)
    self.a2 = garray.zeros((self.num,), dtype=np.float64)
    self.cond = synapse_state

    self.update = self.get_gpu_kernel()

def _allocate_arrays(self):
    # allocate gpu arrays and numpy arrays
    if self.max_features < 4:
        imp_size = 4
    else:
        imp_size = self.max_features

    # allocate gpu arrays
    self.impurity_left = gpuarray.empty(imp_size, dtype=np.float32)
    self.impurity_right = gpuarray.empty(self.max_features, dtype=np.float32)
    self.min_split = gpuarray.empty(self.max_features, dtype=self.dtype_counts)
    self.label_total = gpuarray.empty(self.n_labels, self.dtype_indices)
    self.label_total_2d = gpuarray.zeros(
        self.max_features * (self.MAX_BLOCK_PER_FEATURE + 1) * self.n_labels,
        self.dtype_indices)
    self.impurity_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE * 2, np.float32)
    self.min_split_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE, self.dtype_counts)
    self.features_array_gpu = gpuarray.empty(self.n_features, np.uint16)
    self.mark_table = gpuarray.empty(self.stride, np.uint8)

    # allocate numpy arrays
    self.idx_array = np.zeros(2 * self.n_samples, dtype=np.uint32)
    self.si_idx_array = np.zeros(self.n_samples, dtype=np.uint8)
    self.nid_array = np.zeros(self.n_samples, dtype=np.uint32)
    self.values_idx_array = np.zeros(2 * self.n_samples, dtype=self.dtype_indices)
    self.values_si_idx_array = np.zeros(2 * self.n_samples, dtype=np.uint8)
    self.threshold_value_idx = np.zeros(2, self.dtype_indices)
    self.min_imp_info = driver.pagelocked_zeros(4, dtype=np.float32)
    self.features_array = driver.pagelocked_zeros(self.n_features, dtype=np.uint16)
    self.features_array[:] = np.arange(self.n_features, dtype=np.uint16)

def __init__(self, name, input_shape):
    Layer.__init__(self, name, "softmax")
    self.inputShape = input_shape
    self.inputSize, self.batchSize = input_shape
    self.outputSize = self.inputSize
    self.cost = gpuarray.zeros((self.batchSize, 1), dtype=np.float32)
    self.batchCorrect = 0

def __init__(self, n_in, n_out, parameters=None, weights_scale=None,
             l1_penalty_weight=0., l2_penalty_weight=0.,
             lr_multiplier=None, test_error_fct='class_error'):
    # Initialize weight using Bengio's rule
    self.weights_scale = 4 * sqrt(6. / (n_in + n_out)) \
        if weights_scale is None \
        else weights_scale

    if parameters is not None:
        self.W, self.b = parameters
    else:
        self.W = gpuarray.empty((n_in, n_out), dtype=np.float32,
                                allocator=memory_pool.allocate)
        sampler.fill_uniform(self.W)
        self.W = self.weights_scale * (self.W - .5)
        self.b = gpuarray.zeros((n_out,), dtype=np.float32)

    self.n_in = n_in
    self.n_out = n_out

    self.test_error_fct = test_error_fct

    self.l1_penalty_weight = l1_penalty_weight
    self.l2_penalty_weight = l2_penalty_weight

    self.lr_multiplier = 2 * [1. / np.sqrt(n_in, dtype=np.float32)] \
        if lr_multiplier is None else lr_multiplier

def __init__(self, n_in, n_out, parameters=None, weights_scale=None,
             l1_penalty_weight=0., l2_penalty_weight=0., lr_multiplier=None):
    # Initialize weight using Bengio's rule
    self.weights_scale = 4 * sqrt(6. / (n_in + n_out)) \
        if weights_scale is None \
        else weights_scale

    if parameters is not None:
        self.W, self.b = parameters
    else:
        self.W = self.weights_scale * \
            sampler.gen_uniform((n_in, n_out), dtype=np.float32) \
            - .5 * self.weights_scale
        self.b = gpuarray.zeros((n_out,), dtype=np.float32)

    self.n_in = n_in
    self.n_out = n_out

    self.l1_penalty_weight = l1_penalty_weight
    self.l2_penalty_weight = l2_penalty_weight

    self.lr_multiplier = 2 * [1. / np.sqrt(n_in, dtype=np.float32)] \
        if lr_multiplier is None else lr_multiplier

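# Both constructors above default to Bengio's rule for the weight scale,
# 4*sqrt(6/(n_in + n_out)), then sample uniformly on
# [-weights_scale/2, +weights_scale/2). An equivalent NumPy sketch of that
# initialization (function name is mine, for illustration only):
import numpy as np

def init_weights(n_in, n_out, rng=np.random):
    scale = 4.0 * np.sqrt(6.0 / (n_in + n_out))
    # uniform in [0,1), shifted and scaled to [-scale/2, scale/2)
    return (scale * rng.uniform(size=(n_in, n_out)) - 0.5 * scale).astype(np.float32)
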
def arr_pad(x, dims):
    """Zero-pad an array to ``dims`` dimensions.

    Implemented as follows: write a smaller array into a bigger one. The
    bigger array is created according to ``dims``, and the smaller matrix is
    placed in its upper left corner.

    Args:
        x (gpuarray): Input array.
        dims (tuple): Dimensions of the bigger array.

    Returns:
        gpuarray: Output array of size ``dims`` with ``x`` in the upper left corner.
    """
    out = gpuarray.zeros(dims, x.dtype)
    arr_pad_func(x, out, np.int32(x.shape[0]), np.int32(dims[0]))
    return out

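# A CPU equivalent of arr_pad (illustrative sketch, assuming dims is at least
# as large as x.shape in every axis) is just writing the input into the
# top-left corner of a larger zero array:
import numpy as np

def arr_pad_cpu(x, dims):
    out = np.zeros(dims, dtype=x.dtype)
    out[tuple(slice(0, s) for s in x.shape)] = x
    return out
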
def _forward(self, m, v_k):
    """Forward operator ``E^H E``.

    Args:
        m (gpuarray): Input array.
        v_k (gpuarray): Output array.
    """
    tmp = gpuarray.zeros(self._dest_shape, dtype=self._op.precision_complex)
    self._op.apply(m, tmp)
    self._op.adjoint(tmp, v_k)

    # v_k = v_k + weights .* m
    if self._double:
        add_scaled_vector_vector_double(v_k, v_k, self._weights, m)
    else:
        add_scaled_vector_vector(v_k, v_k, self._weights, m)

    tmp.gpudata.free()

def test_take_put(self):
    for n in [5, 17, 333]:
        one_field_size = 8
        buf_gpu = gpuarray.zeros(n * one_field_size, dtype=np.float32)
        dest_indices = gpuarray.to_gpu(
            np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32))
        read_map = gpuarray.to_gpu(
            np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))

        gpuarray.multi_take_put(
            arrays=[buf_gpu for i in range(n)],
            dest_indices=dest_indices,
            src_indices=read_map,
            src_offsets=[i * one_field_size for i in range(n)],
            dest_shape=(96,),
        )

        drv.Context.synchronize()

def TileWDenom(WDenomIn, M):
    """
    :param WDenomIn: A T x 1 x K array that needs to be tiled
    :param M: Dimension of tile axis
    :returns: WDenomOut: A T x M x K tiled array
    """
    blockdim = 32
    T = WDenomIn.shape[0]
    K = WDenomIn.shape[2]
    GridDimT = int(np.ceil(1.0 * T / blockdim))
    GridDimK = int(np.ceil(1.0 * K / blockdim))
    T = np.array(T, dtype=np.int32)
    M = np.array(M, dtype=np.int32)
    K = np.array(K, dtype=np.int32)
    WDenomOut = gpuarray.zeros((T, M, K), np.float32)
    TileWDenom_(WDenomIn, WDenomOut, T, M, K,
                block=(blockdim, blockdim, 1), grid=(GridDimT, GridDimK))
    return WDenomOut

def TileHDenom(HDenomIn, N):
    """
    :param HDenomIn: A F x K x 1 array that needs to be tiled
    :param N: Dimension of tile axis
    :returns: HDenomOut: A F x K x N tiled array
    """
    blockdim = 32
    F = HDenomIn.shape[0]
    K = HDenomIn.shape[1]
    GridDimF = int(np.ceil(1.0 * F / blockdim))
    GridDimK = int(np.ceil(1.0 * K / blockdim))
    F = np.array(F, dtype=np.int32)
    K = np.array(K, dtype=np.int32)
    N = np.array(N, dtype=np.int32)
    HDenomOut = gpuarray.zeros((F, K, N), np.float32)
    TileHDenom_(HDenomIn, HDenomOut, F, K, N,
                block=(blockdim, blockdim, 1), grid=(GridDimF, GridDimK))
    return HDenomOut

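# Both tiling helpers above broadcast a singleton axis out to size M (or N).
# On the CPU the same result is a single broadcast; a hedged sketch for the
# first helper (function name is mine, for illustration):
import numpy as np

def tile_w_denom_cpu(w_denom, m):
    # w_denom has shape (T, 1, K); result has shape (T, m, K)
    return np.broadcast_to(w_denom, (w_denom.shape[0], m, w_denom.shape[2])).copy()
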
def test_cublasDgetrfBatched(self):
    from scipy.linalg import lu_factor
    l, m = 11, 7
    A = np.random.rand(l, m, m).astype(np.float64)
    A = np.array([np.matrix(a) * np.matrix(a).T for a in A])
    a_gpu = gpuarray.to_gpu(A)
    a_arr = bptrs(a_gpu)
    p_gpu = gpuarray.empty((l, m), np.int32)
    i_gpu = gpuarray.zeros(1, np.int32)
    X = np.array([lu_factor(a)[0] for a in A])
    cublas.cublasDgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m,
                               p_gpu.gpudata, i_gpu.gpudata, l)
    X_ = np.array([a.T for a in a_gpu.get()])
    assert np.allclose(X, X_)

def preclean(self):
    nx = np.int32(2 * self.imsize)
    # create fft plan nx*nx
    self.plan = fft.Plan((np.int(nx), np.int(nx)), np.complex64, np.complex64)
    d_dirty = gpu.zeros((np.int(self.imsize), np.int(self.imsize)), np.float32)

    gpu_im = self.cuda_gridvis(self.plan, 0, 0)
    dirty = gpu_im.get()
    if self.Debug:
        logger.debug("Plotting dirty image")

    if self.plot_me:
        pathPrefix = self.outdir
        prefix = self.uvfile
        prefix, ext = os.path.splitext(os.path.basename(prefix))
        if pathPrefix is None:
            filename = prefix + '_dirty_%dp.png' % self.chan
            fitsfile = prefix + '_dirty_%dp.fit' % self.chan
        else:
            if pathPrefix[-1:] == '/':
                pathPrefix = pathPrefix[:-1]
            filename = pathPrefix + '/' + prefix + '_dirty_%dp.png' % self.chan
            fitsfile = pathPrefix + '/' + prefix + '_dirty_%dp.fit' % self.chan

        self.muser_draw.draw_one(filename, self.title, self.fov, dirty,
                                 self.ra - 0.5, self.ra + 0.5,
                                 self.dec - 0.5, self.dec + 0.5,
                                 16.1, axis=False, axistype=0)

    if self.writefits:
        self.write_fits(dirty, fitsfile, 'DIRTY_IMAGE')

    return filename

def __init__(self, n_in, parameters=None, weights_scale=None,
             l1_penalty_weight=0., l2_penalty_weight=0.,
             lr_multiplier=None, test_error_fct='class_error'):
    # Initialize weight using Bengio's rule
    self.weights_scale = 4 * sqrt(6. / (n_in + 1)) \
        if weights_scale is None \
        else weights_scale

    if parameters is not None:
        self.W, self.b = parameters
    else:
        self.W = self.weights_scale * \
            sampler.gen_uniform((n_in, 1), dtype=np.float32) \
            - .5 * self.weights_scale
        self.b = gpuarray.zeros((1,), dtype=np.float32)

    self.n_in = n_in

    self.test_error_fct = test_error_fct

    self.l1_penalty_weight = l1_penalty_weight
    self.l2_penalty_weight = l2_penalty_weight

    self.lr_multiplier = 2 * [1. / np.sqrt(n_in, dtype=np.float32)] \
        if lr_multiplier is None else lr_multiplier

    self.persistent_temp_objects_config = (
        ('activations', ('batch_size', 1), np.float32),
        ('df_W', self.W.shape, np.float32),
        ('df_b', self.b.shape, np.float32),
        ('df_input', ('batch_size', self.n_in), np.float32),
        ('delta', ('batch_size', 1), np.float32))

def test_adjoint(self, iters=5):
    """Test the adjoint operator.

    Args:
        iters (int): Number of iterations.
    """
    src_shape = (self.data.nX1, self.data.nX2, 1)
    dest_shape = (self.data.nT, self.data.nC)

    u = gpuarray.zeros(src_shape, self.precision_complex, order='F')
    ut = gpuarray.zeros(src_shape, self.precision_real, order='F')
    Ku = gpuarray.zeros(dest_shape, self.precision_complex, order='F')
    v = gpuarray.zeros(dest_shape, self.precision_complex, order='F')
    vt = gpuarray.zeros(dest_shape, self.precision_real, order='F')
    Kadv = gpuarray.zeros(src_shape, self.precision_complex, order='F')

    generator = curandom.XORWOWRandomNumberGenerator()
    errors = []
    try:
        i = 0
        for i in range(iters):
            # randomness
            generator.fill_uniform(ut)
            generator.fill_uniform(vt)
            v = gpuarray_copy(vt.astype(self.precision_complex))
            u = gpuarray_copy(ut.astype(self.precision_complex))

            # apply operators
            self.apply(u, Ku)
            self.adjoint(v, Kadv)

            scp1 = dotc_gpu(Ku, v)
            scp2 = dotc_gpu(u, Kadv)
            n_Ku = dotc_gpu(Ku)
            n_Kadv = dotc_gpu(Kadv)
            n_u = dotc_gpu(u)
            n_v = dotc_gpu(v)
            errors.append(np.abs(scp1 - scp2))
            print("Test " + str(i) + ": <Ku,v>=" + str(scp1) +
                  ", <u,Kadv>=" + str(scp2) +
                  ", Error=" + str(np.abs(scp1 - scp2)) +
                  ", Relative Error=" + str((scp1 - scp2) / (n_Ku*n_v + n_Kadv*n_u)))
    except KeyboardInterrupt:
        if len(errors) == 0:
            errors = -1
    finally:
        print("Mean Error: " + repr(np.mean(errors)))
        print("Standard deviation: " + repr(np.std(errors)))
    return i

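# The test above checks the defining property of an adjoint: for a linear
# operator K, <Ku, v> = <u, K^H v> for all u and v. With an explicit matrix
# the same check is a few lines; a minimal illustrative sketch:
import numpy as np

rng = np.random.default_rng(0)
K = rng.standard_normal((4, 3)) + 1j * rng.standard_normal((4, 3))
u = rng.standard_normal(3) + 1j * rng.standard_normal(3)
v = rng.standard_normal(4) + 1j * rng.standard_normal(4)
lhs = np.vdot(v, K @ u)           # <v, Ku>
rhs = np.vdot(K.conj().T @ v, u)  # <K^H v, u>
assert np.allclose(lhs, rhs)
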
def test_complex_bits(self):
    from pycuda.curandom import rand as curand

    if has_double_support():
        dtypes = [np.complex64, np.complex128]
    else:
        dtypes = [np.complex64]

    n = 20
    for tp in dtypes:
        dtype = np.dtype(tp)
        from pytools import match_precision
        real_dtype = match_precision(np.dtype(np.float64), dtype)

        z = curand((n,), real_dtype).astype(dtype) + 1j * curand(
            (n,), real_dtype).astype(dtype)

        assert la.norm(z.get().real - z.real.get()) == 0
        assert la.norm(z.get().imag - z.imag.get()) == 0
        assert la.norm(z.get().conj() - z.conj().get()) == 0

        # verify conj with out parameter
        z_out = z.astype(np.complex64)
        assert z_out is z.conj(out=z_out)
        assert la.norm(z.get().conj() - z_out.get()) < 1e-7

        # verify contiguity is preserved
        for order in ["C", "F"]:
            # test both zero and non-zero value code paths
            z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order)
            z2 = z.reshape(z.shape, order=order)
            for zdata in [z_real, z2]:
                if order == "C":
                    assert zdata.flags.c_contiguous
                    assert zdata.real.flags.c_contiguous
                    assert zdata.imag.flags.c_contiguous
                    assert zdata.conj().flags.c_contiguous
                elif order == "F":
                    assert zdata.flags.f_contiguous
                    assert zdata.real.flags.f_contiguous
                    assert zdata.imag.flags.f_contiguous
                    assert zdata.conj().flags.f_contiguous

def __init__(self, params_dict, access_buffers, dt,
             debug=False, LPU_id=None, cuda_verbose=True):
    if cuda_verbose:
        self.compile_options = ['--ptxas-options=-v']
    else:
        self.compile_options = []

    self.num_comps = params_dict['dummy'].size
    self.params_dict = params_dict
    self.access_buffers = access_buffers

    self.debug = debug
    self.LPU_id = LPU_id
    self.dtype = params_dict['dummy'].dtype

    self.dt = np.double(dt)
    self.ddt = np.double(1e-6)
    self.steps = np.int32(max(int(self.dt / self.ddt), 1))

    self.internal_states = {
        c: garray.zeros(self.num_comps, dtype=self.dtype) + self.internals[c]
        for c in self.internals}

    self.inputs = {
        k: garray.empty(self.num_comps, dtype=self.access_buffers[k].dtype)
        for k in self.accesses}

    dtypes = {'dt': self.dtype}
    dtypes.update({k: self.inputs[k].dtype for k in self.accesses})
    dtypes.update({k: self.params_dict[k].dtype for k in self.params})
    dtypes.update({k: self.internal_states[k].dtype for k in self.internals})
    dtypes.update({k: self.dtype if not k == 'spike_state' else np.int32
                   for k in self.updates})
    self.update_func = self.get_update_func(dtypes)

def __call__(self, x, y=None):
    if y is None:
        y = gpuarray.zeros(self.shape[0], dtype=self.dtype,
                           allocator=x.allocator)

    self.get_kernel().prepared_call(
        (self.block_count, 1),
        (self.threads_per_packet, 1, 1),
        self.packet_base_rows.gpudata,
        self.thread_starts.gpudata,
        self.thread_ends.gpudata,
        self.index_array.gpudata,
        self.data_array.gpudata,
        x.gpudata,
        y.gpudata)

    self.remaining_coo_gpu(x, y)

    return y

def test_large_smem(self):
    n = 4000
    mod = SourceModule("""
        #include <stdio.h>

        __global__ void kernel(int *d_data)
        {
            __shared__ int sdata[%d];
            sdata[threadIdx.x] = threadIdx.x;
            d_data[threadIdx.x] = sdata[threadIdx.x];
        }
        """ % n)

    kernel = mod.get_function("kernel")

    import pycuda.gpuarray as gpuarray
    arg = gpuarray.zeros((n,), dtype=np.float32)

    kernel(arg, block=(1, 1, 1))

def _pre_run(self):
    assert self.LPU_obj
    assert all([var in self.memory_manager.variables
                for var in self.variables.keys()])
    for var, d in self.variables.items():
        v_dict = self.memory_manager.variables[var]
        uids = []
        inds = []
        for uid in d['uids']:
            cd = self.LPU_obj.conn_dict[uid]
            assert var in cd
            pre = cd[var]['pre'][0]
            inds.append(v_dict['uids'][pre])
        self.dest_inds[var] = garray.to_gpu(np.array(inds, np.int32))
        self.dtypes[var] = v_dict['buffer'].dtype
        self._d_input[var] = garray.zeros(len(d['uids']), self.dtypes[var])
        self.variables[var]['input'] = np.zeros(len(d['uids']), self.dtypes[var])
    self.pre_run()

def __init__(self, mesh, context=None):
    '''
    Args:
        mesh: The mesh on which the solver will operate. The dimensionality
            is deduced from mesh.dimension.
    '''
    # create the mesh grid and compute the greens function on it
    self.mesh = mesh
    self._context = context
    mesh_shape2 = [2*n for n in mesh.shape]  # 2*nz, 2*ny, (2*nx)
    self.tmpspace = gpuarray.zeros(mesh_shape2, dtype=np.complex128)
    self.fgreentr = gpuarray.empty_like(self.tmpspace)

    sizeof_complex = np.dtype(np.complex128).itemsize

    # dimensionality function dispatch
    dim = mesh.dimension
    self._fgreen = getattr(self, '_fgreen' + str(dim) + 'd')
    self._mirror = getattr(self, '_mirror' + str(dim) + 'd')
    copy_fn = {'3d': get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
    memcpy_nd = copy_fn[str(dim) + 'd']
    dim_args = mesh.shape

    self._cpyrho2tmp = memcpy_nd(
        src=None, dst=self.tmpspace,  # None because src (rho) not yet known
        src_pitch=mesh.nx*sizeof_complex,
        dst_pitch=2*mesh.nx*sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=mesh.ny,
        dst_height=2*mesh.ny)
    self._cpytmp2rho = memcpy_nd(
        src=self.tmpspace, dst=None,  # None because dst (rho) not yet known
        src_pitch=2*mesh.nx*sizeof_complex,
        dst_pitch=mesh.nx*sizeof_complex,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=2*mesh.ny,
        dst_height=mesh.ny)

    self.plan_forward = cu_fft.Plan(
        self.tmpspace.shape, in_dtype=np.complex128, out_dtype=np.complex128)
    self.plan_backward = self.plan_forward

    self.setup_mesh(mesh)

def __init__(self, params_dict, access_buffers, dt,
             LPU_id=None, debug=False, cuda_verbose=False):
    if cuda_verbose:
        self.compile_options = ['--ptxas-options=-v']
    else:
        self.compile_options = []

    self.debug = debug
    self.dt = dt
    self.num = params_dict['g_max'].size
    self.LPU_id = LPU_id

    self.params_dict = params_dict
    self.access_buffers = access_buffers

    self.inputs = {}
    self.inputs['V'] = garray.zeros((self.num), dtype=np.float64)
    print(self.accesses)
    self.update = self.get_gpu_kernel(params_dict['g_max'].dtype)

def multiply(a, b, out=None, increment=False, stream=None):
    """Element-wise product of `a` and `b`."""
    dtype = a.dtype
    if out is None:
        out = gpuarray.zeros(a.shape, dtype=dtype)

    assert a.size == b.size
    assert a.dtype == b.dtype == out.dtype

    block = (min(a._block[0], a.size), 1, 1)
    grid = (a.size // block[0] + (a.size % block[0] != 0), 1, 1)

    hf.gpu.multiply_kernel[dtype == np.float32].prepared_async_call(
        grid, block, stream,
        a.gpudata, b.gpudata, out.gpudata,
        np.int32(a.size), np.int32(increment))

    return out

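# The grid computation above is the usual ceil-division idiom: launch enough
# blocks to cover a.size elements when each block handles block[0] of them.
# A tiny illustrative sketch of the same arithmetic:
def ceil_div(n, d):
    return n // d + (n % d != 0)

assert ceil_div(10, 4) == 3 and ceil_div(8, 4) == 2
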
def _lmadata(self):
    if not hasattr(self, '__lmadata'):
        nentries = 0
        # dense block of rmap.arity x cmap.arity for each rmap/cmap pair
        for rmap, cmap in self.sparsity.maps:
            nentries += rmap.arity * cmap.arity

        entry_size = 0
        # all pairs of maps in the sparsity must have the same
        # iterset, there are sum(iterset.size) * nentries total
        # entries in the LMA data
        for rmap, cmap in self.sparsity.maps:
            entry_size += rmap.iterset.size
        # each entry in the block is size dims[0] x dims[1]
        entry_size *= np.asscalar(np.prod(self.dims))
        nentries *= entry_size

        setattr(self, '__lmadata',
                gpuarray.zeros(shape=nentries, dtype=self.dtype))
    return getattr(self, '__lmadata')

def slice_coil(inp, outp=None, coil=0):
    """Return a slice of a 3D array (image stack or coil sensitivity),
    since this kind of slicing is not implemented in PyCUDA.

    Args:
        inp (gpuarray): Input array.
        outp (gpuarray): Output slice (optional; if not provided, it will be created).
        coil (int): Coil index.

    Returns:
        gpuarray: Output array.
    """
    dim = inp.shape[0]
    n_coils = inp.shape[1]
    if outp is None:
        outp = gpuarray.zeros(dim, inp.dtype)
    slice_coil_func(outp, inp, np.int32(coil), np.int32(n_coils))
    return outp

def __init__(self, n_in, n_units, activation_function='sigmoid',
             dropout=False, parameters=None, weights_scale=None,
             l1_penalty_weight=0., l2_penalty_weight=0., lr_multiplier=None):
    self._set_activation_fct(activation_function)
    if weights_scale is None:
        self._set_weights_scale(activation_function, n_in, n_units)
    else:
        self.weights_scale = weights_scale

    if parameters is not None:
        if isinstance(parameters, basestring):
            # load pickled parameters from a file path
            self.parameters = cPickle.load(open(parameters))
        else:
            self.W, self.b = parameters
    else:
        self.W = self.weights_scale * \
            sampler.gen_uniform((n_in, n_units), dtype=np.float32) \
            - .5 * self.weights_scale
        self.b = gpuarray.zeros((n_units,), dtype=np.float32)

    assert self.W.shape == (n_in, n_units)
    assert self.b.shape == (n_units,)

    self.n_in = n_in
    self.n_units = n_units

    self.lr_multiplier = lr_multiplier if lr_multiplier is not None else \
        2 * [1. / np.sqrt(self.n_in, dtype=np.float32)]

    self.l1_penalty_weight = l1_penalty_weight
    self.l2_penalty_weight = l2_penalty_weight

    self.dropout = dropout

def test_3d_fp_textures(self):
    orden = "C"
    npoints = 32

    for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]:
        prec_str = dtype_to_ctype(prec)
        if prec == np.complex64:
            fpName_str = 'fp_tex_cfloat'
        elif prec == np.complex128:
            fpName_str = 'fp_tex_cdouble'
        elif prec == np.float64:
            fpName_str = 'fp_tex_double'
        else:
            fpName_str = prec_str
        A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec)
        A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:]
        A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

        myKern = '''
        #include <pycuda-helpers.hpp>
        texture<fpName, 3, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(cuPres *dest)
        {
          int row   = blockIdx.x*blockDim.x + threadIdx.x;
          int col   = blockIdx.y*blockDim.y + threadIdx.y;
          int slice = blockIdx.z*blockDim.z + threadIdx.z;
          dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
        }
        '''
        myKern = myKern.replace('fpName', fpName_str)
        myKern = myKern.replace('cuPres', prec_str)
        mod = SourceModule(myKern)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")
        cuBlock = (8, 8, 8)
        if cuBlock[0] > npoints:
            cuBlock = (npoints, npoints, npoints)
        cuGrid = (npoints//cuBlock[0] + 1*(npoints % cuBlock[0] != 0),
                  npoints//cuBlock[1] + 1*(npoints % cuBlock[1] != 0),
                  npoints//cuBlock[2] + 1*(npoints % cuBlock[2] != 0))
        copy_texture.prepare('P', texrefs=[mtx_tex])
        cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False)
        mtx_tex.set_array(cudaArray)
        copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
        assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec)
        A_gpu.gpudata.free()

def crop_stack_GPU(x, sz, offset=(0, 0), dtype='real'):
    if x.__class__ == np.ndarray:
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif x.__class__ == cua.GPUArray:
        x_gpu = x

    sx = x_gpu.shape
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(np.float32(sx[0]*sz[0])/block_size[0])),
                 int(np.ceil(np.float32(sz[1])/block_size[1])))

    sx_before = np.array([sx[1], sx[2]])
    sx_after = np.array(sz)
    if any(np.array([sx[1], sx[2]]) - (np.array(sz)) < offset):
        raise IOError('Size mismatch: size after - size before < offset')

    if dtype == 'real':
        if x_gpu.dtype != np.float32:
            x_gpu = x_gpu.real
        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_Kernel")
        xc_gpu = cua.zeros(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.float32)

    if dtype == 'complex':
        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_ComplexKernel")
        xc_gpu = cua.empty(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.complex64)

    crop_stack_Kernel(xc_gpu.gpudata, np.int32(sx[0]), np.int32(sz[0]), np.int32(sz[1]),
                      x_gpu.gpudata, np.int32(sx[1]), np.int32(sx[2]),
                      np.int32(offset[0]), np.int32(offset[1]),
                      block=block_size, grid=grid_size)

    return xc_gpu

def varianza_cov(R_s, G_s, B_s):
    kernel_code = kernel_var_cov % {'BLOCK_SIZE': BLOCK_SIZE}
    mod = compiler.SourceModule(kernel_code)
    covariance_kernel = mod.get_function("CovarianceKernel")

    salida_gpu = gpuarray.zeros((3, 3), np.float32)

    Rs_gpu = gpuarray.to_gpu(R_s)
    Gs_gpu = gpuarray.to_gpu(G_s)
    Bs_gpu = gpuarray.to_gpu(B_s)

    for i in range(len(R_s)):
        covariance_kernel(
            # inputs
            Rs_gpu[i], Gs_gpu[i], Bs_gpu[i],
            # output
            salida_gpu,
            # block of multiple threads
            block=(32, 32, 1),
        )

    return salida_gpu.get()

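# As a CPU sanity check, the 3x3 covariance of the stacked R/G/B samples is a
# single np.cov call. A hedged sketch (assumes flattened 1-D sample arrays of
# equal length; the exact normalization convention of the kernel above may
# differ):
import numpy as np

def covariance_cpu(r, g, b):
    return np.cov(np.stack([np.ravel(r), np.ravel(g), np.ravel(b)]))  # (3, 3)
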
def test_ranged_elwise_kernel(self):
    from pycuda.elementwise import ElementwiseKernel
    set_to_seven = ElementwiseKernel("float *z", "z[i] = 7", "set_to_seven")

    for i, slc in enumerate([
            slice(5, 20000),
            slice(5, 20000, 17),
            slice(3000, 5, -1),
            slice(1000, -1),
            ]):
        a_gpu = gpuarray.zeros((50000,), dtype=np.float32)
        a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)

        a_cpu[slc] = 7
        set_to_seven(a_gpu, slice=slc)

        drv.Context.synchronize()
        assert la.norm(a_cpu - a_gpu.get()) == 0, i

def __init__(self, obj, attrs, steps, **kwargs):
    super(CUDARecorder, self).__init__(obj, attrs, steps, **kwargs)

    gpu_buffer = kwargs.pop('gpu_buffer', False)
    if gpu_buffer:
        self.buffer_length = self._get_buffer_length(gpu_buffer)
        self.gpu_dct = {}
        for key in attrs:
            src = getattr(self.obj, key)
            shape = (self.buffer_length, src.size)
            self.gpu_dct[key] = garray.zeros(shape, dtype=src.dtype)
        self._update = self._copy_memory_dtod
    else:
        self._update = self._copy_memory_dtoh

    if PY2:
        self.get_buffer = self._py2_get_buffer
    if PY3:
        self.get_buffer = self._py3_get_buffer

def __calc_A_shift_gpu(self, shift_x, shift_y):
    # TODO: need to assert that this indeed returns a gpuarray
    psis_gpu = self.converter.get_prolates_as_images()
    n_psis = len(psis_gpu)

    if shift_x == 0 and shift_y == 0:
        return np.eye(n_psis)

    A_shift = gpuarray.zeros((n_psis, n_psis), 'complex64')
    non_neg_freqs = self.converter.get_non_neg_freq_inds()

    psis_gpu_non_neg_freqs = psis_gpu[non_neg_freqs]
    psis_non_neg_shifted = circ_shift_kernel.circ_shift(
        psis_gpu_non_neg_freqs, shift_x, shift_y)
    psis_non_neg_shifted = self.converter.mask_points_inside_the_circle(
        psis_non_neg_shifted)

    psis_non_neg_shifted = psis_non_neg_shifted.reshape(len(psis_non_neg_shifted), -1)
    psis_gpu = psis_gpu.reshape(n_psis, -1)
    A_shift[non_neg_freqs] = linalg.dot(psis_non_neg_shifted, psis_gpu, transb='C')

    zero_freq_inds = self.converter.get_zero_freq_inds()
    pos_freq_inds = self.converter.get_pos_freq_inds()
    neg_freq_inds = self.converter.get_neg_freq_inds()

    A_shift[neg_freq_inds, zero_freq_inds] = A_shift[pos_freq_inds, zero_freq_inds]
    A_shift[neg_freq_inds, pos_freq_inds] = A_shift[pos_freq_inds, neg_freq_inds]
    A_shift[neg_freq_inds, neg_freq_inds] = A_shift[pos_freq_inds, pos_freq_inds]
    A_shift[neg_freq_inds] = linalg.conj(A_shift[neg_freq_inds])

    # TODO: get rid of the transpose
    # return np.transpose(A_shift).copy()
    return np.transpose(A_shift).get().copy()

def getPatches(complexDataset, patchSize=5, echoes=6, patchSpacing=1):
    PATCHSIZE = patchSize
    ECHOES = echoes

    extract_patches = getCudaFunction(patchSize, echoes, patchSpacing)

    blockSize = complexDataset.shape[1]
    gridSize = complexDataset.shape[0]

    realDataset = complexDataset.real.astype(np.float32).flatten()
    imDataset = complexDataset.imag.astype(np.float32).flatten()

    # free, total = drv.mem_get_info()
    # print '%.1f %% of device memory is free before alloc.' % ((free/float(total))*100)

    #patchArray = np.zeros([blockSize*gridSize*2*(2*PATCHSIZE+1)*(2*PATCHSIZE+1)*ECHOES], dtype=np.float32)
    real_gpu = ga.to_gpu(realDataset)
    im_gpu = ga.to_gpu(imDataset)
    out_gpu = ga.zeros([blockSize*gridSize*2*(2*PATCHSIZE+1)*(2*PATCHSIZE+1)*ECHOES], np.float32)

    #extract_patches(drv.Out(patchArray), drv.In(realDataset), drv.In(imDataset), block=(blockSize,1,1), grid=(gridSize,1))
    extract_patches(out_gpu, real_gpu, im_gpu, block=(blockSize, 1, 1), grid=(gridSize, 1))

    # free, total = drv.mem_get_info()
    # print '%.1f %% of device memory is free after processing.' % ((free/float(total))*100)

    patchArray = out_gpu.get()

    real_gpu.gpudata.free()
    im_gpu.gpudata.free()
    out_gpu.gpudata.free()

    # free, total = drv.mem_get_info()
    # print '%.1f %% of device memory is free after dealloc.' % ((free/float(total))*100)

    patchArray = patchArray.reshape([blockSize*gridSize, (2*PATCHSIZE+1), (2*PATCHSIZE+1), ECHOES, 2])

    return patchArray

def __init__(self, s_dict, synapse_state, dt, debug=False, cuda_verbose=False):
    if cuda_verbose:
        self.compile_options = ['--ptxas-options=-v']
    else:
        self.compile_options = []

    self.debug = debug
    #self.dt = dt
    self.num = len(s_dict['id'])

    if s_dict.has_key('delay'):
        self.delay = garray.to_gpu(
            np.round(np.asarray(s_dict['delay']) * 1e-3 / dt).astype(np.int32))
    else:
        self.delay = garray.zeros(self.num, dtype=np.int32)

    self.pre = garray.to_gpu(np.asarray(s_dict['pre'], dtype=np.int32))
    self.state = synapse_state

    self.update = self.get_gpu_kernel()

def __init__(self, layer_pre, map_num, threshold, a_plus, a_minus, learning_rounds):
    super().__init__(layer_pre, (layer_pre.width, layer_pre.height), 1,
                     map_num, threshold)
    self.a_plus = np.float32(a_plus)
    self.a_minus = np.float32(a_minus)
    self.learning_rounds = learning_rounds
    self.plastic = gpuarray.zeros(shape=(1,), dtype=np.bool)
    self.weights = gpuarray.to_gpu(
        np.random.normal(
            0.8, 0.01,
            (self.layer_size * self.layer_pre.layer_size,)).astype(np.float32))
    self.g = gpuarray.to_gpu(
        np.arange(self.layer_size * self.layer_pre.layer_size).reshape(
            (self.layer_size, self.layer_pre.layer_size)).transpose().astype(np.int32))
    self.label = gpuarray.empty(shape=(1,), dtype=np.int32)
    self.reset()

def norm_est(self, u, iters=10):
    """Estimates norm of the operator with a power iteration.

    Args:
        u (gpuarray): Input array.
        iters (int): Number of iterations.
    """
    if self._verbose:
        print("Estimating Norm...")
    u_temp = gpuarray_copy(u)
    result = gpuarray.zeros([self.data.nC, self.data.nT],
                            self.precision_complex, order='F')
    for _ in range(0, iters):
        dot_tmp = dotc_gpu(u_temp)
        u_temp /= np.sqrt(np.abs(dot_tmp))
        self.apply(u_temp, result)
        self.adjoint(result, u_temp)
    normsqr = dotc_gpu(u_temp)
    return np.sqrt(np.abs(normsqr) / self._norm_div)

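# norm_est above is a power iteration on E^H E: repeatedly normalizing,
# applying the operator, and applying its adjoint converges to the largest
# singular value of E. A matrix sketch of the same scheme (names are mine,
# for illustration):
import numpy as np

def norm_est_cpu(E, iters=10, rng=np.random.default_rng(0)):
    u = rng.standard_normal(E.shape[1])
    for _ in range(iters):
        u /= np.linalg.norm(u)
        u = E.conj().T @ (E @ u)   # apply E, then its adjoint
    # ||E^H E u|| -> lambda_max(E^H E) = sigma_max(E)^2 for normalized u
    return np.sqrt(np.linalg.norm(u))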