def __init__(self, name, filter_shape, image_shape, padding=2, stride=1,
             initW=0.01, initB=0.0, epsW=0.001, epsB=0.002,
             bias=None, weight=None):
    """Configure a 'conv' layer and allocate its device buffers.

    ``filter_shape[0]`` is used as the number of filters and
    ``filter_shape[2]`` as the filter side.  ``image_shape`` unpacks as
    (batch size, colour count, image side, <ignored>).  When ``weight`` or
    ``bias`` is omitted, normally distributed values scaled by ``initW`` /
    ``initB`` are generated; otherwise the given array is uploaded.  Both
    end up as float32 device arrays, with zeroed gradient buffers.
    """
    Layer.__init__(self, name, 'conv')

    # Geometry taken from the two shape tuples.
    self.numFilter = filter_shape[0]
    self.filterSize = filter_shape[2]
    self.imgShape = image_shape
    self.batchSize, self.numColor, self.imgSize, _ = image_shape
    self.padding = padding
    self.stride = stride

    # Initialisation scales and the two eps* values.
    self.initW, self.initB = initW, initB
    self.epsW, self.epsB = epsW, epsB

    # Output side: 1 + floor((2 * pad + image - filter) / stride).
    span = 2 * self.padding + self.imgSize - self.filterSize
    self.outputSize = 1 + int(span / float(self.stride))
    self.modules = self.outputSize ** 2

    if weight is None:
        rows = self.filterSize * self.filterSize * self.numColor
        weight = np.random.randn(rows, self.numFilter) * self.initW
    self.filter = gpuarray.to_gpu(weight).astype(np.float32)

    if bias is None:
        bias = np.random.randn(self.numFilter, 1) * initB
    self.bias = gpuarray.to_gpu(bias).astype(np.float32)

    self.filterGrad = gpuarray.zeros_like(self.filter)
    self.biasGrad = gpuarray.zeros_like(self.bias)
def __init__(self, bend_coefs, N, QN, NON, NR, x_nd, K_nn, rot_coef,
             QN_gpu=None, WQN_gpu=None, NON_gpu=None, NHN_gpu=None):
    """Store the solver matrices and make sure device copies exist.

    Every entry of ``bend_coefs`` must be a key of ``NON``.  Each
    ``*_gpu`` argument that is left as ``None`` is created here:
    ``QN`` and every ``NON[b]`` are uploaded, while the ``WQN`` and
    ``NHN`` buffers are zero-filled arrays shaped like ``QN_gpu`` and
    ``NON_gpu[bend_coefs[0]]`` respectively.
    """
    for b in bend_coefs:
        assert b in NON, 'no solver found for bending coefficient {}'.format(b)

    self.rot_coef = rot_coef
    self.n, self.d = x_nd.shape
    self.bend_coefs = bend_coefs
    self.N, self.QN, self.NON, self.NR = N, QN, NON, NR
    self.x_nd = x_nd
    self.K_nn = K_nn

    # Device-side buffers: reuse what the caller handed in, build the rest.
    self.QN_gpu = gpuarray.to_gpu(self.QN) if QN_gpu is None else QN_gpu

    self.WQN_gpu = (gpuarray.zeros_like(self.QN_gpu)
                    if WQN_gpu is None else WQN_gpu)

    if NON_gpu is None:
        NON_gpu = dict((b, gpuarray.to_gpu(self.NON[b])) for b in bend_coefs)
    self.NON_gpu = NON_gpu

    if NHN_gpu is None:
        NHN_gpu = gpuarray.zeros_like(self.NON_gpu[bend_coefs[0]])
    self.NHN_gpu = NHN_gpu

    self.valid = True
def __init__(self, name, type, epsW, epsB, initW, initB, momW, momB, wc,
             weight, bias, weightIncr, biasIncr, weightShape, biasShape,
             disableBprop=False):
    """Initialise a layer that owns a weight matrix and a bias vector.

    ``weight`` / ``bias`` / ``weightIncr`` / ``biasIncr`` may each be
    ``None`` (build a fresh buffer on the device) or a host array to
    upload.  A short notice is written to stderr for every buffer that is
    uploaded from the caller's data.  Gradient buffers are always
    zero-filled.
    """
    Layer.__init__(self, name, type, disableBprop)

    self.epsW = F(epsW)
    self.epsB = F(epsB)
    self.initW = initW
    self.initB = initB
    self.momW = F(momW)
    self.momB = F(momB)
    self.wc = F(wc)

    if weight is None:
        self.weight = gpuarray.to_gpu(
            randn(weightShape, np.float32) * self.initW)
    else:
        # sys.stderr.write emits the same text as the old
        # ``print >> sys.stderr`` statement and also works on Python 3,
        # where the chevron form raises TypeError at runtime.
        sys.stderr.write('init weight from disk\n')
        self.weight = gpuarray.to_gpu(weight)

    if bias is None:
        if self.initB > 0.0:
            self.bias = gpuarray.to_gpu(
                np.ones(biasShape, dtype=np.float32) * self.initB)
        else:
            self.bias = gpuarray.zeros(biasShape, dtype=np.float32)
    else:
        sys.stderr.write('init bias from disk\n')
        self.bias = gpuarray.to_gpu(bias).astype(np.float32)

    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)

    # NOTE(review): both increment buffers are gated on momW (momB is not
    # consulted) -- kept exactly as before; confirm this is intended.
    if self.momW > 0.0:
        if weightIncr is None:
            self.weightIncr = gpuarray.zeros_like(self.weight)
        else:
            sys.stderr.write('init weightIncr from disk\n')
            self.weightIncr = gpuarray.to_gpu(weightIncr)

        if biasIncr is None:
            self.biasIncr = gpuarray.zeros_like(self.bias)
        else:
            sys.stderr.write('init biasIncr from disk\n')
            self.biasIncr = gpuarray.to_gpu(biasIncr)
def computeEnergy(D_v, S, T, _Lambda, _gamma_c, Alpha, Beta):
    """Evaluate the three energy terms and their weighted total.

    Returns ``(EL, ES, ET, E)`` as host values, where

    * ``ET`` is ``_gamma_c`` times the sum of ``G_norm`` (built from the
      three components of ``grad(T)``),
    * ``ES`` is the sum of ``|S|`` scaled by ``_Lambda``,
    * ``EL`` is the sum of the squared residual
      ``D_v - S - T - Beta.Alpha`` (``S`` and ``T`` flattened to a column),
    * ``E = 0.5 * EL + ES + ET``.

    The ``*_matrix`` helpers are defined elsewhere; they are assumed to be
    element-wise operations writing into their last array argument --
    TODO confirm.
    """
    l, m, n = S.shape
    flat = (l * m * n, 1)

    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)

    # Term built from the gradient of T.
    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)
    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :],
                     G_norm, 1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)
    ET = _gamma_c * gpuarray.sum(G_norm)

    # Term built from |S|.
    SP = gpuarray.zeros_like(S)
    absolute_matrix(S, SP)
    multiply_matrix(SP, _Lambda, SP)
    ES = gpuarray.sum(SP)

    # Squared residual term.
    sparse = D_v - S.reshape(*flat) - T.reshape(*flat) - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)

    # Fetch each scalar from the device exactly once.
    el, es, et = EL.get(), ES.get(), ET.get()

    # 0.5 rather than ``1 / 2``: under Python 2 the latter is integer
    # division and silently drops the EL term from the total.
    E = 0.5 * el + es + et
    return el, es, et, E
def __init__(self, volume, template, mask, wedge, stdV, gpu=True):
    """Upload the inputs and allocate every work buffer for the search.

    ``volume``, ``wedge`` and ``stdV`` are uploaded as device arrays;
    ``template`` and ``mask`` are wrapped in ``Volume``.  All scratch
    arrays are shaped like the uploaded volume.  ``gpu`` is accepted but
    not read here.
    """
    def like_volume(dtype):
        # Zero-filled device array with the volume's shape.
        return gu.zeros_like(self.volume, dtype=dtype)

    self.volume = gu.to_gpu(volume)

    self.template = Volume(template)
    self.templatePadded = like_volume(np.float32)

    self.mask = Volume(mask)
    self.maskPadded = like_volume(np.float32)

    self.sOrg = mask.shape
    self.sPad = volume.shape
    print(self.sPad, self.sOrg)

    # Write the mask into the padded buffer via a zero-angle rotation.
    rotate(self.mask, [0, 0, 0], self.maskPadded, self.sPad, self.sOrg)

    print(volume.shape, stdV.shape, wedge.shape)
    self.wedge = gu.to_gpu(wedge)
    self.stdV = gu.to_gpu(stdV)

    self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
    self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)

    self.volume_fft = like_volume(np.complex64)
    self.template_fft = like_volume(np.complex64)
    self.ccc_map = like_volume(np.float32)
    self.norm_volume = np.prod(volume.shape)

    # Score and angle maps both start at -1000.
    self.scores = gu.ones_like(self.volume, dtype=np.float32) * -1000
    self.angles = gu.ones_like(self.volume, dtype=np.float32) * -1000

    self.p = sum(self.mask.d_data)
def _init_weights(self, weight_shape, bias_shape):
    """Create any missing weight/bias/increment buffers and check shapes.

    Buffers already present on ``self`` are kept.  A layer named
    ``'noise'`` gets an identity weight matrix (and therefore needs a
    square ``weight_shape``); any other layer gets ``randn`` values
    scaled by ``initW``.  The bias is filled with ``initB`` when that is
    positive and with zeros otherwise.
    """
    if self.weight is None:
        if self.name == 'noise':
            assert (weight_shape[0] == weight_shape[1])
            host_weight = np.eye(weight_shape[0], dtype=np.float32)
        else:
            host_weight = randn(weight_shape, np.float32) * self.initW
        self.weight = gpuarray.to_gpu(host_weight)

    if self.bias is None:
        if self.initB > 0.0:
            filled = np.ones(bias_shape, dtype=np.float32) * self.initB
            self.bias = gpuarray.to_gpu(filled)
        else:
            self.bias = gpuarray.zeros(bias_shape, dtype=np.float32)

    Assert.eq(self.weight.shape, weight_shape)
    Assert.eq(self.bias.shape, bias_shape)

    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)

    # Increment buffers are only prepared (and checked) with momentum on.
    if self.momW > 0.0:
        if self.weightIncr is None:
            self.weightIncr = gpuarray.zeros_like(self.weight)
        if self.biasIncr is None:
            self.biasIncr = gpuarray.zeros_like(self.bias)
        Assert.eq(self.weightIncr.shape, weight_shape)
        Assert.eq(self.biasIncr.shape, bias_shape)
def _init_weights(self, weight_shape, bias_shape):
    """Fill in whichever of weight, bias and increments are still ``None``.

    * weight: identity for the layer named ``'noise'`` (``weight_shape``
      must be square), otherwise ``randn(weight_shape) * initW``;
    * bias: constant ``initB`` if positive, else zeros;
    * gradients: always fresh zero buffers;
    * increments: zero buffers, created only when ``momW > 0``.

    Shapes are verified with ``Assert.eq`` after each stage.
    """
    if self.weight is None:
        is_noise = self.name == 'noise'
        if is_noise:
            assert (weight_shape[0] == weight_shape[1])
        initial = (np.eye(weight_shape[0], dtype=np.float32) if is_noise
                   else randn(weight_shape, np.float32) * self.initW)
        self.weight = gpuarray.to_gpu(initial)

    if self.bias is None:
        if self.initB > 0.0:
            self.bias = gpuarray.to_gpu(
                np.ones(bias_shape, dtype=np.float32) * self.initB)
        else:
            self.bias = gpuarray.zeros(bias_shape, dtype=np.float32)

    Assert.eq(self.weight.shape, weight_shape)
    Assert.eq(self.bias.shape, bias_shape)

    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)

    if self.momW > 0.0:
        # Momentum is on: make sure both increment buffers exist.
        if self.weightIncr is None:
            self.weightIncr = gpuarray.zeros_like(self.weight)
        if self.biasIncr is None:
            self.biasIncr = gpuarray.zeros_like(self.bias)
        Assert.eq(self.weightIncr.shape, weight_shape)
        Assert.eq(self.biasIncr.shape, bias_shape)
def dataToGPU(self):
    """Allocate the device buffers, upload the points and time the work.

    The elapsed time between the two recorded events, scaled by 1e-3
    (presumably milliseconds to seconds), is added to
    ``self.gpu_transfer_time``.
    """
    self.start_gpu_time.record()

    n_sites = int(self.nPoints + self.nQueries)
    n_cells = int(self.mapLength * self.mapLength)
    unset = np.int32(-1)

    # Host points reinterpreted as float3 records and uploaded.
    self.d_points = gpu.to_gpu(self.points.reshape(-1).view(gpu.vec.float3))
    # Discretized location of points.
    self.d_discPoints = gpu.zeros(shape=n_sites, dtype=gpu.vec.int3)

    # 1+JFA map, its swap buffer and the query (interpolation) map;
    # all three are initialised to -1.
    self.d_floodMap = gpu.empty(shape=n_cells, dtype=np.int32)
    self.d_floodMap.fill(unset)
    self.d_tempMap = gpu.zeros_like(self.d_floodMap)
    self.d_tempMap.fill(unset)
    self.d_queryMap = gpu.zeros_like(self.d_floodMap)
    self.d_queryMap.fill(unset)

    # Per-query values, used for calculating stolen area.
    self.d_queryValues = gpu.zeros(shape=int(self.nQueries),
                                   dtype=gpu.vec.float2)
    # Colour map and the rendered Voronoi image.
    self.d_colors = gpu.zeros(shape=n_sites, dtype=gpu.vec.uchar3)
    self.d_voronoi = gpu.zeros(shape=n_cells, dtype=gpu.vec.uchar3)

    self.end_gpu_time.record()
    self.end_gpu_time.synchronize()
    elapsed = self.start_gpu_time.time_till(self.end_gpu_time)
    self.gpu_transfer_time += elapsed * 1e-3
def add_cld(
    self,
    name,
    proj_mats,
    offset_mats,
    cloud_xyz,
    kernel,
    scale_params,
    r_traj,
    r_traj_K,
    l_traj,
    l_traj_K,
    update_ptrs=False,
):
    """
    Register a cloud through the base class, then its two trajectories.

    The four trajectory arrays are padded with ``gpu_pad`` before being
    stored, a zero buffer is created for each padded trajectory, and the
    unpadded lengths are recorded.
    """
    # The base call must not refresh pointers; that happens once at the end.
    GPUContext.add_cld(self, name, proj_mats, offset_mats, cloud_xyz,
                       kernel, scale_params, update_ptrs=False)

    traj_shape = (MAX_TRAJ_LEN, DATA_DIM)
    traj_K_shape = (MAX_TRAJ_LEN, MAX_CLD_SIZE)

    r_padded = gpu_pad(r_traj, traj_shape)
    self.r_traj.append(r_padded)
    self.r_traj_K.append(gpu_pad(r_traj_K, traj_K_shape))

    l_padded = gpu_pad(l_traj, traj_shape)
    self.l_traj.append(l_padded)
    self.l_traj_K.append(gpu_pad(l_traj_K, traj_K_shape))

    self.r_traj_w.append(gpuarray.zeros_like(r_padded))
    self.l_traj_w.append(gpuarray.zeros_like(l_padded))

    self.l_traj_dims.append(l_traj.shape[0])
    self.r_traj_dims.append(r_traj.shape[0])

    if update_ptrs:
        self.update_ptrs()
def __init__(self, name, input_shape, n_out, epsW=0.001, epsB=0.002,
             initW=0.01, initB=0.0, weight=None, bias=None):
    """Configure an 'fc' layer and allocate its device buffers.

    ``input_shape`` unpacks as (input size, batch size).  The weight has
    shape ``(n_out, input size)`` and the bias ``(n_out, 1)``; each is
    either drawn from ``np.random.randn`` scaled by ``initW`` / ``initB``
    or taken from the matching argument, then stored as float32 on the
    device together with a zeroed gradient buffer.
    """
    Layer.__init__(self, name, 'fc')

    self.epsW, self.epsB = epsW, epsB
    self.initW, self.initB = initW, initB

    self.inputShape = input_shape
    self.inputSize, self.batchSize = input_shape
    self.outputSize = n_out
    self.weightShape = (self.outputSize, self.inputSize)

    if weight is None:
        weight = np.random.randn(*self.weightShape) * self.initW
    self.weight = gpuarray.to_gpu(weight).astype(np.float32)

    if bias is None:
        bias = np.random.randn(self.outputSize, 1) * self.initB
    self.bias = gpuarray.to_gpu(bias).astype(np.float32)

    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)
def __init__(self, operator, data, u, v, tau, inner_iters,
             relative_tolerance=1e-20, absolute_tolerance=1e-19,
             verbose=0, EHs=None):
    """Set up the inner conjugate-gradient state.

    ``EHs`` may be supplied by the caller; otherwise it is computed by
    applying ``operator.adjoint`` to the reconstruction data.  The data is
    taken from ``operator.dgpu['recondata']`` when available and uploaded
    from ``data.recondata`` otherwise.  The right-hand side is built by
    ``inner_cg_rhs`` and the initial residual is ``rhs - forward(u)``.
    ``self.converged`` is set immediately when the initial ``measure`` is
    at or below ``absolute_tolerance``.
    """
    self._data = data
    self._op = operator
    self._iters = inner_iters
    self._tau = tau
    self._relative_tolerance = relative_tolerance
    self._absolute_tolerance = absolute_tolerance

    src_shape = (self._data.nX1, self._data.nX2, 1)
    self._dest_shape = (self._data.nT, self._data.nC)

    self.converged = False
    self.iteration = 0
    self._verbose = (verbose > 1)

    if EHs is None:
        # A missing ``dgpu`` attribute or ``'recondata'`` entry raises
        # AttributeError / KeyError / TypeError -- never NameError, which
        # the old handler caught -- so the fallback upload was dead.
        try:
            recondata_gpu = self._op.dgpu['recondata']
        except (AttributeError, KeyError, TypeError):
            recondata_gpu = gpuarray.to_gpu(self._data.recondata)  # y
        self.EHs = gpuarray.zeros(src_shape, dtype=np.complex64)
        self._op.adjoint(recondata_gpu, self.EHs)
    else:
        self.EHs = EHs

    self._m = u

    self.rhs = gpuarray.zeros_like(self.EHs)
    inner_cg_rhs(self.rhs, u, v, self.EHs, self._tau)

    self._p_k = gpuarray.zeros_like(self.EHs)
    self._v_k = gpuarray.zeros_like(self.EHs)

    # Residual of the initial guess: rhs - forward(m).
    self._residual_k = gpuarray_copy(self.rhs)
    self._forward(self._m, self._v_k)  # initial guess
    self._residual_k = self._residual_k - self._v_k
    self._v_k = gpuarray_copy(self._residual_k)

    self._rho_0 = measure(self._v_k, self._residual_k)
    self._rho_k = self._rho_0

    if self._rho_0 <= self._absolute_tolerance:
        if self._verbose:
            print("Already converged!")
        self.converged = True
        self.iteration = 0

    self._p_k = gpuarray_copy(self._v_k)
def rfftn(self):
    """Run ``self.plan`` on ``self.arr`` and wrap the two output arrays.

    The plan receives ``self.arr`` and a zero array as its two inputs and
    writes into two freshly zeroed outputs, which are returned as a
    ``CUDAArray``.

    Original author's note: it seems that we can just take half of the
    original fft in both outputs so that we match what was here
    originally.
    """
    imag_in = gpuarray.zeros_like(self.arr)
    real_out = gpuarray.zeros_like(self.arr)
    imag_out = gpuarray.zeros_like(self.arr)
    self.plan.execute(self.arr, imag_in,
                      data_out_re=real_out, data_out_im=imag_out)
    return CUDAArray(real_out, imag_out)
def same_reduce_multiview(target, vec, num_view):
    """Run the multiview reduce kernel and return the summed result.

    The kernel is launched as a single block of ``target.size`` threads
    and writes into two scratch arrays shaped like ``target``.  The first
    scratch array is then viewed as one row and accumulated into a 1x1
    float32 array by ``add_row_sum_to_vec``; that single value is
    returned as a host scalar.
    """
    scratch = gpuarray.zeros_like(target)
    ids = gpuarray.zeros_like(target)
    _same_reduce_multiview_(target, vec, scratch, ids, I(num_view),
                            block=(target.size, 1, 1), grid=(1, 1))

    as_row = scratch.reshape((1, scratch.size))
    total = gpuarray.to_gpu(np.zeros((1, 1)).astype(np.float32))
    add_row_sum_to_vec(total, as_row)
    return total.get()[0, 0]
def __init__(self, volume, template, mask, gpu):
    """Upload volume and mask, wrap the template and allocate buffers.

    FFT plans are created for the volume's shape (forward: volume dtype
    to complex64; inverse: the reverse).  The score, angle and
    correlation maps are zero-filled float32 arrays shaped like the
    volume.
    """
    self.gpu = gpu

    self.volume = gu.to_gpu(volume)
    self.template = Volume(template)
    self.mask = gu.to_gpu(mask)

    shape, dtype = volume.shape, volume.dtype
    self.fwd_plan = Plan(shape, dtype, np.complex64)
    self.inv_plan = Plan(shape, np.complex64, dtype)

    self.volume_fft = gu.zeros_like(self.volume, dtype=np.complex64)
    self.template_fft = gu.zeros_like(self.template.d_data,
                                      dtype=np.complex64)

    self.ccc_map = gu.zeros_like(self.volume, dtype=np.float32)
    self.norm_volume = np.prod(shape)
    self.scores = gu.zeros_like(self.volume, dtype=np.float32)
    self.angles = gu.zeros_like(self.volume, dtype=np.float32)
def _conv3d_sep(data, kz, ky, kx):
    """Apply three 1-D kernels to a 3-D array, one launch per axis.

    The CUDA source is read from ``__cudafile__``.  The passes run in the
    order ``conv3d_axis0`` (``kz``), ``conv3d_axis1`` (``ky``),
    ``conv3d_axis2`` (``kx``), ping-ponging between two scratch buffers.
    The device array holding the last pass's output is returned.
    """
    assert data.ndim == 3

    with open(__cudafile__, "r") as f:
        _mod_conv = SourceModule(f.read())
    axis_kernels = [
        _mod_conv.get_function("conv3d_axis0"),
        _mod_conv.get_function("conv3d_axis1"),
        _mod_conv.get_function("conv3d_axis2"),
    ]

    d_gpu = asgpuarray(data)
    taps_gpu = [asgpuarray(k, np.float32) for k in (kz, ky, kx)]

    r1_gpu = gpuarray.zeros_like(d_gpu)
    r2_gpu = gpuarray.zeros_like(d_gpu)

    shape = np.asarray(data.shape[::-1], dtype=int3)
    block, grid = grid_kernel_config(axis_kernels[0], data.shape)

    # (source, destination) buffers for each pass: data -> r1 -> r2 -> r1.
    routes = [(d_gpu, r1_gpu), (r1_gpu, r2_gpu), (r2_gpu, r1_gpu)]

    for launch, taps, dev_taps, (src, dst) in zip(
            axis_kernels, (kz, ky, kx), taps_gpu, routes):
        launch(
            src,
            dev_taps,
            dst,
            shape,
            np.int32(taps.size // 2),
            block=block,
            grid=grid,
            shared=(taps.size * taps.itemsize),
        )

    return r1_gpu
def __init__(self, particle, reference, mask, wedge, maskIsSphere=True):
    """Upload the particle and prepare template, wedge, mask and buffers.

    ``particle`` is uploaded as a device array; ``reference``, ``wedge``
    and ``mask`` are wrapped in ``Volume`` (the mask's ``d_data`` is then
    replaced by a direct upload of ``mask``).  FFT plans and work buffers
    are sized from the particle.  ``maskIsSphere`` is accepted but not
    read here.
    """
    import pycuda.gpuarray as gu
    from voltools.volume import Volume

    self.particle = gu.to_gpu(particle)
    self.template = Volume(reference)
    self.wedge = Volume(wedge)
    self.mask = Volume(mask)
    self.mask.d_data = gu.to_gpu(mask)

    # The previous code read ``volume.dtype``, ``self.reference`` and
    # ``self.volume``, none of which exist in this constructor; the
    # particle and ``self.template`` are the objects actually set up here.
    self.fwd_plan = Plan(particle.shape, particle.dtype, np.complex64)
    self.inv_plan = Plan(particle.shape, np.complex64, particle.dtype)

    self.volume_fft = gu.zeros_like(self.particle, dtype=np.complex64)
    self.template_fft = gu.zeros_like(self.template.d_data,
                                      dtype=np.complex64)
    self.ccc_map = gu.zeros_like(self.particle, dtype=np.float32)
def __init__(self, params, learning_rate_init=0.001, beta_1=0.9,
             beta_2=0.999, epsilon=1e-8):
    """Store the Adam hyper-parameters and create zeroed moment buffers.

    ``self.ms`` and ``self.vs`` each hold one zero-filled device array
    per entry of ``params``; the step counter ``self.t`` starts at 0.
    """
    super(AdamOptimizer, self).__init__(params, learning_rate_init)

    self.beta_1, self.beta_2 = beta_1, beta_2
    self.epsilon = epsilon
    self.t = 0

    self.ms = list(map(gpuarray.zeros_like, params))
    self.vs = list(map(gpuarray.zeros_like, params))
def sqrt_normalize_gpu(img):
    """Apply a signed square root to an image on the GPU, then normalise.

    The first three channels of ``img`` and its fourth channel are
    uploaded separately.  Each is replaced by ``sqrt(|x|)`` multiplied by
    +1 or -1 as selected by ``gpuarray.if_positive``, and the pair is
    passed to ``normalize_gpu``.

    The +1 / -1 constant arrays live in module globals so they can be
    reused between calls; they are (re)built whenever they are missing or
    their shape no longer matches the incoming image.
    """
    global posr, negr, posa, nega, stream

    rgb = gpuarray.to_gpu(img[:, :, :3].copy())
    a = gpuarray.to_gpu(img[:, :, 3].copy())

    # Test type and shape explicitly instead of ``not posr``: truthiness
    # of an array is ambiguous, and constants cached for one image size
    # cannot be combined with an image of another size.
    cache_ok = (isinstance(posr, gpuarray.GPUArray)
                and posr.shape == rgb.shape)
    if not cache_ok:
        posr = gpuarray.zeros_like(rgb) + 1
        negr = gpuarray.zeros_like(rgb) - 1
        posa = gpuarray.zeros_like(a) + 1
        nega = gpuarray.zeros_like(a) - 1

    rgb = cumath.sqrt(abs(rgb), stream=stream) * gpuarray.if_positive(
        rgb, posr, negr, stream=stream)
    a = cumath.sqrt(abs(a), stream=stream) * gpuarray.if_positive(
        a, posa, nega, stream=stream)
    return normalize_gpu(rgb, a)
def __init__(self, gpu_detector, ndaq=1):
    """Allocate the per-channel DAQ buffers and load the ``daq.cu`` module.

    Every buffer has ``gpu_detector.nchannels * ndaq`` entries.  The two
    earliest-time buffers are left uninitialised (``ga.empty``); the
    history and charge buffers start at zero.
    """
    n_slots = gpu_detector.nchannels * ndaq

    self.earliest_time_gpu = ga.empty(n_slots, dtype=np.float32)
    self.earliest_time_int_gpu = ga.empty(n_slots, dtype=np.uint32)

    int_template = self.earliest_time_int_gpu
    self.channel_history_gpu = ga.zeros_like(int_template)
    self.channel_q_int_gpu = ga.zeros_like(int_template)
    self.channel_q_gpu = ga.zeros(len(int_template), dtype=np.float32)

    # Handles borrowed from the detector object.
    self.detector_gpu = gpu_detector.detector_gpu
    self.solid_id_map_gpu = gpu_detector.solid_id_map
    self.solid_id_to_channel_index_gpu = (
        gpu_detector.solid_id_to_channel_index_gpu)

    self.module = get_cu_module('daq.cu', options=cuda_options,
                                include_source_directory=True)
    self.gpu_funcs = GPUFuncs(self.module)

    self.ndaq = ndaq
    self.stride = gpu_detector.nchannels
def __init__(self, volume, template, gpu):
    """Pre-compute the volume's FFT and allocate the matching buffers.

    The volume is uploaded only temporarily: its transform is stored in
    ``self.volume_fft`` and the device copy is dropped once the buffers
    that mirror its shape have been allocated.
    """
    self.gpu = gpu

    volume_gpu = gu.to_gpu(volume)

    self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
    self.volume_fft = gu.zeros_like(volume_gpu, dtype=np.complex64)
    fft(volume_gpu, self.volume_fft, self.fwd_plan)

    self.template_fft = gu.zeros_like(volume_gpu, dtype=np.complex64)
    self.ccc_map = gu.zeros_like(volume_gpu, dtype=np.float32)

    # Element count of the volume.  This is a host-side product of a
    # shape tuple, so NumPy is used -- the same way the sibling
    # constructors in this file compute ``norm_volume``.
    self.norm_volume = np.prod(volume.shape)

    self.padded_volume = gu.zeros_like(volume_gpu, dtype=np.float32)

    # Drop the local reference to the temporary upload.
    del volume_gpu

    self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)
    self.template = Volume(template)
def tvdenoising3d(data, lamda=15, max_iter=100):
    """Run ``max_iter`` rounds of the ``update_u`` / ``update_p`` kernels.

    ``data`` must be 3-dimensional.  The kernels come from
    ``kernels/tv.cu`` next to this module.  The kernels receive
    ``1 / lamda`` as float32, plus two step sizes that are recomputed
    every iteration.  The device array ``u_gpu`` is returned.
    """
    assert data.ndim == 3

    with open(op.join(__dirname__, "kernels", "tv.cu"), "r") as f:
        _mod_tv = SourceModule(f.read())
    gpu_tv_u = _mod_tv.get_function("update_u")
    gpu_tv_p = _mod_tv.get_function("update_p")

    f_gpu = asgpuarray(data)
    u_gpu = f_gpu.copy()
    z_gpu = gpuarray.zeros_like(f_gpu)
    y_gpu = gpuarray.zeros_like(f_gpu)
    x_gpu = gpuarray.zeros_like(f_gpu)

    inv_lamda = np.float32(1.0 / lamda)
    # The shape is passed in reversed axis order.
    shape = np.asarray(data.shape[::-1], dtype=int3)

    block, grid = flat_kernel_config(gpu_tv_u, data.shape)

    for i in range(max_iter):
        # Iteration-dependent step sizes.
        tau2 = np.float32(0.3 + 0.02 * i)
        tau1 = np.float32((1.0 / tau2) * ((1.0 / 6.0) - (5.0 / (15.0 + i))))

        gpu_tv_u(
            f_gpu, z_gpu, y_gpu, x_gpu, u_gpu,
            tau1, inv_lamda, shape,
            block=block, grid=grid,
        )
        gpu_tv_p(u_gpu, z_gpu, y_gpu, x_gpu, tau2, shape,
                 block=block, grid=grid)

    return u_gpu
def create_quantiles(data, params):
    """Sort ``data`` on the GPU and fill the lower/upper quantile buffers.

    Works on the module globals ``quantiles``, ``q_lb``, ``q_ub`` and
    ``mask`` (``mask`` is re-created as zeros whenever its shape differs
    from ``data``; ``q_lb`` is rebound to its reversed copy).  ``params``
    is not read.

    Returns ``(data, q_lb, q_ub, probs * (1 - p_ub), probs * p_ub)`` with
    the quantile arrays and ``p_ub`` fetched to the host, where ``p_ub``
    is the fraction of entries above ``mask`` among those above or below.
    """
    global quantiles, q_lb, q_ub, mask

    sort_gpu(data)

    if mask.shape != data.shape:
        mask = gpuarray.zeros_like(data)

    below = gpuarray.sum(data < mask)
    above = gpuarray.sum(data > mask)

    launch_block = (quantiles.shape[0], 1, 1)
    fill_lb_quantiles(data, quantiles, below, above, q_lb,
                      block=launch_block)
    fill_ub_quantiles(data, quantiles, below, above, q_ub,
                      block=launch_block)

    q_lb = q_lb.reverse()

    p_ub = above / (above + below)
    del below, above

    lower_q = q_lb.get()
    upper_q = q_ub.get()
    return (data, lower_q, upper_q,
            probs * (1 - p_ub.get()), probs * p_ub.get())
def evolve_linear(z, deltax):
    """
    Scale ``deltax`` by the growth factor at redshift ``z`` and save it.

    The product ``deltax * pb.fgrowth(z, COSMO['omega_M_0'])`` is written
    with ``np.save`` under ``parent_folder + "/Boxes/"``; the function
    returns ``None``.

    Original note: the input must be "numpy or 21cmfast".
    """
    fgrowth = pb.fgrowth(z, COSMO['omega_M_0'])  # normalized to 1 at z=0
    #primordial_fgrowth = pb.fgrowth(INITIAL_REDSHIFT, cosmo['omega_M_0']) #normalized to 1 at z=0
    updated = deltax * fgrowth
    np.save(
        parent_folder +
        "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(
            z, HII_DIM, BOX_LEN), updated)
    # The velocity branch below is switched off with a literal ``False``
    # and never runs.
    # NOTE(review): if it is ever re-enabled, ``dDdt_D`` is assigned inside
    # this function and therefore local, so ``dDdt_D(z)`` would raise
    # UnboundLocalError; ``deltax_d`` and ``smallvbox_d`` are also not
    # defined here -- confirm before reviving.
    if False:  # velocity information may not be useful for linear field
        plan = Plan(HII_shape, dtype=np.complex64)
        deltak_d = deltax_d.astype(np.complex64)
        vbox_d = gpuarray.zeros_like(deltak_d)
        plan.execute(deltak_d)
        dDdt_D = np.float32(dDdt_D(z))
        for num, mode in enumerate(['x', 'y', 'z']):
            velocity_kernel(deltak_d,
                            vbox_d,
                            dDdt_D,
                            DIM,
                            np.int32(num),
                            block=block_size,
                            grid=grid_size)
            np.save(
                parent_folder +
                "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format(
                    mode, HII_DIM, BOX_LEN), smallvbox_d.get())
    return
def _compile_kernels(self):
    """Compile the diagonal-extraction kernel and allocate its buffers.

    Sets up the launch geometry (32x32 blocks covering an
    ``nframes`` x ``nframes`` matrix), the ``(nframes, nframes)`` buffer
    receiving the diagonals, two ``nframes``-long sum buffers, and the
    argument list whose first slot is left as ``None``.
    """
    kernel_source = """
    // Extract the upper diagonals of a square (N, N) matrix.
    __global__ void extract_upper_diags(float* matrix, float* diags, int N) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;
        if ((x >= N) || (y >= N) || (y > x)) return;
        int pos = y*N+x;
        int my_diag = x-y;
        diags[my_diag * N + x] = matrix[pos];
    }
    """
    mod = SourceModule(kernel_source)
    self.extract_diags_kernel = mod.get_function("extract_upper_diags")

    nframes = self.nframes
    self._blocks = (32, 32, 1)
    block_x, block_y = self._blocks[0], self._blocks[1]
    self._grid = (updiv(nframes, block_x), updiv(nframes, block_y), 1)

    self.d_diags = garray.zeros((nframes, nframes), dtype=np.float32)
    self.d_sumdiags1 = garray.zeros(nframes, dtype=np.float32)
    self.d_sumdiags2 = garray.zeros_like(self.d_sumdiags1)

    # First slot is a placeholder (``None``) at this point.
    self._kern_args = [None, self.d_diags, np.int32(nframes)]
def _correlate_fft(self, frames_flat, cufft_plan):
    """Correlate the frames through ``cufft_plan`` and return the result.

    The plan's own ``data_in`` / ``data_out`` buffers are reused as
    scratch space, so the statement order below matters.  The quotient
    ``d_numerator / d_denom`` is computed in place and returned as a host
    array.
    """
    npix = frames_flat.shape[1]
    d_in = cufft_plan.data_in
    d_in.fill(0)
    f_out1 = cufft_plan.data_out
    # Second output buffer, so both transforms can be kept at once.
    f_out2 = garray.zeros_like(cufft_plan.data_out)
    # fft(pad(frames_flat), axis=1)
    d_in[:, :self.nframes] = frames_flat.T.astype("f")
    f_out1 = cufft_plan.fft(d_in, output=f_out1)
    # frames_flat.sum(axis=1)
    # skmisc.sum() only works on base data, not gpuarray views,
    # so we sum on the whole array and then extract the right subset.
    skmisc.sum(d_in, axis=0, out=self.d_sums_denom_tmp)
    # fft(pad(frames_flat[::-1]), axis=1)
    d_in.fill(0)
    d_in[:, :self.nframes] = frames_flat.T[:, ::-1].astype("f")
    f_out2 = cufft_plan.fft(d_in, output=f_out2)
    # product, ifft (the inverse transform is written back into d_in)
    f_out1 *= f_out2
    num = cufft_plan.ifft(f_out1, output=d_in)
    # numerator of g_2
    skmisc.sum(num, axis=0, out=self.d_sums)
    # denominator of g_2: correlate(d_sums_denom)
    self._correlate_denom(npix)
    self.d_numerator /= self.d_denom
    res = self.d_numerator.get()
    return res
def execute(self):
    """Average the iterator's images on the GPU.

    The first image fixes the reference shape and becomes
    ``self.resulting_image``; later images whose ``shape`` differs are
    skipped and do not count towards the divisor.  The averaged pixels
    are written back into ``self.resulting_image.image``.
    """
    accumulator = None
    staging = None
    reference_shape = None
    accepted = 0

    for itr_img in self.images_iterator:
        if accumulator is None:
            # First image: copy its pixels to the device as the running sum.
            pixels = np.empty(itr_img.image.shape, dtype=itr_img.image.dtype)
            pixels[:] = itr_img.image[:]
            self.resulting_image = itr_img
            accumulator = gpuarray.to_gpu(pixels)
            staging = gpuarray.zeros_like(accumulator)
            reference_shape = itr_img.shape
            accepted += 1
            continue

        if reference_shape != itr_img.shape:
            # Shape mismatch: ignore this image entirely.
            continue

        accepted += 1
        staging.set(itr_img.image)
        accumulator += staging

    accumulator /= accepted
    self.resulting_image.image[:] = accumulator.get()
def fprop(self, input, output, train=TRAIN):
    """Forward pass through ``cudaconv2.convResponseNormCrossMap``.

    A fresh zero buffer shaped like ``input`` is stored as ``self.denom``
    and handed to the kernel together with ``output``.  ``train`` is not
    read here.  When the module flag ``PFout`` is set, the output is
    printed via ``print_matrix``.
    """
    denom = gpuarray.zeros_like(input)
    self.denom = denom
    cudaconv2.convResponseNormCrossMap(
        input, denom, output,
        self.numColor, self.size, self.scaler, self.pow, self.blocked)

    if PFout:
        print_matrix(output, self.name)
def cuda_run(self, prefix, supportK):
    """Recursively extend ``prefix`` with every frequent item of ``supportK``.

    ``supportK`` is a list of ``(item, bitvector)`` pairs and is consumed
    from the front.  For each item whose bit-vector sum reaches
    ``self.min_support`` the itemset is recorded in ``self.support_list``.
    Its bit vector is then multiplied with every remaining frequent item's
    vector; the products that are still frequent form the suffix for the
    recursive call, sorted by descending integer item value.
    """
    print('Running Eclat in recursive: number of itemsets found:',
          len(self.support_list), end='\r')

    threshold = self.min_support

    while supportK:
        itemset, bitvector = supportK.pop(0)
        support = gpuarray.sum(bitvector).get()
        if support < threshold:
            continue

        extended = prefix + [itemset]
        self.support_list[frozenset(sorted(extended))] = int(support)

        suffix = []
        for itemset_sub, bitvector_sub in supportK:
            if gpuarray.sum(bitvector_sub).get() < threshold:
                continue

            if self.use_optimal:
                union_bitvector = bitvector * bitvector_sub
            else:
                # Custom kernel writing the product into a fresh buffer.
                union_bitvector = gpuarray.zeros_like(bitvector)
                self.multiply(union_bitvector, bitvector, bitvector_sub,
                              block=self.block, grid=self.grid)

            if gpuarray.sum(union_bitvector).get() >= threshold:
                suffix.append((itemset_sub, union_bitvector))

        ordered = sorted(suffix, key=lambda x: int(x[0]), reverse=True)
        self.cuda_run(prefix + [itemset], ordered)
def softmax_back(d_a, d_error, s):
    """Launch ``softmax_back_kernel`` and return its output array.

    The output is a new device array shaped like ``d_a``.  The launch
    uses one thread per element, in blocks of at most
    ``MAX_BLOCK_SIZE`` threads.
    """
    d_out = gpuarray.zeros_like(d_a)
    n_elements = d_out.size

    threads_per_block = min(n_elements, MAX_BLOCK_SIZE)
    n_blocks = int(math.ceil(n_elements / float(threads_per_block)))
    n_blocks = max(n_blocks, 1)

    softmax_back_kernel(
        d_a, d_error, d_out,
        numpy.float32(s), numpy.int32(n_elements),
        block=(threads_per_block, 1, 1), grid=(n_blocks, 1, 1))
    return d_out
def ccl3d(labels, remap=True):
    """Run the three ``uf_*`` kernels from ``kernels/ccl3d.cu`` on ``labels``.

    ``labels`` must be a 3-D ``uint32`` array.  The kernels run in the
    order ``uf_local``, ``uf_global``, ``uf_final``.  With ``remap`` set,
    the result is fetched to the host and passed through
    ``remap_labels``; otherwise the device array is returned as is.
    """
    assert labels.ndim == 3
    assert labels.dtype == np.uint32

    with open(op.join(__dirname__, 'kernels', 'ccl3d.cu'), 'r') as f:
        source = f.read()
    _mod_conv = SourceModule(source)
    gpu_ccl_local = _mod_conv.get_function('uf_local')
    gpu_ccl_global = _mod_conv.get_function('uf_global')
    gpu_ccl_final = _mod_conv.get_function('uf_final')

    labels_gpu = asgpuarray(labels, dtype=np.uint32)
    result_gpu = gpuarray.zeros_like(labels_gpu)

    # The shape is passed in reversed axis order.
    shape = np.asarray(tuple(labels.shape[::-1]), dtype=int3)

    block, grid = grid_kernel_config(gpu_ccl_local, labels.shape)
    launch = dict(block=block, grid=grid)
    # Shared memory: 8 bytes per thread of the block.
    shared = int(np.prod(block) * 8)

    gpu_ccl_local(labels_gpu, result_gpu, shape, shared=shared, **launch)
    gpu_ccl_global(labels_gpu, result_gpu, shape, **launch)
    gpu_ccl_final(result_gpu, shape, **launch)

    return remap_labels(result_gpu.get()) if remap else result_gpu
def custom_filter_gpu(image, template):
    """Filter ``image`` with a normalised square ``template`` window.

    ``template`` must be square with an odd side length, otherwise
    ``ValueError`` is raised.  Each output element accumulates the
    template coefficients (divided by their sum) times the input
    elements under the window centred on it; window cells that fall
    outside the image are skipped.  The image is indexed with three
    subscripts (row, column, colour).  The result is returned as a host
    array.

    Every element is read and written through individual GPU-array
    indexing inside nested Python loops.
    """
    # The template has to be square ...
    if not (template.shape[0] == template.shape[1]):
        raise ValueError("Шаблона должен быть квадратным")
    # ... and its side has to be odd so that it has a centre cell.
    if template.shape[0] % 2 == 0:
        raise ValueError("Сторона шаблона должена быть нечетной")
    # Half-width of the window (cells on each side of the centre).
    filtersize05 = template.shape[0] // 2
    image_gpu = gpuarray.to_gpu(image)
    filtered_image = gpuarray.zeros_like(image_gpu)
    s = template.sum()
    # Flattened template, normalised so that the coefficients sum to 1.
    window = gpuarray.to_gpu(
        np.array([coef / s for coef in template.flatten()]))
    shape = filtered_image.shape
    wid = 0
    for i in range(shape[0]):
        for j in range(shape[1]):
            for color in range(shape[2]):
                # Restart the running index into the flattened window.
                wid = 0
                for m in range(i - filtersize05, i + filtersize05 + 1):
                    for n in range(j - filtersize05, j + filtersize05 + 1):
                        # Only cells that lie inside the image contribute.
                        if 0 <= m and m < shape[0] and 0 <= n and n < shape[1]:
                            filtered_image[i][j][
                                color] += window[wid] * image_gpu[m][n][color]
                        # NOTE(review): wid is advanced for every window
                        # cell, including skipped ones, so it stays aligned
                        # with the template -- confirm against the original
                        # indentation.
                        wid += 1
    return filtered_image.get()
def ewsum(d_a, d_w):
    """Run ``ewsum_kernel`` followed by ``ewsum_sum_kernel``.

    The first kernel writes into a scratch array shaped like ``d_a``; the
    second writes into the returned float32 array, whose leading
    dimension is ``d_a.shape[0] // d_w.shape[0]`` and whose remaining
    dimensions match ``d_a``.

    Author's note (YORI): this is faster than the CPU when ``num_w`` is
    large and ``non_width`` is small -- a large ``num_w`` keeps the loop
    short, while a large ``non_width`` needs more threads.
    """
    width = d_a.shape[0]
    total_dim = d_a.size
    num_w = d_w.shape[0]

    d_tmp_out = gpuarray.zeros_like(d_a)

    thread_size = min(d_a.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_a.size / float(thread_size))), 1)
    ewsum_kernel(d_a, d_w, d_tmp_out,
                 numpy.int32(num_w), numpy.int32(width),
                 numpy.int32(total_dim),
                 block=(thread_size, 1, 1), grid=(block_size, 1, 1))

    # TODO: There HAS to be a better way to do this
    # Floor division: the quotient is used as an array dimension, and
    # ``/`` yields a float on Python 3.
    x = width // num_w
    d_out = gpuarray.zeros((x,) + d_a.shape[1:], numpy.float32)

    thread_size = min(d_out.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_out.size / float(thread_size))), 1)
    ewsum_sum_kernel(d_tmp_out, d_out,
                     numpy.int32(num_w), numpy.int32(width),
                     numpy.int32(total_dim),
                     block=(thread_size, 1, 1), grid=(block_size, 1, 1))
    return d_out
def test_cublasDcopy(self):
    """``cublasDcopy`` must copy a 5-element float64 vector unchanged."""
    host_x = np.random.rand(5).astype(np.float64)
    src = gpuarray.to_gpu(host_x)
    dst = gpuarray.zeros_like(src)

    # Unit stride on both the source and the destination.
    cublas.cublasDcopy(self.cublas_handle, src.size,
                       src.gpudata, 1,
                       dst.gpudata, 1)

    assert np.allclose(dst.get(), src.get())
def setUp(self):
    """Create the communicators and matching CPU/GPU send/receive buffers.

    The send buffers hold the same random float32 data of shape
    ``TEST_DIMS``; the receive buffers are zero-filled.
    """
    self.comm = MPI.COMM_WORLD
    self.gpu_comm = GPUComm(MPI.COMM_WORLD)

    payload = np.random.rand(*TEST_DIMS).astype(np.float32)
    self.cpu_send = payload
    self.gpu_send = gpu.to_gpu(payload)

    self.cpu_recv = np.zeros_like(self.cpu_send)
    self.gpu_recv = gpu.zeros_like(self.gpu_send)
def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
    """Allocate the DAQ buffers for whichever GPU API is active.

    Under CUDA the arrays are created directly and ``daq.cu`` is loaded;
    under OpenCL every allocation goes through ``cl_queue`` and
    ``daq.cl`` is built in ``cl_context``.  Any other API state raises
    ``RuntimeError``.  All buffers have ``nchannels * ndaq`` entries.
    """
    n_slots = gpu_detector.nchannels * ndaq

    if api.is_gpu_api_cuda():
        self.earliest_time_gpu = ga.empty(n_slots, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(n_slots, dtype=np.uint32)
        int_template = self.earliest_time_int_gpu
        self.channel_history_gpu = ga.zeros_like(int_template)
        self.channel_q_int_gpu = ga.zeros_like(int_template)
        self.channel_q_gpu = ga.zeros(len(int_template), dtype=np.float32)
        self.detector_gpu = gpu_detector.detector_gpu
        self.module = cutools.get_cu_module(
            'daq.cu', options=api_options, include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.earliest_time_gpu = ga.empty(cl_queue, n_slots,
                                          dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(cl_queue, n_slots,
                                              dtype=np.uint32)
        self.channel_history_gpu = ga.zeros(cl_queue, n_slots,
                                            dtype=np.uint32)
        self.channel_q_int_gpu = ga.zeros(cl_queue, n_slots,
                                          dtype=np.uint32)
        self.channel_q_gpu = ga.zeros(cl_queue, n_slots, dtype=np.float32)
        # struct not made in opencl mode, so we keep a copy of the class
        self.detector_gpu = gpu_detector
        self.module = cltools.get_cl_module(
            'daq.cl', cl_context, options=api_options,
            include_source_directory=True)
    else:
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")

    # Set-up shared by both back-ends.
    self.solid_id_map_gpu = gpu_detector.solid_id_map
    self.solid_id_to_channel_index_gpu = (
        gpu_detector.solid_id_to_channel_index_gpu)
    self.gpu_funcs = GPUFuncs(self.module)
    self.ndaq = ndaq
    self.stride = gpu_detector.nchannels
def same_reduce_multiview(target, vec, num_view):
    """Reduce ``target`` against ``vec`` over ``num_view`` views.

    One block of ``target.size`` threads runs
    ``_same_reduce_multiview_``, filling two scratch arrays shaped like
    ``target``.  The first one is reshaped to a single row and summed
    into a 1x1 float32 device array; its only element is returned.
    """
    launch = dict(block=(target.size, 1, 1), grid=(1, 1))

    partial = gpuarray.zeros_like(target)
    ids = gpuarray.zeros_like(target)
    _same_reduce_multiview_(target, vec, partial, ids, I(num_view), **launch)

    partial = partial.reshape((1, partial.size))
    result = gpuarray.to_gpu(np.zeros((1, 1)).astype(np.float32))
    add_row_sum_to_vec(result, partial)

    host_result = result.get()
    return host_result[0, 0]
def exp(d_a, mode=MathModes.ACC):
    """Exponentiate ``d_a``, using the fast kernel unless ``mode`` is ACC.

    ``MathModes.ACC`` delegates to ``cumath.exp``.  Any other mode
    launches ``exp_fast_kernel`` with one thread per element (blocks of
    at most ``MAX_BLOCK_SIZE`` threads) into a new array shaped like
    ``d_a``.
    """
    if mode == MathModes.ACC:
        return cumath.exp(d_a)

    n_elements = d_a.size
    d_out = gpuarray.zeros_like(d_a)

    threads_per_block = min(n_elements, MAX_BLOCK_SIZE)
    n_blocks = max(
        int(math.ceil(n_elements / float(threads_per_block))), 1)

    exp_fast_kernel(d_a, d_out, numpy.int32(n_elements),
                    block=(threads_per_block, 1, 1),
                    grid=(n_blocks, 1, 1))
    return d_out
def __init__(self, name, type, epsW, epsB, initW, initB, momW, momB, wc,
             weight, bias, weightIncr, biasIncr, weightShape, biasShape):
    """Initialise a layer that owns a weight matrix and a bias vector.

    Each of ``weight``, ``bias``, ``weightIncr`` and ``biasIncr`` is
    either ``None`` (a fresh device buffer is built) or a host array to
    upload, in which case a notice is written to stderr.  Gradient
    buffers are always zero-filled.
    """
    Layer.__init__(self, name, type)

    self.epsW = F(epsW)
    self.epsB = F(epsB)
    self.initW = initW
    self.initB = initB
    self.momW = F(momW)
    self.momB = F(momB)
    self.wc = F(wc)

    if weight is None:
        self.weight = gpuarray.to_gpu(
            randn(weightShape, np.float32) * self.initW)
    else:
        # Replaces ``print >> sys.stderr, ...``: same output, and it also
        # works on Python 3, where the chevron form raises TypeError.
        sys.stderr.write('init weight from disk\n')
        self.weight = gpuarray.to_gpu(weight)

    if bias is None:
        if self.initB > 0.0:
            self.bias = gpuarray.to_gpu(
                np.ones(biasShape, dtype=np.float32) * self.initB)
        else:
            self.bias = gpuarray.zeros(biasShape, dtype=np.float32)
    else:
        sys.stderr.write('init bias from disk\n')
        self.bias = gpuarray.to_gpu(bias).astype(np.float32)

    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)

    # NOTE(review): both increment buffers are gated on momW (momB is not
    # consulted) -- preserved as found; confirm this is intended.
    if self.momW > 0.0:
        if weightIncr is None:
            self.weightIncr = gpuarray.zeros_like(self.weight)
        else:
            sys.stderr.write('init weightIncr from disk\n')
            self.weightIncr = gpuarray.to_gpu(weightIncr)

        if biasIncr is None:
            self.biasIncr = gpuarray.zeros_like(self.bias)
        else:
            sys.stderr.write('init biasIncr from disk\n')
            self.biasIncr = gpuarray.to_gpu(biasIncr)
def rectify_back(d_a, d_error, inplace=False):
    """Launch ``rectify_back_kernel`` and return the array it wrote to.

    With ``inplace`` the kernel writes into ``d_a`` itself; otherwise a
    new array shaped like ``d_a`` is allocated.  One thread is launched
    per element, in blocks of at most ``MAX_BLOCK_SIZE`` threads.
    """
    d_out = d_a if inplace else gpuarray.zeros_like(d_a)
    n_elements = d_out.size

    threads_per_block = min(n_elements, MAX_BLOCK_SIZE)
    n_blocks = max(
        int(math.ceil(n_elements / float(threads_per_block))), 1)

    rectify_back_kernel(d_a, d_error, d_out, numpy.int32(n_elements),
                        block=(threads_per_block, 1, 1),
                        grid=(n_blocks, 1, 1))
    return d_out
def add_cld(self, name, proj_mats, offset_mats, cloud_xyz, kernel,
            scale_params, update_ptrs=False):
    """
    adds a new cloud to our context for batch processing

    All inputs are validated, and all padded device copies are built,
    before any bookkeeping on ``self`` is changed.  A rejected cloud
    therefore leaves the context exactly as it was instead of
    half-registered.

    Raises ``ValueError`` when an offset matrix, the cloud, or the kernel
    does not have the expected shape (same messages as before).
    """
    self.check_cld(cloud_xyz)

    # ---- 1. validate (no state touched yet) ---------------------------
    n = cloud_xyz.shape[0]
    for b in self.bend_coefs:
        if offset_mats[b].shape != (n + DATA_DIM + 1, DATA_DIM):
            raise ValueError("Offset Matrix has incorrect dimension")
    if n > MAX_CLD_SIZE or cloud_xyz.shape[1] != DATA_DIM:
        raise ValueError("cloud_xyz has incorrect dimension")
    if kernel.shape != (n, n):
        raise ValueError("dimension mismatch b/t kernel and cloud")

    # ---- 2. stage the padded device copies ----------------------------
    padded_rows = MAX_CLD_SIZE + DATA_DIM + 1
    # A list rather than a dict, so repeated coefficients are appended
    # once per occurrence, as before.
    staged_mats = [
        (b,
         gpu_pad(proj_mats[b], (padded_rows, MAX_CLD_SIZE)),
         gpu_pad(offset_mats[b], (padded_rows, DATA_DIM)))
        for b in self.bend_coefs
    ]
    padded_pts = gpu_pad(cloud_xyz, (MAX_CLD_SIZE, DATA_DIM))
    padded_kernel = gpu_pad(kernel, (MAX_CLD_SIZE, MAX_CLD_SIZE))
    tps_params = self.default_tps_params.copy()

    # ---- 3. commit ----------------------------------------------------
    self.ptrs_valid = False
    self.N += 1
    self.seg_names.append(name)
    self.names2inds[name] = self.N - 1

    # The three stored slices all index into the same parameter block.
    self.tps_params.append(tps_params)
    self.trans_d.append(tps_params[0, :])
    self.lin_dd.append(tps_params[1:DATA_DIM + 1, :])
    self.w_nd.append(tps_params[DATA_DIM + 1:, :])
    self.scale_params.append(scale_params)

    for b, padded_proj, padded_offset in staged_mats:
        self.proj_mats[b].append(padded_proj)
        self.offset_mats[b].append(padded_offset)

    self.pts.append(padded_pts)
    self.kernels.append(padded_kernel)
    self.dims.append(n)

    # Zero-filled per-cloud work buffers.
    self.pts_w.append(gpuarray.zeros_like(padded_pts))
    self.pts_t.append(gpuarray.zeros_like(padded_pts))
    self.corr_cm.append(
        gpuarray.zeros((MAX_CLD_SIZE, MAX_CLD_SIZE), np.float32))
    self.corr_rm.append(
        gpuarray.zeros((MAX_CLD_SIZE, MAX_CLD_SIZE), np.float32))
    self.r_coefs.append(gpuarray.zeros((MAX_CLD_SIZE, 1), np.float32))
    self.c_coefs_rn.append(gpuarray.zeros((MAX_CLD_SIZE, 1), np.float32))
    self.c_coefs_cn.append(gpuarray.zeros((MAX_CLD_SIZE, 1), np.float32))

    if update_ptrs:
        self.update_ptrs()
def map_elementwise_max(self, op, field_expr):
    """Evaluate ``field_expr`` and run the element-wise-max kernel on it.

    ``op`` is accepted for the mapper interface but is not used here.

    Returns:
        A new GPU array shaped like the evaluated field, filled by the
        kernel obtained from ``self.executor.get_elwise_max_kernel``.
    """
    src = self.rec(field_expr)
    dst = gpuarray.zeros_like(src)

    kernel_rec = self.executor.get_elwise_max_kernel(src.dtype)
    launch_grid = (kernel_rec.grid_dim, 1)
    kernel_rec.func.prepared_call(
        launch_grid, src.gpudata, dst.gpudata, kernel_rec.mb_count)

    return dst
def expit_back(d_a, d_error):
    """Implements the following function

        out = in * (1 - in) * error

    Args:
        d_a: GPU array supplying ``in``.
        d_error: GPU array supplying ``error``.

    Returns:
        A newly allocated GPU array shaped like ``d_a`` holding ``out``.
    """
    d_out = gpuarray.zeros_like(d_a)

    n_elements = d_a.size
    if n_elements == 0:
        # An empty array would give 0 threads per block and a division by
        # zero in the grid computation below; there is nothing to compute.
        return d_out

    # One-dimensional launch covering every element.
    threads_per_block = min(n_elements, MAX_BLOCK_SIZE)
    n_blocks = max(int(math.ceil(n_elements / float(threads_per_block))), 1)
    expit_back_kernel(d_a, d_error, d_out, numpy.int32(n_elements),
                      block=(threads_per_block, 1, 1),
                      grid=(n_blocks, 1, 1))
    return d_out
def robust_pca(D):
    """
    Parallel RPCA using ALM, adapted from https://github.com/nwbirnie/rpca.

    Takes a numpy array ``D`` and returns two numpy arrays ``(L, S)``, the
    final values of the two iterates when ``converged(M, L, S)`` becomes
    true.  The shape of the input is printed as a side effect.
    """
    M = gpuarray.to_gpu(D)
    L = gpuarray.zeros_like(M)
    S = gpuarray.zeros_like(M)
    Y = gpuarray.zeros_like(M)
    print(M.shape)

    n_entries = M.shape[0] * M.shape[1]
    mu = n_entries / (4.0 * L1Norm(M))
    lamb = max(M.shape) ** -0.5

    while True:
        if converged(M, L, S):
            break
        L = svd_shrink(M - S - (mu ** -1) * Y, mu)
        S = shrink(M - L + (mu ** -1) * Y, lamb * mu)
        residual = M - L - S
        Y = Y + mu * residual

    return L.get(), S.get()
def test_2d_fp_surfaces(self):
    """Round-trip a 2D array through a layered CUDA surface for several dtypes.

    For each dtype the kernel is launched twice on the same device buffer:
    first to write the buffer into the surface at ``(row, col)``, then to
    read it back at ``(col, row)``.  The buffer must therefore end up equal
    to the transpose of its original contents.
    """
    orden = "C"
    npoints = 32

    # Helper type names used for the (currently no-op) 'fpName' substitution;
    # dtypes not listed fall back to their plain C type name.
    fp_type_names = {
        np.complex64: 'fp_tex_cfloat',
        np.complex128: 'fp_tex_cdouble',
        np.float64: 'fp_tex_double',
    }

    kernel_template = '''
    #include <pycuda-helpers.hpp>

    surface<void, cudaSurfaceType2DLayered> mtx_tex;

    __global__ void copy_texture(cuPres *dest, int rw)
    {
      int row = blockIdx.x*blockDim.x + threadIdx.x;
      int col = blockIdx.y*blockDim.y + threadIdx.y;
      int layer = 1;
      int tid = row + col*blockDim.x*gridDim.x ;
      if (rw==0){
      cuPres aux = dest[tid];
      fp_surf2DLayeredwrite(aux, mtx_tex, row, col, layer,cudaBoundaryModeClamp);}
      else {
      cuPres aux = 0;
      fp_surf2DLayeredread(&aux, mtx_tex, col, row, layer, cudaBoundaryModeClamp);
      dest[tid] = aux;
      }
    }
    '''

    # Launch geometry is the same for every dtype: 8x8 blocks, with the grid
    # rounded up so that it covers npoints in each direction.
    cuBlock = (8, 8, 1)
    if cuBlock[0] > npoints:
        cuBlock = (npoints, npoints, 1)
    cuGrid = (npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
              npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
              1)

    for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]:
        prec_str = dtype_to_ctype(prec)
        fpName_str = fp_type_names.get(prec, prec_str)

        host_init = np.zeros([npoints, npoints], order=orden, dtype=prec)
        host_init[:] = np.random.rand(npoints, npoints)[:]
        A_gpu = gpuarray.to_gpu(host_init)  # Array randomized

        source = kernel_template.replace('fpName', fpName_str)
        source = source.replace('cuPres', prec_str)
        modW = SourceModule(source)

        copy_texture = modW.get_function("copy_texture")
        mtx_tex = modW.get_surfref("mtx_tex")
        copy_texture.prepare('Pi')  # ,texrefs=[mtx_tex])

        # Zero-filled array backing the surface.
        A_gpu2 = gpuarray.zeros_like(A_gpu)
        cudaArray = drv.gpuarray_to_array(A_gpu2, orden, allowSurfaceBind=True)
        expected_src = A_gpu.get()  # To remember original array
        mtx_tex.set_array(cudaArray)

        # Write random array
        copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata, np.int32(0))
        # Read, but transposed
        copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata, np.int32(1))

        mismatch = np.sum(np.abs(A_gpu.get() - np.transpose(expected_src)))
        assert mismatch == np.array(0, dtype=prec)
        A_gpu.gpudata.free()
def gpu_fft(data, inverse=False):
    """Run the module-level FFT plan on ``data`` and return a new GPU array.

    The plan is built lazily on the first call, from ``data.shape`` and the
    module-level ``stream``, and is then kept in the global ``plan``.

    NOTE(review): the cached plan is reused for every later call regardless
    of ``data.shape`` -- confirm callers always pass the same shape.

    Args:
        data: GPU array to transform.
        inverse: forwarded to ``plan.execute``.

    Returns:
        A freshly allocated GPU array, shaped like ``data``, with the result.
    """
    global plan, ctx, stream  ##cuda

    if not plan:
        print('%s %s' % ('building plan', data.shape))
        plan = Plan(data.shape, stream=stream, wait_for_finish=True)

    transformed = gpuarray.zeros_like(data)
    plan.execute(data, data_out=transformed, inverse=inverse)
    return transformed
def expit(d_a, mode=MathModes.ACC):
    """Implements the expit function (aka sigmoid)

        expit(x) = 1 / (1 + exp(-x))

    Args:
        d_a: GPU array of inputs.
        mode: ``MathModes.FAST`` selects ``expit_fast_kernel``; any other
            value selects ``expit_kernel``.

    Returns:
        A newly allocated GPU array shaped like ``d_a`` holding the result.
    """
    d_out = gpuarray.zeros_like(d_a)

    n_elements = d_a.size
    if n_elements == 0:
        # An empty array would give 0 threads per block and a division by
        # zero in the grid computation below; there is nothing to compute.
        return d_out

    # One-dimensional launch covering every element.
    threads_per_block = min(n_elements, MAX_BLOCK_SIZE)
    n_blocks = max(int(math.ceil(n_elements / float(threads_per_block))), 1)

    kernel = expit_fast_kernel if mode == MathModes.FAST else expit_kernel
    kernel(d_a, d_out, numpy.int32(n_elements),
           block=(threads_per_block, 1, 1),
           grid=(n_blocks, 1, 1))
    return d_out
def __init__(self, mesh, context=None):
    '''
    Args:
        mesh      The mesh on which the solver will operate. The
                  dimensionality is deduced from mesh.dimension
        context   Stored on the instance as ``_context``; not used further
                  in this constructor.
    '''
    self.mesh = mesh
    self._context = context

    mesh_shape = self.mesh.shape  # nz, ny, (nx)
    doubled_shape = [2 * n for n in mesh_shape]  # 2*nz, 2*ny, (2*nx)
    mesh_distances = list(reversed(self.mesh.distances))  # dz, dy, dx

    # Transformed Green's function and a scratch buffer, both on the
    # doubled grid.
    self.fgreentr = gpuarray.empty(doubled_shape, dtype=np.complex128)
    self.tmpspace = gpuarray.zeros_like(self.fgreentr)
    sizeof_complex = np.dtype(np.complex128).itemsize

    # Dimensionality dispatch: pick the 2d/3d variants by name.
    dim = self.mesh.dimension
    dim_tag = str(dim) + 'd'
    self._fgreen = getattr(self, '_fgreen' + dim_tag)
    self._mirror = getattr(self, '_mirror' + dim_tag)
    copy_fn = {'3d': get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
    memcpy_nd = copy_fn[dim_tag]

    dim_args = self.mesh.shape
    rho_pitch = self.mesh.nx * sizeof_complex
    tmp_pitch = 2 * self.mesh.nx * sizeof_complex

    # Device-to-device copies between rho and the scratch buffer.  The rho
    # side is passed as None because it is not known yet.
    self._cpyrho2tmp = memcpy_nd(
        src=None,
        dst=self.tmpspace,
        src_pitch=rho_pitch,
        dst_pitch=tmp_pitch,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=self.mesh.ny,
        dst_height=2 * self.mesh.ny)
    self._cpytmp2rho = memcpy_nd(
        src=self.tmpspace,
        dst=None,
        src_pitch=tmp_pitch,
        dst_pitch=rho_pitch,
        dim_args=dim_args,
        itemsize=np.dtype(np.complex128).itemsize,
        src_height=2 * self.mesh.ny,
        dst_height=self.mesh.ny)

    # Per-axis coordinates, starting half a cell below zero, with one more
    # point than the mesh has cells; mesh_arr is [mz, my, mx].
    mesh_arr = []
    for axis in range(self.mesh.dimension):
        spacing = mesh_distances[axis]
        mesh_arr.append(
            -spacing / 2 + np.arange(mesh_shape[axis] + 1) * spacing)
    mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')

    fgreen = self._fgreen(*mesh_grids)
    fgreen = self._mirror(fgreen)

    self.plan_forward = cu_fft.Plan(self.tmpspace.shape,
                                    in_dtype=np.complex128,
                                    out_dtype=np.complex128)
    self.plan_backward = cu_fft.Plan(self.tmpspace.shape,
                                     in_dtype=np.complex128,
                                     out_dtype=np.complex128)

    # Transform the Green's function once and keep the result.
    cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr,
               plan=self.plan_forward)
def __init__(self, name, type, epsW, epsB, initW, initB, weight, bias, weightShape, biasShape):
    """Set up a layer with float32 weight and bias arrays on the GPU.

    Args:
        name, type: forwarded to ``Layer.__init__``.
        epsW, epsB: stored as ``self.epsW`` / ``self.epsB``.
        initW, initB: scale applied to the standard-normal samples used
            when ``weight`` / ``bias`` are not supplied.
        weight, bias: initial host values, or ``None`` to draw random ones.
        weightShape, biasShape: shapes of the random initial values; only
            used when the corresponding array is ``None``.
    """
    Layer.__init__(self, name, type)
    self.epsW, self.epsB = epsW, epsB
    self.initW, self.initB = initW, initB

    if weight is not None:
        host_weight = weight
    else:
        host_weight = np.random.randn(*weightShape) * self.initW
    self.weight = gpuarray.to_gpu(host_weight).astype(np.float32)

    if bias is not None:
        host_bias = bias
    else:
        host_bias = np.random.randn(*biasShape) * self.initB
    self.bias = gpuarray.to_gpu(host_bias).astype(np.float32)

    # Gradient accumulators start at zero and match the parameters.
    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)
def same_reduce(target, vec):
    '''
    Return the number of same values in the same offset of two vecs.

    ``_same_reduce_`` is launched as a single block with one thread per
    element of ``target`` and fills a scratch array; the scratch array is
    then summed with ``add_row_sum_to_vec`` and the total returned as int.
    '''
    scratch = gpuarray.zeros_like(target)
    _same_reduce_(target, vec, scratch,
                  block=(target.size, 1, 1), grid=(1, 1))

    # View the scratch data as a single row so one row-sum yields the total.
    scratch.shape = (1, scratch.size)
    total = gpuarray.to_gpu(np.zeros((1, 1), dtype=np.float32))
    add_row_sum_to_vec(total, scratch)

    return int(total.get()[0, 0])
def test_cublasZgeam(self):
    """cublasZgeam must compute ``alpha*a + beta*b`` for complex128 inputs."""
    rows, cols = 2, 3

    def random_matrix():
        # Real part is drawn first, then the imaginary part.
        real = np.random.rand(rows, cols)
        imag = np.random.rand(rows, cols)
        return (real + 1j * imag).astype(np.complex128)

    def random_scalar():
        real = np.random.rand()
        imag = np.random.rand()
        return np.complex128(real + 1j * imag)

    a = random_matrix()
    b = random_matrix()
    a_gpu = gpuarray.to_gpu(a.copy())
    b_gpu = gpuarray.to_gpu(b.copy())
    c_gpu = gpuarray.zeros_like(a_gpu)
    alpha = random_scalar()
    beta = random_scalar()

    cublas.cublasZgeam(self.cublas_handle, 'n', 'n', rows, cols,
                       alpha, a_gpu.gpudata, rows,
                       beta, b_gpu.gpudata, rows,
                       c_gpu.gpudata, rows)

    expected = alpha * a + beta * b
    assert np.allclose(c_gpu.get(), expected)
def _transform_wf(self, ps, qs):
    """Run ``self._kernel`` on ``ps``/``qs`` and return a complex host array.

    Two double-precision device buffers with the broadcast shape of ``ps``
    and ``qs`` receive the real and imaginary parts; they are copied back
    and combined as ``real + 1j * imag``.
    """
    out_shape = N.broadcast(ps, qs).shape
    real_gpu = gpuarray.zeros(out_shape, N.double)
    imag_gpu = gpuarray.zeros_like(real_gpu)

    # The kernel receives raw device pointers, so upload contiguous copies.
    ps_gpu = gpuarray.to_gpu(N.ascontiguousarray(ps))
    qs_gpu = gpuarray.to_gpu(N.ascontiguousarray(qs))

    self._kernel.prepared_call(
        self._gpu_grid,
        self._gpu_block,
        ps_gpu.gpudata,
        qs_gpu.gpudata,
        self._wf_q_grid_gpu.gpudata,
        self._wf_gpu.gpudata,
        real_gpu.gpudata,
        imag_gpu.gpudata,
    )

    return real_gpu.get() + 1j * imag_gpu.get()
def run(self):
    """Create a CUDA context on device ``self.number`` and run one FFT.

    ``self.input_cpu`` is uploaded, transformed with a ``Plan`` of shape
    ``(p,)`` executed with ``batch=p/m`` (``p`` and ``m`` come from the
    enclosing module), and the result is copied back into ``self.temp`` and
    put on ``self.output_cpu``.

    NOTE(review): the context made here is never popped or detached in this
    method -- confirm it is released elsewhere.
    """
    drv.init()
    plan_template = numpy.zeros((p,), dtype=numpy.complex64)

    self.dev = drv.Device(self.number)
    self.ctx = self.dev.make_context()

    # TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION
    # (THIS DOES NOT HAPPEN IN MULTITHREADING)
    print(drv.mem_get_info())

    # gpu_a is first allocated empty so that gpu_b can be shaped after it,
    # then replaced by the uploaded input.
    self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
    self.gpu_b = garray.zeros_like(self.gpu_a)
    self.gpu_a = garray.to_gpu(self.input_cpu)

    fft_plan = Plan(plan_template.shape, context=self.ctx)
    fft_plan.execute(self.gpu_a, self.gpu_b, batch=p / m)

    self.temp = self.gpu_b.get()
    # NOTE(review): this reads a bare ``output_cpu`` name, not
    # ``self.output_cpu`` -- confirm such a module-level name exists.
    print(output_cpu._closed)
    self.output_cpu.put(self.temp)
def _transform_wf(self, t):
    """Run ``self._kernel`` for parameter ``t`` and return a complex host array.

    The result has shape ``(len(self._p_grid), len(self._q_grid))``.  The
    kernel writes real and imaginary parts into two double-precision device
    buffers, which are copied back and combined as ``real + 1j * imag``.
    ``t`` is handed to the kernel unchanged.
    """
    out_shape = (len(self._p_grid), len(self._q_grid))
    real_gpu = gpuarray.zeros(out_shape, np.double)
    imag_gpu = gpuarray.zeros_like(real_gpu)

    kernel_args = (
        self._p_grid_gpu.gpudata,
        self._q_grid_gpu.gpudata,
        self._wf_q_grid_gpu.gpudata,
        self._wfs_gpu.gpudata,
        self._energies_gpu.gpudata,
        t,
        real_gpu.gpudata,
        imag_gpu.gpudata,
    )
    self._kernel.prepared_call(self._gpu_grid, self._gpu_block, *kernel_args)

    return real_gpu.get() + 1j * imag_gpu.get()
def backprop(self, input_data, targets, cache=None):
    """Back-propagate through every task and combine the results.

    Args:
        input_data: GPU array passed to each task's ``backprop``.
        targets: one target per task, in the order of ``self.tasks``.
        cache: optional per-task caches; defaults to ``None`` for each of
            the ``self.n_tasks`` tasks.

    Returns:
        ``(gradients, df_input)``: the concatenation of every task's
        gradient list, and the sum over tasks of the task's input gradient
        scaled by its entry in ``self.task_weights``.
    """
    df_input = gradient_sum = gpuarray.zeros_like(input_data)
    task_caches = [None] * self.n_tasks if cache is None else cache

    gradients = []
    per_task = izip(targets, task_caches, self.tasks, self.task_weights)
    for task_targets, task_cache, task, weight in per_task:
        task_gradients, task_df_input = task.backprop(
            input_data, task_targets, task_cache)
        # gradient_sum <- 1 * gradient_sum + weight * task_df_input
        gradient_sum = gradient_sum.mul_add(1., task_df_input, weight)
        gradients.extend(task_gradients)

    df_input = gradient_sum
    return gradients, df_input
def wsparsify(w_gpu, percentage):
    """
    Keeps only as many entries nonzero as specified by percentage.

    The threshold is the value at position ``floor(size * percentage / 100)``
    of all entries sorted in descending order; entries strictly greater than
    it are kept and every other entry is set to zero.  Ties with the
    threshold are zeroed, so fewer entries than requested may survive.

    Args:
        w_gpu: GPU array to sparsify (not modified).
        percentage: share of entries to keep, in percent.  Values of 100 or
            more keep everything; values of 0 or less keep nothing.

    Returns:
        A new GPU array with the same shape as ``w_gpu``.
    """
    w = w_gpu.get()
    n_entries = w.size  # ``shape`` is a tuple attribute, not a method

    # floor() returns a float, which cannot be used as an index.
    n_keep = int(floor(n_entries * percentage / 100.0))
    if n_keep >= n_entries:
        # Nothing to drop (this also covers an empty array); indexing the
        # sorted values at ``n_keep`` would be out of range.
        return w_gpu.copy()
    n_keep = max(n_keep, 0)

    # Sort the flattened data so the threshold is global rather than
    # taken along the last axis only.
    vals = sort(w.ravel())[::-1]
    threshold = vals[n_keep]

    zw_gpu = cua.zeros_like(w_gpu)  # gpu array filled with zeros
    tw_gpu = cua.empty_like(w_gpu)  # gpu array containing threshold
    tw_gpu.fill(threshold)

    return cua.if_positive(w_gpu > tw_gpu, w_gpu, zw_gpu)