class NewCUDARandom: def __init__(self, env, double): self._env = env p = double_precision if double else single_precision self._scalar_dtype = p.scalar.dtype self._complex_dtype = p.complex.dtype self._scalar_cast = p.scalar.cast self._complex_cast = p.complex.cast from pycuda.curandom import XORWOWRandomNumberGenerator as RNG self._rng = RNG() kernel_template = """ <%! from math import pi %> EXPORTED_FUNC void scaleRandoms(int gsize, GLOBAL_MEM COMPLEX* data, COMPLEX loc, SCALAR scale) { int id = GLOBAL_ID_FLAT; if(id >= gsize) return; COMPLEX r = data[id]; r.x += loc.x; r.y += loc.y; r.x *= scale; r.y *= scale; data[id] = r; } """ self._program = self._env.compile(kernel_template, double=double) self._scaleRandoms = self._program.scaleRandoms def random_normal(self, result, scale=1.0, loc=0.0): self._rng.fill_normal(result, stream=self._env.stream) self._scaleRandoms(result.size, result, self._complex_cast(loc), self._scalar_cast(scale / numpy.sqrt(2.0)))
class OceanStateNoise(object): """ Generating random perturbations for a ocean state. Perturbation for the surface field, dEta, is produced with a covariance structure according to a SOAR function, while dHu and dHv are found by the geostrophic balance to avoid shock solutions. """ def __init__(self, gpu_ctx, gpu_stream, nx, ny, dx, dy, boundaryConditions, staggered, soar_q0=None, soar_L=None, interpolation_factor=1, use_lcg=False, angle=np.array([[0]], dtype=np.float32), coriolis_f=np.array([[0]], dtype=np.float32), block_width=16, block_height=16): """ Initiates a class that generates small scale geostrophically balanced perturbations of the ocean state. (nx, ny): number of internal grid cells in the domain (dx, dy): size of each grid cell soar_q0: amplitude parameter for the perturbation, default: dx*1e-5 soar_L: length scale of the perturbation covariance, default: 0.74*dx*interpolation_factor interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, and then interpolated down to the computational mesh. The coarse mesh will then have (nx/interpolation_factor, ny/interpolation_factor) grid cells. use_lcg: LCG is a linear algorithm for generating a serie of pseudo-random numbers angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array (block_width, block_height): The size of each GPU block """ self.use_lcg = use_lcg # Set numpy random state self.random_state = np.random.RandomState() # Make sure that all variables initialized within ifs are defined self.random_numbers = None self.rng = None self.seed = None self.host_seed = None self.gpu_ctx = gpu_ctx self.gpu_stream = gpu_stream self.nx = np.int32(nx) self.ny = np.int32(ny) self.dx = np.float32(dx) self.dy = np.float32(dy) self.staggered = np.int(0) if staggered: self.staggered = np.int(1) # The cutoff parameter is hard-coded. # The size of the cutoff determines the computational radius in the # SOAR function. Hence, the size of the local memory in the OpenCL # kernels has to be hard-coded. self.cutoff = np.int32(config.soar_cutoff) # Check that the interpolation factor plays well with the grid size: assert (interpolation_factor > 0 and interpolation_factor % 2 == 1), 'interpolation_factor must be a positive odd integer' assert (nx % interpolation_factor == 0 ), 'nx must be divisible by the interpolation factor' assert (ny % interpolation_factor == 0 ), 'ny must be divisible by the interpolation factor' self.interpolation_factor = np.int32(interpolation_factor) # The size of the coarse grid self.coarse_nx = np.int32(nx / self.interpolation_factor) self.coarse_ny = np.int32(ny / self.interpolation_factor) self.coarse_dx = np.float32(dx * self.interpolation_factor) self.coarse_dy = np.float32(dy * self.interpolation_factor) self.periodicNorthSouth = np.int32( boundaryConditions.isPeriodicNorthSouth()) self.periodicEastWest = np.int32( boundaryConditions.isPeriodicEastWest()) # Size of random field and seed # The SOAR function is a stencil which requires cutoff number of grid cells, # and the interpolation operator requires further 2 ghost cell values in each direction. # The random field must therefore be created with 2 + cutoff number of ghost cells. self.rand_ghost_cells_x = np.int32(2 + self.cutoff) self.rand_ghost_cells_y = np.int32(2 + self.cutoff) if self.periodicEastWest: self.rand_ghost_cells_x = np.int32(0) if self.periodicNorthSouth: self.rand_ghost_cells_y = np.int32(0) self.rand_nx = np.int32(self.coarse_nx + 2 * self.rand_ghost_cells_x) self.rand_ny = np.int32(self.coarse_ny + 2 * self.rand_ghost_cells_y) # Since normal distributed numbers are generated in pairs, we need to store half the number of # of seed values compared to the number of random numbers. self.seed_ny = np.int32(self.rand_ny) self.seed_nx = np.int32(np.ceil(self.rand_nx / 2)) # Generate seed: self.floatMax = 2147483648.0 if self.use_lcg: self.host_seed = self.random_state.rand( self.seed_ny, self.seed_nx) * self.floatMax self.host_seed = self.host_seed.astype(np.uint64, order='C') if not self.use_lcg: self.rng = XORWOWRandomNumberGenerator() else: self.seed = Common.CUDAArray2D(gpu_stream, self.seed_nx, self.seed_ny, 0, 0, self.host_seed, double_precision=True, integers=True) # Constants for the SOAR function: self.soar_q0 = np.float32(self.dx / 100000) if soar_q0 is not None: self.soar_q0 = np.float32(soar_q0) self.soar_L = np.float32(0.75 * self.coarse_dx) if soar_L is not None: self.soar_L = np.float32(soar_L) # Allocate memory for random numbers (xi) self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C') self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host) # Allocate a second buffer for random numbers (nu) self.perpendicular_random_numbers_host = np.zeros( (self.rand_ny, self.rand_nx), dtype=np.float32, order='C') self.perpendicular_random_numbers = Common.CUDAArray2D( self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host) # Allocate memory for coarse buffer if needed # Two ghost cells in each direction needed for bicubic interpolation self.coarse_buffer_host = np.zeros( (self.coarse_ny + 4, self.coarse_nx + 4), dtype=np.float32, order='C') self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream, self.coarse_nx, self.coarse_ny, 2, 2, self.coarse_buffer_host) # Allocate extra memory needed for reduction kernels. # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu] self.reduction_buffer = None reduction_buffer_host = np.zeros((1, 3), dtype=np.float32) self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0, reduction_buffer_host) # Generate kernels self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \ defines={'block_width': block_width, 'block_height': block_height}, compile_args={ 'options': ["--use_fast_math", "--maxrregcount=32"] }) self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \ defines={}) # Get CUDA functions and define data types for prepared_{async_}call() # Generate kernels self.squareSumKernel = self.reduction_kernels.get_function("squareSum") self.squareSumKernel.prepare("iiPP") self.squareSumDoubleKernel = self.reduction_kernels.get_function( "squareSumDouble") self.squareSumDoubleKernel.prepare("iiPPP") self.makePerpendicularKernel = self.kernels.get_function( "makePerpendicular") self.makePerpendicularKernel.prepare("iiPiPiP") self.uniformDistributionKernel = self.kernels.get_function( "uniformDistribution") self.uniformDistributionKernel.prepare("iiiPiPi") self.normalDistributionKernel = None if self.use_lcg: self.normalDistributionKernel = self.kernels.get_function( "normalDistribution") self.normalDistributionKernel.prepare("iiiPiPi") self.soarKernel = self.kernels.get_function("SOAR") self.soarKernel.prepare("iifffffiiPiPii") self.geostrophicBalanceKernel = self.kernels.get_function( "geostrophicBalance") self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPif") self.bicubicInterpolationKernel = self.kernels.get_function( "bicubicInterpolation") self.bicubicInterpolationKernel.prepare( "iiiiffiiiiffiiffffPiPiPiPiPif") #Compute kernel launch parameters self.local_size = (block_width, block_height, 1) self.local_size_reductions = (128, 1, 1) self.global_size_reductions = (1, 1) # Launch one thread for each seed, which in turns generates two iid N(0,1) self.global_size_random_numbers = ( \ int(np.ceil(self.seed_nx / float(self.local_size[0]))), \ int(np.ceil(self.seed_ny / float(self.local_size[1]))) \ ) # Launch on thread for each random number (in order to create perpendicular random numbers) self.global_size_perpendicular = ( \ int(np.ceil(self.rand_nx / float(self.local_size[0]))), \ int(np.ceil(self.rand_ny / float(self.local_size[1]))) \ ) # Launch one thread per SOAR-correlated result - need to write to two ghost # cells in order to do bicubic interpolation based on the result self.global_size_SOAR = ( \ int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \ int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \ ) # One thread per resulting perturbed grid cell self.global_size_geo_balance = ( \ int(np.ceil( (self.nx)/float(self.local_size[0]))), \ int(np.ceil( (self.ny)/float(self.local_size[1]))) \ ) # Texture for coriolis field self.coriolis_texref = self.kernels.get_texref("coriolis_f_tex") if isinstance(coriolis_f, cuda.Array): # coriolis_f is already a texture, so we just set the reference self.coriolis_texref.set_array(coriolis_f) else: #Upload data to GPU and bind to texture reference self.coriolis_texref.set_array( cuda.np_to_array(np.ascontiguousarray(coriolis_f, dtype=np.float32), order="C")) # Set texture parameters self.coriolis_texref.set_filter_mode( cuda.filter_mode.LINEAR) #bilinear interpolation self.coriolis_texref.set_address_mode( 0, cuda.address_mode.CLAMP) #no indexing outside domain self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP) self.coriolis_texref.set_flags( cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing # FIXME! Allow different versions of coriolis, similar to CDKLM # Texture for angle towards north self.angle_texref = self.kernels.get_texref("angle_tex") if isinstance(angle, cuda.Array): # angle is already a texture, so we just set the reference self.angle_texref.set_array(angle) else: #Upload data to GPU and bind to texture reference self.angle_texref.set_array( cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32), order="C")) # Set texture parameters self.angle_texref.set_filter_mode( cuda.filter_mode.LINEAR) #bilinear interpolation self.angle_texref.set_address_mode( 0, cuda.address_mode.CLAMP) #no indexing outside domain self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP) self.angle_texref.set_flags( cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing def __del__(self): self.cleanUp() def cleanUp(self): if self.rng is not None: self.rng = None if self.seed is not None: self.seed.release() if self.random_numbers is not None: self.random_numbers.release() if self.perpendicular_random_numbers is not None: self.perpendicular_random_numbers.release() if self.reduction_buffer is not None: self.reduction_buffer.release() self.gpu_ctx = None gc.collect() @classmethod def fromsim(cls, sim, soar_q0=None, soar_L=None, interpolation_factor=1, use_lcg=False, block_width=16, block_height=16): staggered = False if isinstance(sim, FBL.FBL) or isinstance(sim, CTCS.CTCS): staggered = True return cls(sim.gpu_ctx, sim.gpu_stream, sim.nx, sim.ny, sim.dx, sim.dy, sim.boundary_conditions, staggered, soar_q0=soar_q0, soar_L=soar_L, interpolation_factor=interpolation_factor, angle=sim.angle_texref.get_array(), coriolis_f=sim.coriolis_texref.get_array(), use_lcg=use_lcg, block_width=block_width, block_height=block_height) def getSeed(self): assert ( self.use_lcg ), "getSeed is only valid if LCG is used as pseudo-random generator." return self.seed.download(self.gpu_stream) def resetSeed(self): assert ( self.use_lcg ), "resetSeed is only valid if LCG is used as pseudo-random generator." # Generate seed: self.floatMax = 2147483648.0 self.host_seed = self.random_state.rand(self.seed_ny, self.seed_nx) * self.floatMax self.host_seed = self.host_seed.astype(np.uint64, order='C') self.seed.upload(self.gpu_stream, self.host_seed) def getRandomNumbers(self): return self.random_numbers.download(self.gpu_stream) def getPerpendicularRandomNumbers(self): return self.perpendicular_random_numbers.download(self.gpu_stream) def getCoarseBuffer(self): return self.coarse_buffer.download(self.gpu_stream) def getReductionBuffer(self): return self.reduction_buffer.download(self.gpu_stream) def generateNormalDistribution(self): if not self.use_lcg: self.rng.fill_normal(self.random_numbers.data, stream=self.gpu_stream) else: self.normalDistributionKernel.prepared_async_call( self.global_size_random_numbers, self.local_size, self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx, self.seed.data.gpudata, self.seed.pitch, self.random_numbers.data.gpudata, self.random_numbers.pitch) def generateNormalDistributionPerpendicular(self): if not self.use_lcg: self.rng.fill_normal(self.perpendicular_random_numbers.data, stream=self.gpu_stream) else: self.normalDistributionKernel.prepared_async_call( self.global_size_random_numbers, self.local_size, self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx, self.seed.data.gpudata, self.seed.pitch, self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch) def generateUniformDistribution(self): # Call kernel -> new random numbers if not self.use_lcg: self.rng.fill_uniform(self.random_numbers.data, stream=self.gpu_stream) else: self.uniformDistributionKernel.prepared_async_call( self.global_size_random_numbers, self.local_size, self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx, self.seed.data.gpudata, self.seed.pitch, self.random_numbers.data.gpudata, self.random_numbers.pitch) def perturbSim(self, sim, q0_scale=1.0, update_random_field=True, perturbation_scale=1.0, perpendicular_scale=0.0, align_with_cell_i=None, align_with_cell_j=None, stream=None): """ Generating a perturbed ocean state and adding it to sim's ocean state """ self.perturbOceanState(sim.gpu_data.h0, sim.gpu_data.hu0, sim.gpu_data.hv0, sim.bathymetry.Bi, sim.f, beta=sim.coriolis_beta, g=sim.g, y0_reference_cell=sim.y_zero_reference_cell, ghost_cells_x=sim.ghost_cells_x, ghost_cells_y=sim.ghost_cells_y, q0_scale=q0_scale, update_random_field=update_random_field, perturbation_scale=perturbation_scale, perpendicular_scale=perpendicular_scale, align_with_cell_i=align_with_cell_i, align_with_cell_j=align_with_cell_j, land_mask_value=sim.bathymetry.mask_value, stream=stream) def perturbOceanState(self, eta, hu, hv, H, f, beta=0.0, g=9.81, y0_reference_cell=0, ghost_cells_x=0, ghost_cells_y=0, q0_scale=1.0, update_random_field=True, perturbation_scale=1.0, perpendicular_scale=0.0, align_with_cell_i=None, align_with_cell_j=None, land_mask_value=np.float32(1.0e20), stream=None): """ Apply the SOAR Q covariance matrix on the random ocean field which is added to the provided buffers eta, hu and hv. eta: surface deviation - CUDAArray2D object. hu: volume transport in x-direction - CUDAArray2D object. hv: volume transport in y-dirextion - CUDAArray2D object. Optional parameters not used else_where: q0_scale=1: scale factor to the SOAR amplitude parameter q0 update_random_field=True: whether to generate new random numbers or use those already present in the random numbers buffer perturbation_scale=1.0: scale factor to the perturbation of the eta field perpendicular_scale=0.0: scale factor for additional perturbation from the perpendicular random field align_with_cell_i=None, align_with_cell_j=None: Index to a cell for which to align the coarse grid. The default value align_with_cell=None corresponds to zero offset between the coarse and fine grid. """ if stream is None: stream = self.gpu_stream if update_random_field: # Need to update the random field, requiering a global sync self.generateNormalDistribution() soar_q0 = np.float32(self.soar_q0 * q0_scale) offset_i, offset_j = self._obtain_coarse_grid_offset( align_with_cell_i, align_with_cell_j) # Generate the SOAR field on the coarse grid self.soarKernel.prepared_async_call( self.global_size_SOAR, self.local_size, stream, self.coarse_nx, self.coarse_ny, self.coarse_dx, self.coarse_dy, soar_q0, self.soar_L, np.float32(perturbation_scale), self.periodicNorthSouth, self.periodicEastWest, self.random_numbers.data.gpudata, self.random_numbers.pitch, self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, np.int32(0)) if perpendicular_scale > 0: self.soarKernel.prepared_async_call( self.global_size_SOAR, self.local_size, stream, self.coarse_nx, self.coarse_ny, self.coarse_dx, self.coarse_dy, soar_q0, self.soar_L, np.float32(perpendicular_scale), self.periodicNorthSouth, self.periodicEastWest, self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch, self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, np.int32(1)) if self.interpolation_factor > 1: self.bicubicInterpolationKernel.prepared_async_call( self.global_size_geo_balance, self.local_size, stream, self.nx, self.ny, np.int32(ghost_cells_x), np.int32(ghost_cells_y), self.dx, self.dy, self.coarse_nx, self.coarse_ny, np.int32(ghost_cells_x), np.int32(ghost_cells_y), self.coarse_dx, self.coarse_dy, np.int32(offset_i), np.int32(offset_j), np.float32(g), np.float32(f), np.float32(beta), np.float32(y0_reference_cell), self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, eta.data.gpudata, eta.pitch, hu.data.gpudata, hu.pitch, hv.data.gpudata, hv.pitch, H.data.gpudata, H.pitch, land_mask_value) else: self.geostrophicBalanceKernel.prepared_async_call( self.global_size_geo_balance, self.local_size, stream, self.nx, self.ny, self.dx, self.dy, np.int32(ghost_cells_x), np.int32(ghost_cells_y), np.float32(g), np.float32(f), np.float32(beta), np.float32(y0_reference_cell), self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, eta.data.gpudata, eta.pitch, hu.data.gpudata, hu.pitch, hv.data.gpudata, hv.pitch, H.data.gpudata, H.pitch, land_mask_value) def _obtain_coarse_grid_offset(self, fine_index_i, fine_index_j): default_offset = self.interpolation_factor // 2 offset_i, offset_j = 0, 0 if fine_index_i is not None: coarse_i = fine_index_i // self.interpolation_factor raw_offset_i = fine_index_i % self.interpolation_factor offset_i = -int(raw_offset_i - default_offset) if fine_index_j is not None: coarse_j = fine_index_j // self.interpolation_factor raw_offset_j = fine_index_j % self.interpolation_factor offset_j = -int(raw_offset_j - default_offset) return offset_i, offset_j def getRandomNorm(self): """ Calculates sum(xi^2), where xi \sim N(0,I) Calling a kernel that sums the square of all elements in the random buffer """ self.squareSumKernel.prepared_async_call( self.global_size_reductions, self.local_size_reductions, self.gpu_stream, self.rand_nx, self.rand_ny, self.random_numbers.data.gpudata, self.reduction_buffer.data.gpudata) return self.getReductionBuffer()[0, 0] def findDoubleNormAndDot(self): """ Calculates sum(xi^2), sum(nu^2), sum(xi*nu) and stores these values in the reduction buffer """ self.squareSumDoubleKernel.prepared_async_call( self.global_size_reductions, self.local_size_reductions, self.gpu_stream, self.rand_nx, self.rand_ny, self.random_numbers.data.gpudata, self.perpendicular_random_numbers.data.gpudata, self.reduction_buffer.data.gpudata) def _makePerpendicular(self): """ Calls the kernel that transform nu (perpendicular_random_numbers buffer) to be perpendicular to xi (random_numbers buffer). Both nu and xi should be independent samples from N(0,I) prior to calling this function. After this function, they are still both samples from N(0,I), but are no longer independent (but lineary independent). """ self.makePerpendicularKernel.prepared_async_call( self.global_size_perpendicular, self.local_size, self.gpu_stream, self.rand_nx, self.rand_ny, self.random_numbers.data.gpudata, self.random_numbers.pitch, self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch, self.reduction_buffer.data.gpudata) def generatePerpendicularNormalDistributions(self): """ Generates xi, nu \sim N(0,I) such that xi and nu are perpendicular. In the process, it calculates sum(xi^2), sum(nu^2), which are written to the first two elements in the reduction buffer. The third reduction buffer will contain the original, now outdated, dot(xi, nu), which was used to construct a random nu that is perpendicular to xi in the first place. """ self.generateNormalDistribution() self.generateNormalDistributionPerpendicular() self.findDoubleNormAndDot() self._makePerpendicular() ##### CPU versions of the above functions #### def getSeedCPU(self): assert ( self.use_lcg ), "getSeedCPU is only valid if LCG is used as pseudo-random generator." return self.host_seed def generateNormalDistributionCPU(self): self._CPUUpdateRandom(True) def generateUniformDistributionCPU(self): self._CPUUpdateRandom(False) def getRandomNumbersCPU(self): return self.random_numbers_host def perturbEtaCPU(self, eta, use_existing_GPU_random_numbers=False, ghost_cells_x=0, ghost_cells_y=0): """ Apply the SOAR Q covariance matrix on the random field to add a perturbation to the incomming eta buffer. eta: numpy array """ # Call CPU utility function if use_existing_GPU_random_numbers: self.random_numbers_host = self.getRandomNumbers() else: self.generateNormalDistributionCPU() d_eta = self._applyQ_CPU() if self.interpolation_factor > 1: d_eta = self._interpolate_CPU(d_eta, geostrophic_balance=False) interior = [ -ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x ] for i in range(4): if interior[i] == 0: interior[i] = None eta[interior[2]:interior[0], interior[3]:interior[1]] = d_eta[2:-2, 2:-2] def perturbOceanStateCPU(self, eta, hu, hv, H, f, beta=0.0, g=9.81, ghost_cells_x=0, ghost_cells_y=0, use_existing_GPU_random_numbers=False, use_existing_CPU_random_numbers=False): """ Apply the SOAR Q covariance matrix on the random field to add a perturbation to the incomming eta buffer. Generate geostrophically balanced hu and hv which is added to the incomming hu and hv buffers. eta: numpy array """ # Call CPU utility function if use_existing_GPU_random_numbers: self.random_numbers_host = self.getRandomNumbers() elif not use_existing_CPU_random_numbers: self.generateNormalDistributionCPU() # generates perturbation (d_eta[ny+4, nx+4], d_hu[ny, nx] and d_hv[ny, nx]) d_eta, d_hu, d_hv = self._obtainOceanPerturbations_CPU(H, f, beta, g) interior = [ -ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x ] for i in range(4): if interior[i] == 0: interior[i] = None eta[interior[2]:interior[0], interior[3]:interior[1]] += d_eta[2:-2, 2:-2] hu[interior[2]:interior[0], interior[3]:interior[1]] += d_hu hv[interior[2]:interior[0], interior[3]:interior[1]] += d_hv # ------------------------------ # CPU utility functions: # ------------------------------ def _lcg(self, seed): modulo = np.uint64(2147483647) seed = np.uint64(((seed * 1103515245) + 12345) % modulo) #0x7fffffff return seed / 2147483648.0, seed def _boxMuller(self, seed_in): seed = np.uint64(seed_in) u1, seed = self._lcg(seed) u2, seed = self._lcg(seed) r = np.sqrt(-2.0 * np.log(u1)) theta = 2 * np.pi * u2 n1 = r * np.cos(theta) n2 = r * np.sin(theta) return n1, n2, seed def _CPUUpdateRandom(self, normalDist): """ Updating the random number buffer at the CPU. normalDist: Boolean parameter. If True, the random numbers are from N(0,1) If False, the random numbers are from U[0,1] """ if not self.use_lcg: if normalDist: self.generateNormalDistribution() else: self.generateUniformDistribution() self.random_numbers_host = self.getRandomNumbers() return #(ny, nx) = seed.shape #(domain_ny, domain_nx) = random.shape b_dim_x = self.local_size[0] b_dim_y = self.local_size[1] blocks_x = self.global_size_random_numbers[0] blocks_y = self.global_size_random_numbers[1] for by in range(blocks_y): for bx in range(blocks_x): for j in range(b_dim_y): for i in range(b_dim_x): ## Content of kernel: y = b_dim_y * by + j # thread_id x = b_dim_x * bx + i # thread_id if (x < self.seed_nx and y < self.seed_ny): n1, n2 = 0.0, 0.0 if normalDist: n1, n2, self.host_seed[y, x] = self._boxMuller( self.host_seed[y, x]) else: n1, self.host_seed[y, x] = self._lcg( self.host_seed[y, x]) n2, self.host_seed[y, x] = self._lcg( self.host_seed[y, x]) if x * 2 + 1 < self.rand_nx: self.random_numbers_host[y, x * 2] = n1 self.random_numbers_host[y, x * 2 + 1] = n2 elif x * 2 == self.rand_nx: self.random_numbers_host[y, x * 2] = n1 def _SOAR_Q_CPU(self, a_x, a_y, b_x, b_y): """ CPU implementation of a SOAR covariance function between grid points (a_x, a_y) and (b_x, b_y) """ dist = np.sqrt(self.coarse_dx * self.coarse_dx * (a_x - b_x)**2 + self.coarse_dy * self.coarse_dy * (a_y - b_y)**2) return self.soar_q0 * (1.0 + dist / self.soar_L) * np.exp( -dist / self.soar_L) def _applyQ_CPU(self, perturbation_scale=1): #xi, dx=1, dy=1, q0=0.1, L=1, cutoff=5): """ Create the perturbation field for eta based on the SOAR covariance structure. The resulting size is (coarse_nx+4, coarse_ny+4), as two ghost cells are required to do bicubic interpolation of the result. """ # Assume in a GPU setting - we read xi into shared memory with ghostcells # Additional cutoff number of ghost cells required to calculate SOAR contribution ny_halo = int(self.coarse_ny + (2 + self.cutoff) * 2) nx_halo = int(self.coarse_nx + (2 + self.cutoff) * 2) local_xi = np.zeros((ny_halo, nx_halo)) for j in range(ny_halo): global_j = j if self.periodicNorthSouth: global_j = (j - self.cutoff - 2) % self.rand_ny for i in range(nx_halo): global_i = i if self.periodicEastWest: global_i = (i - self.cutoff - 2) % self.rand_nx local_xi[j, i] = self.random_numbers_host[global_j, global_i] # Sync threads # Allocate output buffer Qxi = np.zeros((self.coarse_ny + 4, self.coarse_nx + 4)) for a_y in range(self.coarse_ny + 4): for a_x in range(self.coarse_nx + 4): # This is a OpenCL thread (a_x, a_y) local_a_x = a_x + self.cutoff local_a_y = a_y + self.cutoff ############# #Qxi[a_y, a_x] = local_xi[local_a_y, local_a_x] #continue ############# start_b_y = local_a_y - self.cutoff end_b_y = local_a_y + self.cutoff + 1 start_b_x = local_a_x - self.cutoff end_b_x = local_a_x + self.cutoff + 1 Qx = 0.0 for b_y in range(start_b_y, end_b_y): for b_x in range(start_b_x, end_b_x): Q = self._SOAR_Q_CPU(local_a_x, local_a_y, b_x, b_y) Qx += Q * local_xi[b_y, b_x] Qxi[a_y, a_x] = perturbation_scale * Qx return Qxi def _obtainOceanPerturbations_CPU(self, H, f, beta, g, perturbation_scale=1): # Obtain perturbed eta - size (coarse_ny+4, coarse_nx+4) d_eta = self._applyQ_CPU(perturbation_scale) # Interpolate if the coarse grid is not the same as the computational grid # d_eta then becomes (ny+4, nx+4) if self.interpolation_factor > 1: d_eta = self._interpolate_CPU(d_eta) #### # Global sync (currently) # Can be made into a local sync, as long as d_eta is given # periodic overlap (1 more global computated ghost cell) #### d_hu = np.zeros((self.ny, self.nx)) d_hv = np.zeros((self.ny, self.nx)) ### Find H_mid: # Read global H (def on intersections) to local, find H_mid # The local memory can then be reused to something else (perhaps use local_d_eta before computing local_d_eta?) H_mid = np.zeros((self.ny, self.nx)) for j in range(self.ny): for i in range(self.nx): H_mid[j, i] = 0.25 * (H[j, i] + H[j + 1, i] + H[j, i + 1] + H[j + 1, i + 1]) #### # Local sync #### # Compute geostrophically balanced (hu, hv) for each cell within the domain for j in range(0, self.ny): local_j = j + 2 # index in d_eta buffer coriolis = f + beta * local_j * self.dy for i in range(0, self.nx): local_i = i + 2 # index in d_eta buffer h_mid = d_eta[local_j, local_i] + H_mid[j, i] ############## #h_mid = H_mid[j, i] ############## eta_diff_y = (d_eta[local_j + 1, local_i] - d_eta[local_j - 1, local_i]) / (2.0 * self.dy) d_hu[j, i] = -(g / coriolis) * h_mid * eta_diff_y eta_diff_x = (d_eta[local_j, local_i + 1] - d_eta[local_j, local_i - 1]) / (2.0 * self.dx) d_hv[j, i] = (g / coriolis) * h_mid * eta_diff_x return d_eta, d_hu, d_hv def _interpolate_CPU(self, coarse_eta, interpolation_order=3): """ Interpolates values coarse_eta defined on the coarse grid onto the computational grid. Input coarse_eta is of size [coarse_ny+4, coarse_nx+4], and output will be given as eta [ny+4, nx+4]. """ # Create buffers for eta, hu and hv: d_eta = np.zeros((self.ny + 4, self.nx + 4)) min_rel_x = 10 max_rel_x = -10 min_rel_y = 10 max_rel_y = -10 # Loop over internal cells and first ghost cell layer. for loc_j in range(self.ny + 2): for loc_i in range(self.nx + 2): # index in resulting d_eta buffer i = loc_i + 1 j = loc_j + 1 # Position of cell center in fine grid: x = (i - 2 + 0.5) * self.dx y = (j - 2 + 0.5) * self.dy # Location in coarse grid (defined in course grid's cell centers) # (coarse_i, coarse_j) is the first coarse grid point towards lower left. coarse_i = int(np.floor(x / self.coarse_dx + 2 - 0.5)) coarse_j = int(np.floor(y / self.coarse_dy + 2 - 0.5)) # Position of the coarse grid point coarse_x = (coarse_i - 2 + 0.5) * self.coarse_dx coarse_y = (coarse_j - 2 + 0.5) * self.coarse_dy assert coarse_x <= x assert coarse_x + self.coarse_dx >= x rel_x = (x - coarse_x) / self.coarse_dx rel_y = (y - coarse_y) / self.coarse_dy if rel_x < min_rel_x: min_rel_x = rel_x if rel_x > max_rel_x: max_rel_x = rel_x if rel_y < min_rel_y: min_rel_y = rel_y if rel_y > max_rel_y: max_rel_y = rel_y assert rel_x >= 0 and rel_x < 1 assert rel_y >= 0 and rel_y < 1 d_eta[j, i] = self._bicubic_interpolation_inner( coarse_eta, coarse_i, coarse_j, rel_x, rel_y, interpolation_order) return d_eta def _bicubic_interpolation_inner(self, coarse_eta, coarse_i, coarse_j, rel_x, rel_y, interpolation_order=3): # Matrix needed to find the interpolation coefficients bicubic_matrix = np.matrix([[1, 0, 0, 0], [0, 0, 1, 0], [-3, 3, -2, -1], [2, -2, 1, 1]]) f00 = coarse_eta[coarse_j, coarse_i] f01 = coarse_eta[coarse_j + 1, coarse_i] f10 = coarse_eta[coarse_j, coarse_i + 1] f11 = coarse_eta[coarse_j + 1, coarse_i + 1] fx00 = (coarse_eta[coarse_j, coarse_i + 1] - coarse_eta[coarse_j, coarse_i - 1]) / 2 fx01 = (coarse_eta[coarse_j + 1, coarse_i + 1] - coarse_eta[coarse_j + 1, coarse_i - 1]) / 2 fx10 = (coarse_eta[coarse_j, coarse_i + 2] - coarse_eta[coarse_j, coarse_i]) / 2 fx11 = (coarse_eta[coarse_j + 1, coarse_i + 2] - coarse_eta[coarse_j + 1, coarse_i]) / 2 fy00 = (coarse_eta[coarse_j + 1, coarse_i] - coarse_eta[coarse_j - 1, coarse_i]) / 2 fy01 = (coarse_eta[coarse_j + 2, coarse_i] - coarse_eta[coarse_j, coarse_i]) / 2 fy10 = (coarse_eta[coarse_j + 1, coarse_i + 1] - coarse_eta[coarse_j - 1, coarse_i + 1]) / 2 fy11 = (coarse_eta[coarse_j + 2, coarse_i + 1] - coarse_eta[coarse_j, coarse_i + 1]) / 2 fy_10 = (coarse_eta[coarse_j + 1, coarse_i - 1] - coarse_eta[coarse_j - 1, coarse_i - 1]) / 2 fy_11 = (coarse_eta[coarse_j + 2, coarse_i - 1] - coarse_eta[coarse_j, coarse_i - 1]) / 2 fy20 = (coarse_eta[coarse_j + 1, coarse_i + 2] - coarse_eta[coarse_j - 1, coarse_i + 2]) / 2 fy21 = (coarse_eta[coarse_j + 2, coarse_i + 2] - coarse_eta[coarse_j, coarse_i + 2]) / 2 fxy00 = (fy10 - fy_10) / 2 fxy01 = (fy11 - fy_11) / 2 fxy10 = (fy20 - fy00) / 2 fxy11 = (fy21 - fy01) / 2 f_matrix = np.matrix([[f00, f01, fy00, fy01], [f10, f11, fy10, fy11], [fx00, fx01, fxy00, fxy01], [fx10, fx11, fxy10, fxy11]]) a_matrix = np.dot(bicubic_matrix, np.dot(f_matrix, bicubic_matrix.transpose())) x_vec = np.matrix([1.0, rel_x, rel_x * rel_x, rel_x * rel_x * rel_x]) y_vec = np.matrix([1.0, rel_y, rel_y * rel_y, rel_y * rel_y * rel_y]).transpose() if interpolation_order == 0: # Flat average: return 0.25 * (f00 + f01 + f10 + f11) elif interpolation_order == 1: # Linear interpolation: return f00 * (1 - rel_x) * (1 - rel_y) + f10 * rel_x * ( 1 - rel_y) + f01 * (1 - rel_x) * rel_y + f11 * rel_x * rel_y elif interpolation_order == 3: # Bicubic interpolation (make sure that we return a float) return np.dot(x_vec, np.dot(a_matrix, y_vec))[0, 0]
class OceanStateNoise(object): """ Generating random perturbations for a ocean state. Perturbation for the surface field, dEta, is produced with a covariance structure according to a SOAR function, while dHu and dHv are found by the geostrophic balance to avoid shock solutions. """ def __init__(self, gpu_ctx, gpu_stream, nx, ny, dx, dy, boundaryConditions, staggered, soar_q0=None, soar_L=None, interpolation_factor = 1, use_lcg=False, block_width=16, block_height=16): """ Initiates a class that generates small scale geostrophically balanced perturbations of the ocean state. (nx, ny): number of internal grid cells in the domain (dx, dy): size of each grid cell soar_q0: amplitude parameter for the perturbation, default: dx*1e-5 soar_L: length scale of the perturbation covariance, default: 0.74*dx*interpolation_factor interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, and then interpolated down to the computational mesh. The coarse mesh will then have (nx/interpolation_factor, ny/interpolation_factor) grid cells. use_lcg: LCG is a linear algorithm for generating a serie of pseudo-random numbers (block_width, block_height): The size of each GPU block """ self.use_lcg = use_lcg # Set numpy random state self.random_state = np.random.RandomState() # Make sure that all variables initialized within ifs are defined self.random_numbers = None self.rng = None self.seed = None self.host_seed = None self.gpu_ctx = gpu_ctx self.gpu_stream = gpu_stream self.nx = np.int32(nx) self.ny = np.int32(ny) self.dx = np.float32(dx) self.dy = np.float32(dy) self.staggered = np.int(0) if staggered: self.staggered = np.int(1) # The cutoff parameter is hard-coded. # The size of the cutoff determines the computational radius in the # SOAR function. Hence, the size of the local memory in the OpenCL # kernels has to be hard-coded. self.cutoff = np.int32(config.soar_cutoff) # Check that the interpolation factor plays well with the grid size: assert ( interpolation_factor > 0 and interpolation_factor % 2 == 1), 'interpolation_factor must be a positive odd integer' assert (nx % interpolation_factor == 0), 'nx must be divisible by the interpolation factor' assert (ny % interpolation_factor == 0), 'ny must be divisible by the interpolation factor' self.interpolation_factor = np.int32(interpolation_factor) # The size of the coarse grid self.coarse_nx = np.int32(nx/self.interpolation_factor) self.coarse_ny = np.int32(ny/self.interpolation_factor) self.coarse_dx = np.float32(dx*self.interpolation_factor) self.coarse_dy = np.float32(dy*self.interpolation_factor) self.periodicNorthSouth = np.int32(boundaryConditions.isPeriodicNorthSouth()) self.periodicEastWest = np.int32(boundaryConditions.isPeriodicEastWest()) # Size of random field and seed # The SOAR function is a stencil which requires cutoff number of grid cells, # and the interpolation operator requires further 2 ghost cell values in each direction. # The random field must therefore be created with 2 + cutoff number of ghost cells. self.rand_ghost_cells_x = np.int32(2+self.cutoff) self.rand_ghost_cells_y = np.int32(2+self.cutoff) if self.periodicEastWest: self.rand_ghost_cells_x = np.int32(0) if self.periodicNorthSouth: self.rand_ghost_cells_y = np.int32(0) self.rand_nx = np.int32(self.coarse_nx + 2*self.rand_ghost_cells_x) self.rand_ny = np.int32(self.coarse_ny + 2*self.rand_ghost_cells_y) # Since normal distributed numbers are generated in pairs, we need to store half the number of # of seed values compared to the number of random numbers. self.seed_ny = np.int32(self.rand_ny) self.seed_nx = np.int32(np.ceil(self.rand_nx/2)) # Generate seed: self.floatMax = 2147483648.0 if self.use_lcg: self.host_seed = self.random_state.rand(self.seed_ny, self.seed_nx)*self.floatMax self.host_seed = self.host_seed.astype(np.uint64, order='C') if not self.use_lcg: self.rng = XORWOWRandomNumberGenerator() else: self.seed = Common.CUDAArray2D(gpu_stream, self.seed_nx, self.seed_ny, 0, 0, self.host_seed, double_precision=True, integers=True) # Constants for the SOAR function: self.soar_q0 = np.float32(self.dx/100000) if soar_q0 is not None: self.soar_q0 = np.float32(soar_q0) self.soar_L = np.float32(0.75*self.coarse_dx) if soar_L is not None: self.soar_L = np.float32(soar_L) # Allocate memory for random numbers (xi) self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C') self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host) # Allocate a second buffer for random numbers (nu) self.perpendicular_random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C') self.perpendicular_random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host) # Allocate memory for coarse buffer if needed # Two ghost cells in each direction needed for bicubic interpolation self.coarse_buffer_host = np.zeros((self.coarse_ny+4, self.coarse_nx+4), dtype=np.float32, order='C') self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream, self.coarse_nx, self.coarse_ny, 2, 2, self.coarse_buffer_host) # Allocate extra memory needed for reduction kernels. # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu] self.reduction_buffer = None reduction_buffer_host = np.zeros((1,3), dtype=np.float32) self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0, reduction_buffer_host) # Generate kernels self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \ defines={'block_width': block_width, 'block_height': block_height}) self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \ defines={}) # Get CUDA functions and define data types for prepared_{async_}call() # Generate kernels self.squareSumKernel = self.reduction_kernels.get_function("squareSum") self.squareSumKernel.prepare("iiPP") self.squareSumDoubleKernel = self.reduction_kernels.get_function("squareSumDouble") self.squareSumDoubleKernel.prepare("iiPPP") self.makePerpendicularKernel = self.kernels.get_function("makePerpendicular") self.makePerpendicularKernel.prepare("iiPiPiP") self.uniformDistributionKernel = self.kernels.get_function("uniformDistribution") self.uniformDistributionKernel.prepare("iiiPiPi") self.normalDistributionKernel = None if self.use_lcg: self.normalDistributionKernel = self.kernels.get_function("normalDistribution") self.normalDistributionKernel.prepare("iiiPiPi") self.soarKernel = self.kernels.get_function("SOAR") self.soarKernel.prepare("iifffffiiPiPii") self.geostrophicBalanceKernel = self.kernels.get_function("geostrophicBalance") self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPi") self.bicubicInterpolationKernel = self.kernels.get_function("bicubicInterpolation") self.bicubicInterpolationKernel.prepare("iiiiffiiiiffiiffffPiPiPiPiPi") #Compute kernel launch parameters self.local_size = (block_width, block_height, 1) self.local_size_reductions = (128, 1, 1) self.global_size_reductions = (1, 1) # Launch one thread for each seed, which in turns generates two iid N(0,1) self.global_size_random_numbers = ( \ int(np.ceil(self.seed_nx / float(self.local_size[0]))), \ int(np.ceil(self.seed_ny / float(self.local_size[1]))) \ ) # Launch on thread for each random number (in order to create perpendicular random numbers) self.global_size_perpendicular = ( \ int(np.ceil(self.rand_nx / float(self.local_size[0]))), \ int(np.ceil(self.rand_ny / float(self.local_size[1]))) \ ) # Launch one thread per SOAR-correlated result - need to write to two ghost # cells in order to do bicubic interpolation based on the result self.global_size_SOAR = ( \ int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \ int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \ ) # One thread per resulting perturbed grid cell self.global_size_geo_balance = ( \ int(np.ceil( (self.nx)/float(self.local_size[0]))), \ int(np.ceil( (self.ny)/float(self.local_size[1]))) \ ) def __del__(self): self.cleanUp() def cleanUp(self): if self.rng is not None: self.rng = None if self.seed is not None: self.seed.release() if self.random_numbers is not None: self.random_numbers.release() if self.perpendicular_random_numbers is not None: self.perpendicular_random_numbers.release() if self.reduction_buffer is not None: self.reduction_buffer.release() self.gpu_ctx = None gc.collect() @classmethod def fromsim(cls, sim, soar_q0=None, soar_L=None, interpolation_factor=1, block_width=16, block_height=16): staggered = False if isinstance(sim, FBL.FBL) or isinstance(sim, CTCS.CTCS): staggered = True return cls(sim.gpu_ctx, sim.gpu_stream, sim.nx, sim.ny, sim.dx, sim.dy, sim.boundary_conditions, staggered, soar_q0=soar_q0, soar_L=soar_L, interpolation_factor=interpolation_factor, block_width=block_width, block_height=block_height) def getSeed(self): assert(self.use_lcg), "getSeed is only valid if LCG is used as pseudo-random generator." return self.seed.download(self.gpu_stream) def resetSeed(self): assert(self.use_lcg), "resetSeed is only valid if LCG is used as pseudo-random generator." # Generate seed: self.floatMax = 2147483648.0 self.host_seed = self.random_state.rand(self.seed_ny, self.seed_nx)*self.floatMax self.host_seed = self.host_seed.astype(np.uint64, order='C') self.seed.upload(self.gpu_stream, self.host_seed) def getRandomNumbers(self): return self.random_numbers.download(self.gpu_stream) def getPerpendicularRandomNumbers(self): return self.perpendicular_random_numbers.download(self.gpu_stream) def getCoarseBuffer(self): return self.coarse_buffer.download(self.gpu_stream) def getReductionBuffer(self): return self.reduction_buffer.download(self.gpu_stream) def generateNormalDistribution(self): if not self.use_lcg: self.rng.fill_normal(self.random_numbers.data, stream=self.gpu_stream) else: self.normalDistributionKernel.prepared_async_call(self.global_size_random_numbers, self.local_size, self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx, self.seed.data.gpudata, self.seed.pitch, self.random_numbers.data.gpudata, self.random_numbers.pitch) def generateNormalDistributionPerpendicular(self): if not self.use_lcg: self.rng.fill_normal(self.perpendicular_random_numbers.data, stream=self.gpu_stream) else: self.normalDistributionKernel.prepared_async_call(self.global_size_random_numbers, self.local_size, self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx, self.seed.data.gpudata, self.seed.pitch, self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch) def generateUniformDistribution(self): # Call kernel -> new random numbers if not self.use_lcg: self.rng.fill_uniform(self.random_numbers.data, stream=self.gpu_stream) else: self.uniformDistributionKernel.prepared_async_call(self.global_size_random_numbers, self.local_size, self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx, self.seed.data.gpudata, self.seed.pitch, self.random_numbers.data.gpudata, self.random_numbers.pitch) def perturbSim(self, sim, q0_scale=1.0, update_random_field=True, perturbation_scale=1.0, perpendicular_scale=0.0, align_with_cell_i=None, align_with_cell_j=None): """ Generating a perturbed ocean state and adding it to sim's ocean state """ self.perturbOceanState(sim.gpu_data.h0, sim.gpu_data.hu0, sim.gpu_data.hv0, sim.bathymetry.Bi, sim.f, beta=sim.coriolis_beta, g=sim.g, y0_reference_cell=sim.y_zero_reference_cell, ghost_cells_x=sim.ghost_cells_x, ghost_cells_y=sim.ghost_cells_y, q0_scale=q0_scale, update_random_field=update_random_field, perturbation_scale=perturbation_scale, perpendicular_scale=perpendicular_scale, align_with_cell_i=align_with_cell_i, align_with_cell_j=align_with_cell_j) def perturbOceanState(self, eta, hu, hv, H, f, beta=0.0, g=9.81, y0_reference_cell=0, ghost_cells_x=0, ghost_cells_y=0, q0_scale=1.0, update_random_field=True, perturbation_scale=1.0, perpendicular_scale=0.0, align_with_cell_i=None, align_with_cell_j=None): """ Apply the SOAR Q covariance matrix on the random ocean field which is added to the provided buffers eta, hu and hv. eta: surface deviation - CUDAArray2D object. hu: volume transport in x-direction - CUDAArray2D object. hv: volume transport in y-dirextion - CUDAArray2D object. Optional parameters not used else_where: q0_scale=1: scale factor to the SOAR amplitude parameter q0 update_random_field=True: whether to generate new random numbers or use those already present in the random numbers buffer perturbation_scale=1.0: scale factor to the perturbation of the eta field perpendicular_scale=0.0: scale factor for additional perturbation from the perpendicular random field align_with_cell_i=None, align_with_cell_j=None: Index to a cell for which to align the coarse grid. The default value align_with_cell=None corresponds to zero offset between the coarse and fine grid. """ if update_random_field: # Need to update the random field, requiering a global sync self.generateNormalDistribution() soar_q0 = np.float32(self.soar_q0 * q0_scale) offset_i, offset_j = self._obtain_coarse_grid_offset(align_with_cell_i, align_with_cell_j) # Generate the SOAR field on the coarse grid self.soarKernel.prepared_async_call(self.global_size_SOAR, self.local_size, self.gpu_stream, self.coarse_nx, self.coarse_ny, self.coarse_dx, self.coarse_dy, soar_q0, self.soar_L, np.float32(perturbation_scale), self.periodicNorthSouth, self.periodicEastWest, self.random_numbers.data.gpudata, self.random_numbers.pitch, self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, np.int32(0)) if perpendicular_scale > 0: self.soarKernel.prepared_async_call(self.global_size_SOAR, self.local_size, self.gpu_stream, self.coarse_nx, self.coarse_ny, self.coarse_dx, self.coarse_dy, soar_q0, self.soar_L, np.float32(perpendicular_scale), self.periodicNorthSouth, self.periodicEastWest, self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch, self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, np.int32(1)) if self.interpolation_factor > 1: self.bicubicInterpolationKernel.prepared_async_call(self.global_size_geo_balance, self.local_size, self.gpu_stream, self.nx, self.ny, np.int32(ghost_cells_x), np.int32(ghost_cells_y), self.dx, self.dy, self.coarse_nx, self.coarse_ny, np.int32(ghost_cells_x), np.int32(ghost_cells_y), self.coarse_dx, self.coarse_dy, np.int32(offset_i), np.int32(offset_j), np.float32(g), np.float32(f), np.float32(beta), np.float32(y0_reference_cell), self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, eta.data.gpudata, eta.pitch, hu.data.gpudata, hu.pitch, hv.data.gpudata, hv.pitch, H.data.gpudata, H.pitch) else: self.geostrophicBalanceKernel.prepared_async_call(self.global_size_geo_balance, self.local_size, self.gpu_stream, self.nx, self.ny, self.dx, self.dy, np.int32(ghost_cells_x), np.int32(ghost_cells_y), np.float32(g), np.float32(f), np.float32(beta), np.float32(y0_reference_cell), self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch, eta.data.gpudata, eta.pitch, hu.data.gpudata, hu.pitch, hv.data.gpudata, hv.pitch, H.data.gpudata, H.pitch) def _obtain_coarse_grid_offset(self, fine_index_i, fine_index_j): default_offset = self.interpolation_factor//2 offset_i, offset_j = 0, 0 if fine_index_i is not None: coarse_i = fine_index_i//self.interpolation_factor raw_offset_i = fine_index_i % self.interpolation_factor offset_i = -int(raw_offset_i - default_offset) if fine_index_j is not None: coarse_j = fine_index_j//self.interpolation_factor raw_offset_j = fine_index_j % self.interpolation_factor offset_j = -int(raw_offset_j - default_offset) return offset_i, offset_j def getRandomNorm(self): """ Calculates sum(xi^2), where xi \sim N(0,I) Calling a kernel that sums the square of all elements in the random buffer """ self.squareSumKernel.prepared_async_call(self.global_size_reductions, self.local_size_reductions, self.gpu_stream, self.rand_nx, self.rand_ny, self.random_numbers.data.gpudata, self.reduction_buffer.data.gpudata) return self.getReductionBuffer()[0,0] def findDoubleNormAndDot(self): """ Calculates sum(xi^2), sum(nu^2), sum(xi*nu) and stores these values in the reduction buffer """ self.squareSumDoubleKernel.prepared_async_call(self.global_size_reductions, self.local_size_reductions, self.gpu_stream, self.rand_nx, self.rand_ny, self.random_numbers.data.gpudata, self.perpendicular_random_numbers.data.gpudata, self.reduction_buffer.data.gpudata) def _makePerpendicular(self): """ Calls the kernel that transform nu (perpendicular_random_numbers buffer) to be perpendicular to xi (random_numbers buffer). Both nu and xi should be independent samples from N(0,I) prior to calling this function. After this function, they are still both samples from N(0,I), but are no longer independent (but lineary independent). """ self.makePerpendicularKernel.prepared_async_call(self.global_size_perpendicular, self.local_size, self.gpu_stream, self.rand_nx, self.rand_ny, self.random_numbers.data.gpudata, self.random_numbers.pitch, self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch, self.reduction_buffer.data.gpudata) def generatePerpendicularNormalDistributions(self): """ Generates xi, nu \sim N(0,I) such that xi and nu are perpendicular. In the process, it calculates sum(xi^2), sum(nu^2), which are written to the first two elements in the reduction buffer. The third reduction buffer will contain the original, now outdated, dot(xi, nu), which was used to construct a random nu that is perpendicular to xi in the first place. """ self.generateNormalDistribution() self.generateNormalDistributionPerpendicular() self.findDoubleNormAndDot() self._makePerpendicular() ##### CPU versions of the above functions #### def getSeedCPU(self): assert(self.use_lcg), "getSeedCPU is only valid if LCG is used as pseudo-random generator." return self.host_seed def generateNormalDistributionCPU(self): self._CPUUpdateRandom(True) def generateUniformDistributionCPU(self): self._CPUUpdateRandom(False) def getRandomNumbersCPU(self): return self.random_numbers_host def perturbEtaCPU(self, eta, use_existing_GPU_random_numbers=False, ghost_cells_x=0, ghost_cells_y=0): """ Apply the SOAR Q covariance matrix on the random field to add a perturbation to the incomming eta buffer. eta: numpy array """ # Call CPU utility function if use_existing_GPU_random_numbers: self.random_numbers_host = self.getRandomNumbers() else: self.generateNormalDistributionCPU() d_eta = self._applyQ_CPU() if self.interpolation_factor > 1: d_eta = self._interpolate_CPU(d_eta, geostrophic_balance=False) interior = [-ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x] for i in range(4): if interior[i] == 0: interior[i] = None eta[interior[2]:interior[0], interior[3]:interior[1]] = d_eta[2:-2, 2:-2] def perturbOceanStateCPU(self, eta, hu, hv, H, f, beta=0.0, g=9.81, ghost_cells_x=0, ghost_cells_y=0, use_existing_GPU_random_numbers=False, use_existing_CPU_random_numbers=False): """ Apply the SOAR Q covariance matrix on the random field to add a perturbation to the incomming eta buffer. Generate geostrophically balanced hu and hv which is added to the incomming hu and hv buffers. eta: numpy array """ # Call CPU utility function if use_existing_GPU_random_numbers: self.random_numbers_host = self.getRandomNumbers() elif not use_existing_CPU_random_numbers: self.generateNormalDistributionCPU() # generates perturbation (d_eta[ny+4, nx+4], d_hu[ny, nx] and d_hv[ny, nx]) d_eta, d_hu, d_hv = self._obtainOceanPerturbations_CPU(H, f, beta, g) interior = [-ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x] for i in range(4): if interior[i] == 0: interior[i] = None eta[interior[2]:interior[0], interior[3]:interior[1]] += d_eta[2:-2, 2:-2] hu[interior[2]:interior[0], interior[3]:interior[1]] += d_hu hv[interior[2]:interior[0], interior[3]:interior[1]] += d_hv # ------------------------------ # CPU utility functions: # ------------------------------ def _lcg(self, seed): modulo = np.uint64(2147483647) seed = np.uint64(((seed*1103515245) + 12345) % modulo) #0x7fffffff return seed / 2147483648.0, seed def _boxMuller(self, seed_in): seed = np.uint64(seed_in) u1, seed = self._lcg(seed) u2, seed = self._lcg(seed) r = np.sqrt(-2.0*np.log(u1)) theta = 2*np.pi*u2 n1 = r*np.cos(theta) n2 = r*np.sin(theta) return n1, n2, seed def _CPUUpdateRandom(self, normalDist): """ Updating the random number buffer at the CPU. normalDist: Boolean parameter. If True, the random numbers are from N(0,1) If False, the random numbers are from U[0,1] """ if not self.use_lcg: if normalDist: self.generateNormalDistribution() else: self.generateUniformDistribution() self.random_numbers_host = self.getRandomNumbers() return #(ny, nx) = seed.shape #(domain_ny, domain_nx) = random.shape b_dim_x = self.local_size[0] b_dim_y = self.local_size[1] blocks_x = self.global_size_random_numbers[0] blocks_y = self.global_size_random_numbers[1] for by in range(blocks_y): for bx in range(blocks_x): for j in range(b_dim_y): for i in range(b_dim_x): ## Content of kernel: y = b_dim_y*by + j # thread_id x = b_dim_x*bx + i # thread_id if (x < self.seed_nx and y < self.seed_ny): n1, n2 = 0.0, 0.0 if normalDist: n1, n2, self.host_seed[y,x] = self._boxMuller(self.host_seed[y,x]) else: n1, self.host_seed[y,x] = self._lcg(self.host_seed[y,x]) n2, self.host_seed[y,x] = self._lcg(self.host_seed[y,x]) if x*2 + 1 < self.rand_nx: self.random_numbers_host[y, x*2 ] = n1 self.random_numbers_host[y, x*2+1] = n2 elif x*2 == self.rand_nx: self.random_numbers_host[y, x*2] = n1 def _SOAR_Q_CPU(self, a_x, a_y, b_x, b_y): """ CPU implementation of a SOAR covariance function between grid points (a_x, a_y) and (b_x, b_y) """ dist = np.sqrt( self.coarse_dx*self.coarse_dx*(a_x - b_x)**2 + self.coarse_dy*self.coarse_dy*(a_y - b_y)**2 ) return self.soar_q0*(1.0 + dist/self.soar_L)*np.exp(-dist/self.soar_L) def _applyQ_CPU(self, perturbation_scale=1): #xi, dx=1, dy=1, q0=0.1, L=1, cutoff=5): """ Create the perturbation field for eta based on the SOAR covariance structure. The resulting size is (coarse_nx+4, coarse_ny+4), as two ghost cells are required to do bicubic interpolation of the result. """ # Assume in a GPU setting - we read xi into shared memory with ghostcells # Additional cutoff number of ghost cells required to calculate SOAR contribution ny_halo = int(self.coarse_ny + (2 + self.cutoff)*2) nx_halo = int(self.coarse_nx + (2 + self.cutoff)*2) local_xi = np.zeros((ny_halo, nx_halo)) for j in range(ny_halo): global_j = j if self.periodicNorthSouth: global_j = (j - self.cutoff - 2) % self.rand_ny for i in range(nx_halo): global_i = i if self.periodicEastWest: global_i = (i - self.cutoff - 2) % self.rand_nx local_xi[j,i] = self.random_numbers_host[global_j, global_i] # Sync threads # Allocate output buffer Qxi = np.zeros((self.coarse_ny+4, self.coarse_nx+4)) for a_y in range(self.coarse_ny+4): for a_x in range(self.coarse_nx+4): # This is a OpenCL thread (a_x, a_y) local_a_x = a_x + self.cutoff local_a_y = a_y + self.cutoff ############# #Qxi[a_y, a_x] = local_xi[local_a_y, local_a_x] #continue ############# start_b_y = local_a_y - self.cutoff end_b_y = local_a_y + self.cutoff+1 start_b_x = local_a_x - self.cutoff end_b_x = local_a_x + self.cutoff+1 Qx = 0.0 for b_y in range(start_b_y, end_b_y): for b_x in range(start_b_x, end_b_x): Q = self._SOAR_Q_CPU(local_a_x, local_a_y, b_x, b_y) Qx += Q*local_xi[b_y, b_x] Qxi[a_y, a_x] = perturbation_scale*Qx return Qxi def _obtainOceanPerturbations_CPU(self, H, f, beta, g, perturbation_scale=1): # Obtain perturbed eta - size (coarse_ny+4, coarse_nx+4) d_eta = self._applyQ_CPU(perturbation_scale) # Interpolate if the coarse grid is not the same as the computational grid # d_eta then becomes (ny+4, nx+4) if self.interpolation_factor > 1: d_eta = self._interpolate_CPU(d_eta) #### # Global sync (currently) # Can be made into a local sync, as long as d_eta is given # periodic overlap (1 more global computated ghost cell) #### d_hu = np.zeros((self.ny, self.nx)) d_hv = np.zeros((self.ny, self.nx)) ### Find H_mid: # Read global H (def on intersections) to local, find H_mid # The local memory can then be reused to something else (perhaps use local_d_eta before computing local_d_eta?) H_mid = np.zeros((self.ny, self.nx)) for j in range(self.ny): for i in range(self.nx): H_mid[j,i] = 0.25* (H[j,i] + H[j+1, i] + H[j, i+1] + H[j+1, i+1]) #### # Local sync #### # Compute geostrophically balanced (hu, hv) for each cell within the domain for j in range(0, self.ny): local_j = j + 2 # index in d_eta buffer coriolis = f + beta*local_j*self.dy for i in range(0, self.nx): local_i = i + 2 # index in d_eta buffer h_mid = d_eta[local_j,local_i] + H_mid[j, i] ############## #h_mid = H_mid[j, i] ############## eta_diff_y = (d_eta[local_j+1, local_i] - d_eta[local_j-1, local_i])/(2.0*self.dy) d_hu[j,i] = -(g/coriolis)*h_mid*eta_diff_y eta_diff_x = (d_eta[local_j, local_i+1] - d_eta[local_j, local_i-1])/(2.0*self.dx) d_hv[j,i] = (g/coriolis)*h_mid*eta_diff_x return d_eta, d_hu, d_hv def _interpolate_CPU(self, coarse_eta, interpolation_order=3): """ Interpolates values coarse_eta defined on the coarse grid onto the computational grid. Input coarse_eta is of size [coarse_ny+4, coarse_nx+4], and output will be given as eta [ny+4, nx+4]. """ # Create buffers for eta, hu and hv: d_eta = np.zeros((self.ny+4, self.nx+4)) min_rel_x = 10 max_rel_x = -10 min_rel_y = 10 max_rel_y = -10 # Loop over internal cells and first ghost cell layer. for loc_j in range(self.ny+2): for loc_i in range(self.nx+2): # index in resulting d_eta buffer i = loc_i + 1 j = loc_j + 1 # Position of cell center in fine grid: x = (i - 2 + 0.5)*self.dx y = (j - 2 + 0.5)*self.dy # Location in coarse grid (defined in course grid's cell centers) # (coarse_i, coarse_j) is the first coarse grid point towards lower left. coarse_i = int(np.floor(x/self.coarse_dx + 2 - 0.5)) coarse_j = int(np.floor(y/self.coarse_dy + 2 - 0.5)) # Position of the coarse grid point coarse_x = (coarse_i - 2 + 0.5)*self.coarse_dx coarse_y = (coarse_j - 2 + 0.5)*self.coarse_dy assert coarse_x <= x assert coarse_x + self.coarse_dx >= x rel_x = (x - coarse_x)/self.coarse_dx rel_y = (y - coarse_y)/self.coarse_dy if rel_x < min_rel_x: min_rel_x = rel_x if rel_x > max_rel_x: max_rel_x = rel_x if rel_y < min_rel_y: min_rel_y = rel_y if rel_y > max_rel_y: max_rel_y = rel_y assert rel_x >= 0 and rel_x < 1 assert rel_y >= 0 and rel_y < 1 d_eta[j,i] = self._bicubic_interpolation_inner(coarse_eta, coarse_i, coarse_j, rel_x, rel_y, interpolation_order) return d_eta def _bicubic_interpolation_inner(self, coarse_eta, coarse_i, coarse_j, rel_x, rel_y, interpolation_order=3): # Matrix needed to find the interpolation coefficients bicubic_matrix = np.matrix([[ 1, 0, 0, 0], [ 0, 0, 1, 0], [-3, 3, -2, -1], [ 2, -2, 1, 1]]) f00 = coarse_eta[coarse_j , coarse_i ] f01 = coarse_eta[coarse_j+1, coarse_i ] f10 = coarse_eta[coarse_j , coarse_i+1] f11 = coarse_eta[coarse_j+1, coarse_i+1] fx00 = (coarse_eta[coarse_j , coarse_i+1] - coarse_eta[coarse_j , coarse_i-1])/2 fx01 = (coarse_eta[coarse_j+1, coarse_i+1] - coarse_eta[coarse_j+1, coarse_i-1])/2 fx10 = (coarse_eta[coarse_j , coarse_i+2] - coarse_eta[coarse_j , coarse_i ])/2 fx11 = (coarse_eta[coarse_j+1, coarse_i+2] - coarse_eta[coarse_j+1, coarse_i ])/2 fy00 = (coarse_eta[coarse_j+1, coarse_i ] - coarse_eta[coarse_j-1, coarse_i ])/2 fy01 = (coarse_eta[coarse_j+2, coarse_i ] - coarse_eta[coarse_j , coarse_i ])/2 fy10 = (coarse_eta[coarse_j+1, coarse_i+1] - coarse_eta[coarse_j-1, coarse_i+1])/2 fy11 = (coarse_eta[coarse_j+2, coarse_i+1] - coarse_eta[coarse_j , coarse_i+1])/2 fy_10 = (coarse_eta[coarse_j+1, coarse_i-1] - coarse_eta[coarse_j-1, coarse_i-1])/2 fy_11 = (coarse_eta[coarse_j+2, coarse_i-1] - coarse_eta[coarse_j , coarse_i-1])/2 fy20 = (coarse_eta[coarse_j+1, coarse_i+2] - coarse_eta[coarse_j-1, coarse_i+2])/2 fy21 = (coarse_eta[coarse_j+2, coarse_i+2] - coarse_eta[coarse_j , coarse_i+2])/2 fxy00 = (fy10 - fy_10)/2 fxy01 = (fy11 - fy_11)/2 fxy10 = (fy20 - fy00)/2 fxy11 = (fy21 - fy01)/2 f_matrix = np.matrix([[ f00, f01, fy00, fy01], [ f10, f11, fy10, fy11], [fx00, fx01, fxy00, fxy01], [fx10, fx11, fxy10, fxy11] ]) a_matrix = np.dot(bicubic_matrix, np.dot(f_matrix, bicubic_matrix.transpose())) x_vec = np.matrix([1.0, rel_x, rel_x*rel_x, rel_x*rel_x*rel_x]) y_vec = np.matrix([1.0, rel_y, rel_y*rel_y, rel_y*rel_y*rel_y]).transpose() if interpolation_order == 0: # Flat average: return 0.25*(f00 + f01 + f10 + f11) elif interpolation_order == 1: # Linear interpolation: return f00*(1-rel_x)*(1-rel_y) + f10*rel_x*(1-rel_y) + f01*(1-rel_x)*rel_y + f11*rel_x*rel_y elif interpolation_order == 3: # Bicubic interpolation (make sure that we return a float) return np.dot(x_vec, np.dot(a_matrix, y_vec))[0, 0]
class PyCudaHandler(Handler): __undescribed__ = {'context', 'dtype', 'EMPTY', 'rnd'} def __init__(self, seed=None): super(PyCudaHandler, self).__init__() self.dtype = np.float32 self.context = cumisc._global_cublas_handle self.EMPTY = gpuarray.zeros((), dtype=self.dtype) if seed is None: seed = global_rnd.generate_seed() def get_seeds(n): return gpuarray.to_gpu(np.ones(n, np.int32) * seed) self.rnd = XORWOWRandomNumberGenerator(seed_getter=get_seeds) array_type = pycuda.gpuarray.GPUArray def __init_from_description__(self, description): self.__init__() def _get_gridsize(self, n): min_threads = 32 max_threads = 256 max_blocks = 384 if n < min_threads: block_count = 1 threads_per_block = min_threads elif n < (max_blocks * min_threads): block_count = (n + min_threads - 1) // min_threads threads_per_block = min_threads elif n < (max_blocks * max_threads): block_count = max_blocks grp = (n + min_threads - 1) // min_threads threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads else: block_count = max_blocks threads_per_block = max_threads return (block_count, 1), (threads_per_block, 1, 1) # ------------------------- Allocate new memory ------------------------- # def allocate(self, size): return gpuarray.zeros(size, dtype=self.dtype) def ones(self, shape): a = self.zeros(shape) self.fill(a, 1.0) return a def zeros(self, shape): return gpuarray.zeros(shape=shape, dtype=self.dtype) # ---------------------------- Copy and Fill ---------------------------- # def copy_to(self, src, dest): # Copy data from src to dest (both must be GPUArrays) pycuda.driver.memcpy_dtod(dest.gpudata, src.gpudata, dest.nbytes) def copy_to_if(self, src, dest, cond): copy_to_if_kernel(src, dest, cond) def create_from_numpy(self, arr): return gpuarray.to_gpu(arr.astype(self.dtype)) def fill(self, mem, val): mem.fill(val) def fill_if(self, mem, val, cond): fill_if_kernel(mem, val, cond) def get_numpy_copy(self, mem): assert type(mem) == self.array_type return mem.get() def set_from_numpy(self, mem, arr): assert mem.shape == arr.shape, "Shape of destination ({}) != Shape " \ "of source ({})".format(mem.shape, arr.shape) mem.set(arr.astype(self.dtype)) # ---------------------------- Debug helpers ---------------------------- # def is_fully_finite(self, a): temp = gpuarray.zeros_like(a) check_inf_or_nan_kernel(a, temp) return not np.any(temp.get()) # ----------------------- Mathematical operations ----------------------- # def abs_t(self, a, out): cumath.fabs(a, out=out) def add_into_if(self, a, out, cond): add_into_if_kernel(a, out, cond) def add_mv(self, m, v, out): cumisc.add_matvec(m, v, out=out) def add_st(self, s, t, out): add_st_kernel(s, t, out) def add_tt(self, a, b, out): add_mm_kernel(a, b, out) def avgpool2d_backward_batch(self, inputs, window, outputs, padding, stride, in_deltas, out_deltas): n, h, w, c = inputs.shape o_h, o_w = outputs.shape[1], outputs.shape[2] _avepool_bwd_fp32_impl(np.int32(inputs.size), out_deltas, np.int32(n), np.int32(h), np.int32(w), np.int32(c), np.int32(o_h), np.int32(o_w), np.int32(window[0]), np.int32(window[1]), np.int32(stride[0]), np.int32(stride[1]), np.int32(padding), np.int32(padding), in_deltas, block=(NUM_CUDA_THREADS, 1, 1), grid=(get_blocks(inputs.size), 1)) def avgpool2d_forward_batch(self, inputs, window, outputs, padding, stride): n, h, w, c = inputs.shape o_h, o_w = outputs.shape[1], outputs.shape[2] _avepool_fwd_fp32_impl(np.int32(outputs.size), inputs, np.int32(n), np.int32(h), np.int32(w), np.int32(c), np.int32(o_h), np.int32(o_w), np.int32(window[0]), np.int32(window[1]), np.int32(stride[0]), np.int32(stride[1]), np.int32(padding), np.int32(padding), outputs, block=(NUM_CUDA_THREADS, 1, 1), grid=(get_blocks(outputs.size), 1)) def binarize_v(self, v, out): binarize_v_kernel(out, v, out.shape[0], out.shape[1]) def broadcast_t(self, a, axis, out): broadcast_dim = int(out.shape[axis]) stride = int(np.prod(out.shape[axis+1:])) broadcast_t_kernel(out, a, broadcast_dim, stride) def clip_t(self, a, a_min, a_max, out): clip_kernel(a, out, a_min, a_max) def conv2d_backward_batch(self, inputs, params, padding, stride, in_deltas, out_deltas, dparams, dbias): num_filters = params.shape[0] num_images, input_rows, input_cols, num_input_maps = inputs.shape kernel_shape = params.shape[1:] num_output_pixels = out_deltas.shape[1] * out_deltas.shape[2] num_kernel_params = np.prod(kernel_shape) dparams.fill(0.0) dbias.fill(0.0) tmp = self.zeros(dbias.shape) col = self.zeros((num_output_pixels, num_kernel_params)) for i in range(num_images): num_cuda_kernels = num_output_pixels * num_input_maps _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i], np.int32(input_rows), np.int32(input_cols), np.int32(kernel_shape[0]), np.int32(kernel_shape[1]), np.int32(padding), np.int32(padding), np.int32(stride[0]), np.int32(stride[1]), np.int32(out_deltas.shape[2]), np.int32(num_input_maps), col.gpudata, block=(NUM_CUDA_THREADS, 1, 1), grid=(get_blocks(num_cuda_kernels), 1)) # Compute gradients reshaped_dparams = dparams.reshape(num_filters, num_kernel_params) reshaped_out_deltas = out_deltas[i].reshape((num_output_pixels, num_filters)) self.dot_add_mm(reshaped_out_deltas, col, out=reshaped_dparams, transa=True) self.sum_t(reshaped_out_deltas, axis=0, out=tmp) self.add_tt(tmp, dbias, out=dbias) # Compute in_deltas reshaped_params = params.reshape((num_filters, num_kernel_params)) self.dot_mm(reshaped_out_deltas, reshaped_params, out=col) num_cuda_kernels = input_rows * input_cols * num_input_maps _col2im_fp32_impl(np.int32(num_cuda_kernels), col.gpudata, np.int32(input_cols), np.int32(num_input_maps), np.int32(kernel_shape[0]), np.int32(kernel_shape[1]), np.int32(padding), np.int32(padding), np.int32(stride[0]), np.int32(stride[1]), np.int32(out_deltas.shape[1]), np.int32(out_deltas.shape[2]), in_deltas[i], block=(NUM_CUDA_THREADS, 1, 1), grid=(get_blocks(num_cuda_kernels), 1)) def conv2d_forward_batch(self, inputs, params, bias, outputs, padding, stride): num_filters = params.shape[0] num_images, input_rows, input_cols, num_input_maps = inputs.shape kernel_shape = params.shape[1:] num_output_pixels = outputs.shape[1] * outputs.shape[2] num_kernel_params = np.prod(kernel_shape) out_shape = (num_output_pixels, num_filters) num_cuda_kernels = num_output_pixels * num_input_maps for i in range(num_images): col = self.zeros((num_output_pixels, num_kernel_params)) _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i], np.int32(input_rows), np.int32(input_cols), np.int32(kernel_shape[0]), np.int32(kernel_shape[1]), np.int32(padding), np.int32(padding), np.int32(stride[0]), np.int32(stride[1]), np.int32(outputs.shape[2]), np.int32(num_input_maps), col.gpudata, block=(NUM_CUDA_THREADS, 1, 1), grid=(get_blocks(num_cuda_kernels), 1)) reshaped_params = params.reshape(num_filters, num_kernel_params) culinalg.dot(col, reshaped_params, transb='T', out=outputs[i].reshape(out_shape)) flat_outputs = flatten_all_but_last(outputs) self.add_mv(flat_outputs, bias, flat_outputs) def dot_add_mm(self, a, b, out, transa=False, transb=False): transa = 'T' if transa else 'N' transb = 'T' if transb else 'N' culinalg.add_dot(a, b, out, transa, transb) def dot_mm(self, a, b, out, transa=False, transb=False): transa = 'T' if transa else 'N' transb = 'T' if transb else 'N' culinalg.dot(a, b, transa=transa, transb=transb, out=out) def divide_mv(self, m, v, out): cumisc.div_matvec(m, v, out=out) def divide_tt(self, a, b, out): div_kernel(a, b, out) def fill_gaussian(self, mean, std, out): self.rnd.fill_normal(out) self.mult_st(std, out, out=out) self.add_st(mean, out, out=out) def generate_probability_mask(self, mask, probability): self.rnd.fill_uniform(mask) create_probabilistic_mask_kernel(mask, probability, mask) def index_m_by_v(self, m, v, out): index_m_by_v_kernel(out, v, m, m.shape[0], m.shape[1]) def log_t(self, a, out): cumath.log(a, out=out) def maxpool2d_backward_batch(self, inputs, window, outputs, padding, stride, argmax, in_deltas, out_deltas): in_image_size = inputs.size // inputs.shape[0] out_image_size = outputs.size // outputs.shape[0] _maxpool_bwd_fp32_impl(np.int32(outputs.size), out_deltas, argmax, np.int32(out_image_size), np.int32(in_image_size), in_deltas, block=(NUM_CUDA_THREADS, 1, 1), grid=(get_blocks(outputs.size), 1)) def maxpool2d_forward_batch(self, inputs, window, outputs, padding, stride, argmax): n, h, w, c = inputs.shape o_h, o_w = outputs.shape[1], outputs.shape[2] _maxpool_fwd_fp32_impl(np.int32(outputs.size), inputs, np.int32(h), np.int32(w), np.int32(c), np.int32(o_h), np.int32(o_w), np.int32(window[0]), np.int32(window[1]), np.int32(stride[0]), np.int32(stride[1]), np.int32(padding), np.int32(padding), outputs, argmax, block=(NUM_CUDA_THREADS, 1, 1), grid=(get_blocks(outputs.size), 1)) def merge_tt(self, a, b, out): assert(a.shape[-1] + b.shape[-1] == out.shape[-1]) n = int(np.prod(out.shape[:-1])) grid, block = self._get_gridsize(n) _merge_impl(a.gpudata, b.gpudata, out.gpudata, np.int32(n), np.int32(a.shape[-1]), np.int32(b.shape[-1]), block=block, grid=grid) def modulo_tt(self, a, b, out): modulo_tt_kernel(a, b, out) def mult_add_st(self, s, t, out): mult_add_st_kernel(s, t, out) def mult_add_tt(self, a, b, out): mult_add_kernel(a, b, out) def mult_mv(self, m, v, out): if m.shape == v.shape: self.mult_tt(m, v, out=out) else: cumisc.mult_matvec(m, v, out=out) def mult_add_mv(self, m, v, out): if m.shape == v.shape: self.mult_add_tt(m, v, out=out) else: tmp = self.allocate(out.shape) cumisc.mult_matvec(m, v, out=tmp) self.add_tt(tmp, out, out=out) def mult_st(self, s, t, out): mult_st_kernel(s, t, out) def mult_tt(self, a, b, out): mult_tt_kernel(a, b, out) def sign_t(self, a, out): sign_kernel(a, out) def split_add_tt(self, x, out_a, out_b): assert(out_a.shape[-1] + out_b.shape[-1] == x.shape[-1]) n = int(np.prod(x.shape[:-1])) grid, block = self._get_gridsize(n) _split_add_impl(x.gpudata, out_a.gpudata, out_b.gpudata, np.int32(n), np.int32(out_a.shape[-1]), np.int32(out_b.shape[-1]), block=block, grid=grid) def sqrt_t(self, a, out): cumath.sqrt(a, out=out) def subtract_mv(self, m, v, out): cumisc.binaryop_matvec('-', m, v, None, out, None) def subtract_tt(self, a, b, out): subtract_mm_kernel(a, b, out) def sum_t(self, a, axis, out): if len(a.shape) < 3 and (axis == 0 or axis == 1): cumisc.sum(a, axis=axis, out=out) elif axis is None: cumisc.sum(a.reshape((a.size, 1)), axis=0, out=out) else: raise NotImplementedError # ------------------------ Activation functions ------------------------- # def rel(self, x, y): rel_kernel(x, y) def rel_deriv(self, x, y, dy, dx): rel_deriv_kernel(x, y, dy, dx) def sigmoid(self, x, y): sigmoid_kernel(x, y) def sigmoid_deriv(self, x, y, dy, dx): sigmoid_deriv_kernel(x, y, dy, dx) def softmax_m(self, m, out): n, k = m.shape tmp = gpuarray.empty((1, n), dtype=m.dtype) _softmax_impl(m, tmp.gpudata, out, np.int32(n), np.int32(k), block=(32, 1, 1), grid=(n, 1, 1)) return out def tanh(self, x, y): tanh_kernel(x, y) def tanh_deriv(self, x, y, dy, dx): tanh_deriv_kernel(x, y, dy, dx)
class PyCudaHandler(Handler): __undescribed__ = {'context', 'dtype', 'EMPTY', 'rnd'} def __init__(self, seed=None): super(PyCudaHandler, self).__init__() self.dtype = np.float32 self.context = cumisc._global_cublas_handle self.EMPTY = gpuarray.zeros((), dtype=self.dtype) if seed is None: seed = global_rnd.generate_seed() def get_seeds(n): return gpuarray.to_gpu(np.ones(n, np.int32) * seed) self.rnd = XORWOWRandomNumberGenerator(seed_getter=get_seeds) array_type = pycuda.gpuarray.GPUArray def __init_from_description__(self, description): self.__init__() def _get_gridsize(self, n): min_threads = 32 max_threads = 256 max_blocks = 384 if n < min_threads: block_count = 1 threads_per_block = min_threads elif n < (max_blocks * min_threads): block_count = (n + min_threads - 1) // min_threads threads_per_block = min_threads elif n < (max_blocks * max_threads): block_count = max_blocks grp = (n + min_threads - 1) // min_threads threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads else: block_count = max_blocks threads_per_block = max_threads return (block_count, 1), (threads_per_block, 1, 1) # ------------------------- Allocate new memory ------------------------- # def allocate(self, size): return gpuarray.zeros(size, dtype=self.dtype) def ones(self, shape): a = self.zeros(shape) self.fill(a, 1.0) return a def zeros(self, shape): return gpuarray.zeros(shape=shape, dtype=self.dtype) # ---------------------------- Copy and Fill ---------------------------- # def copy_to(self, src, dest): # Copy data from src to dest (both must be GPUArrays) pycuda.driver.memcpy_dtod(dest.gpudata, src.gpudata, dest.nbytes) def copy_to_if(self, src, dest, cond): copy_to_if_kernel(src, dest, cond) def create_from_numpy(self, arr): return gpuarray.to_gpu(arr.astype(self.dtype)) def fill(self, mem, val): mem.fill(val) def fill_if(self, mem, val, cond): fill_if_kernel(mem, val, cond) def get_numpy_copy(self, mem): assert type(mem) == self.array_type return mem.get() def set_from_numpy(self, mem, arr): assert mem.shape == arr.shape, "Shape of destination ({}) != Shape " \ "of source ({})".format(mem.shape, arr.shape) mem.set(arr.astype(self.dtype)) # ---------------------------- Debug helpers ---------------------------- # def is_fully_finite(self, a): temp = gpuarray.zeros_like(a) check_inf_or_nan_kernel(a, temp) return not np.any(temp.get()) # ----------------------- Mathematical operations ----------------------- # def abs_t(self, a, out): cumath.fabs(a, out=out) def add_into_if(self, a, out, cond): add_into_if_kernel(a, out, cond) def add_mv(self, m, v, out): cumisc.add_matvec(m, v, out=out) def add_st(self, s, t, out): add_st_kernel(s, t, out) def add_tt(self, a, b, out): add_mm_kernel(a, b, out) def avgpool2d_backward_batch(self, inputs, window, outputs, padding, stride, in_deltas, out_deltas): n, h, w, c = inputs.shape o_h, o_w = outputs.shape[1], outputs.shape[2] _avepool_bwd_fp32_impl(np.int32(inputs.size), out_deltas, np.int32(n), np.int32(h), np.int32(w), np.int32(c), np.int32(o_h), np.int32(o_w), np.int32(window[0]), np.int32(window[1]), np.int32(stride[0]), np.int32(stride[1]), np.int32(padding), np.int32(padding), in_deltas, block=(get_blocks(inputs.size), 1, 1), grid=(NUM_CUDA_THREADS, 1, 1)) def avgpool2d_forward_batch(self, inputs, window, outputs, padding, stride): n, h, w, c = inputs.shape o_h, o_w = outputs.shape[1], outputs.shape[2] _avepool_fwd_fp32_impl(np.int32(outputs.size), inputs, np.int32(n), np.int32(h), np.int32(w), np.int32(c), np.int32(o_h), np.int32(o_w), np.int32(window[0]), np.int32(window[1]), np.int32(stride[0]), np.int32(stride[1]), np.int32(padding), np.int32(padding), outputs, block=(get_blocks(outputs.size), 1, 1), grid=(NUM_CUDA_THREADS, 1, 1)) def binarize_v(self, v, out): binarize_v_kernel(out, v, out.shape[0], out.shape[1]) def broadcast_t(self, a, axis, out): broadcast_dim = int(out.shape[axis]) stride = int(np.prod(out.shape[axis+1:])) broadcast_t_kernel(out, a, broadcast_dim, stride) def clip_t(self, a, a_min, a_max, out): clip_kernel(a, out, a_min, a_max) def conv2d_backward_batch(self, inputs, params, padding, stride, in_deltas, out_deltas, dparams, dbias): num_filters = params.shape[0] num_images, input_rows, input_cols, num_input_maps = inputs.shape kernel_shape = params.shape[1:] num_output_pixels = out_deltas.shape[1] * out_deltas.shape[2] num_kernel_params = np.prod(kernel_shape) dparams.fill(0.0) dbias.fill(0.0) tmp = self.zeros(dbias.shape) col = self.zeros((num_output_pixels, num_kernel_params)) for i in range(num_images): num_cuda_kernels = num_output_pixels * num_input_maps _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i], np.int32(input_rows), np.int32(input_cols), np.int32(kernel_shape[0]), np.int32(kernel_shape[1]), np.int32(padding), np.int32(padding), np.int32(stride[0]), np.int32(stride[1]), np.int32(out_deltas.shape[2]), np.int32(num_input_maps), col.gpudata, block=(get_blocks(num_cuda_kernels), 1, 1), grid=(NUM_CUDA_THREADS, 1, 1)) # Compute gradients reshaped_dparams = dparams.reshape(num_filters, num_kernel_params) reshaped_out_deltas = out_deltas[i].reshape((num_output_pixels, num_filters)) self.dot_add_mm(reshaped_out_deltas, col, out=reshaped_dparams, transa=True) self.sum_t(reshaped_out_deltas, axis=0, out=tmp) self.add_tt(tmp, dbias, out=dbias) # Compute in_deltas reshaped_params = params.reshape((num_filters, num_kernel_params)) self.dot_mm(reshaped_out_deltas, reshaped_params, out=col) num_cuda_kernels = input_rows * input_cols * num_input_maps _col2im_fp32_impl(np.int32(num_cuda_kernels), col.gpudata, np.int32(input_cols), np.int32(num_input_maps), np.int32(kernel_shape[0]), np.int32(kernel_shape[1]), np.int32(padding), np.int32(padding), np.int32(stride[0]), np.int32(stride[1]), np.int32(out_deltas.shape[1]), np.int32(out_deltas.shape[2]), in_deltas[i], block=(get_blocks(num_cuda_kernels), 1, 1), grid=(NUM_CUDA_THREADS, 1, 1)) def conv2d_forward_batch(self, inputs, params, bias, outputs, padding, stride): num_filters = params.shape[0] num_images, input_rows, input_cols, num_input_maps = inputs.shape kernel_shape = params.shape[1:] num_output_pixels = outputs.shape[1] * outputs.shape[2] num_kernel_params = np.prod(kernel_shape) out_shape = (num_output_pixels, num_filters) num_cuda_kernels = num_output_pixels * num_input_maps for i in range(num_images): col = self.zeros((num_output_pixels, num_kernel_params)) _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i], np.int32(input_rows), np.int32(input_cols), np.int32(kernel_shape[0]), np.int32(kernel_shape[1]), np.int32(padding), np.int32(padding), np.int32(stride[0]), np.int32(stride[1]), np.int32(outputs.shape[2]), np.int32(num_input_maps), col.gpudata, block=(get_blocks(num_cuda_kernels), 1, 1), grid=(NUM_CUDA_THREADS, 1, 1)) reshaped_params = params.reshape(num_filters, num_kernel_params) culinalg.dot(col, reshaped_params, transb='T', out=outputs[i].reshape(out_shape)) flat_outputs = flatten_all_but_last(outputs) self.add_mv(flat_outputs, bias, flat_outputs) def dot_add_mm(self, a, b, out, transa=False, transb=False): transa = 'T' if transa else 'N' transb = 'T' if transb else 'N' culinalg.add_dot(a, b, out, transa, transb) def dot_mm(self, a, b, out, transa=False, transb=False): transa = 'T' if transa else 'N' transb = 'T' if transb else 'N' culinalg.dot(a, b, transa=transa, transb=transb, out=out) def divide_mv(self, m, v, out): cumisc.div_matvec(m, v, out=out) def divide_tt(self, a, b, out): div_kernel(a, b, out) def fill_gaussian(self, mean, std, out): self.rnd.fill_normal(out) self.mult_st(std, out, out=out) self.add_st(mean, out, out=out) def generate_probability_mask(self, mask, probability): self.rnd.fill_uniform(mask) create_probabilistic_mask_kernel(mask, probability, mask) def index_m_by_v(self, m, v, out): index_m_by_v_kernel(out, v, m, m.shape[0], m.shape[1]) def log_t(self, a, out): cumath.log(a, out=out) def maxpool2d_backward_batch(self, inputs, window, outputs, padding, stride, argmax, in_deltas, out_deltas): in_image_size = inputs.size // inputs.shape[0] out_image_size = outputs.size // outputs.shape[0] _maxpool_bwd_fp32_impl(np.int32(outputs.size), out_deltas, argmax, np.int32(out_image_size), np.int32(in_image_size), in_deltas, block=(get_blocks(outputs.size), 1, 1), grid=(NUM_CUDA_THREADS, 1, 1)) def maxpool2d_forward_batch(self, inputs, window, outputs, padding, stride, argmax): n, h, w, c = inputs.shape o_h, o_w = outputs.shape[1], outputs.shape[2] _maxpool_fwd_fp32_impl(np.int32(outputs.size), inputs, np.int32(h), np.int32(w), np.int32(c), np.int32(o_h), np.int32(o_w), np.int32(window[0]), np.int32(window[1]), np.int32(stride[0]), np.int32(stride[1]), np.int32(padding), np.int32(padding), outputs, argmax, block=(get_blocks(outputs.size), 1, 1), grid=(NUM_CUDA_THREADS, 1, 1)) def merge_tt(self, a, b, out): assert(a.shape[-1] + b.shape[-1] == out.shape[-1]) n = int(np.prod(out.shape[:-1])) grid, block = self._get_gridsize(n) _merge_impl(a.gpudata, b.gpudata, out.gpudata, np.int32(n), np.int32(a.shape[-1]), np.int32(b.shape[-1]), block=block, grid=grid) def modulo_tt(self, a, b, out): modulo_tt_kernel(a, b, out) def mult_add_st(self, s, t, out): mult_add_st_kernel(s, t, out) def mult_add_tt(self, a, b, out): mult_add_kernel(a, b, out) def mult_mv(self, m, v, out): if m.shape == v.shape: self.mult_tt(m, v, out=out) else: cumisc.mult_matvec(m, v, out=out) def mult_add_mv(self, m, v, out): if m.shape == v.shape: self.mult_add_tt(m, v, out=out) else: tmp = self.allocate(out.shape) cumisc.mult_matvec(m, v, out=tmp) self.add_tt(tmp, out, out=out) def mult_st(self, s, t, out): mult_st_kernel(s, t, out) def mult_tt(self, a, b, out): mult_tt_kernel(a, b, out) def sign_t(self, a, out): sign_kernel(a, out) def split_add_tt(self, x, out_a, out_b): assert(out_a.shape[-1] + out_b.shape[-1] == x.shape[-1]) n = int(np.prod(x.shape[:-1])) grid, block = self._get_gridsize(n) _split_add_impl(x.gpudata, out_a.gpudata, out_b.gpudata, np.int32(n), np.int32(out_a.shape[-1]), np.int32(out_b.shape[-1]), block=block, grid=grid) def sqrt_t(self, a, out): cumath.sqrt(a, out) def subtract_mv(self, m, v, out): cumisc.binaryop_matvec('-', m, v, None, out, None) def subtract_tt(self, a, b, out): subtract_mm_kernel(a, b, out) def sum_t(self, a, axis, out): if len(a.shape) < 3 and (axis == 0 or axis == 1): cumisc.sum(a, axis, out) elif axis is None: self.copy_to(cumisc.sum(a), out) else: raise NotImplementedError # ------------------------ Activation functions ------------------------- # def rel(self, x, y): rel_kernel(x, y) def rel_deriv(self, x, y, dy, dx): rel_deriv_kernel(x, y, dy, dx) def sigmoid(self, x, y): sigmoid_kernel(x, y) def sigmoid_deriv(self, x, y, dy, dx): sigmoid_deriv_kernel(x, y, dy, dx) def softmax_m(self, m, out): n, k = m.shape tmp = gpuarray.empty((1, n), dtype=m.dtype) _softmax_impl(m, tmp.gpudata, out, np.int32(n), np.int32(k), block=(32, 1, 1), grid=(n, 1, 1)) return out def tanh(self, x, y): tanh_kernel(x, y) def tanh_deriv(self, x, y, dy, dx): tanh_deriv_kernel(x, y, dy, dx)