def repulsion(first_permutation: List[int], second_permutation: List[int], random_states):
    """Shuffle ``second_permutation`` in place to move it away from the first.

    Two positions of ``second_permutation`` are drawn at random and swapped.
    This repeats until the Hamming distance to ``first_permutation`` becomes
    strictly larger than the distance measured on entry, or until
    ``len(first_permutation)`` swaps have been made, whichever happens first.
    Swaps are never reverted.

    :param first_permutation: reference permutation (only read)
    :param second_permutation: permutation modified in place
    :param random_states: xoroshiro128p states, indexed by ``cuda.threadIdx.x``
    """
    size = len(first_permutation)
    lane = cuda.threadIdx.x
    baseline = hamming_distance(first_permutation, second_permutation)

    current = 0
    swaps_done = 0
    while current <= baseline:
        swaps_done += 1
        # Two independent draws; they may coincide, which makes the swap a no-op.
        src = int(xoroshiro128p_uniform_float64(random_states, lane) * size)
        dst = int(xoroshiro128p_uniform_float64(random_states, lane) * size)
        second_permutation[src], second_permutation[dst] = (
            second_permutation[dst],
            second_permutation[src],
        )
        current = hamming_distance(first_permutation, second_permutation)
        # Bound the work: give up after `size` swaps.
        if swaps_done == size:
            break
def rejection_resample(state_array, state_buffer, weight, rng, weight_max_array):
    '''
    General resample function for particle filter (rejection resampling).

    In paper : Parallel Resampling in the Particle Filter
    (Journal of Computational and Graphical Statistics)

    Each in-range thread owns one particle (one column of ``state_array``).
    It copies its column into ``state_buffer``, takes part in a per-block
    max-reduction of the weights whose result is folded into
    ``weight_max_array[0]`` with an atomic max, then proposes ancestor
    indices until one is accepted (or 1000 proposals were made) and copies
    that ancestor's buffered state back into its own column.

    The block-level reduction uses a 1024-slot shared array and halves the
    stride starting from ``blockDim.x >> 1``, so the block size is expected to
    be a power of two no larger than 1024.

    NOTE(review): there is no grid-wide synchronisation, so with more than
    one block the reset/accumulation of ``weight_max_array[0]``, the reads of
    ``state_buffer[:, j]`` and the reset of ``weight`` can race with other
    blocks - confirm how the kernel is launched.

    :param state_array: 2-D states, one column per particle; overwritten
    :param state_buffer: buffer array, its size same to state_array
    :param weight: per-particle weights; each entry is reset to 1 / N
    :param rng: states consumed by ``xoroshiro128p_uniform_float64``,
        indexed by the global thread id
    :param weight_max_array: buffer, its size same to weight; element 0
        receives the maximum weight
    :return: None
    '''
    pos = cuda.grid(1)
    tid = cuda.threadIdx.x
    n_particles = state_array.shape[1]
    n_states = state_buffer.shape[0]
    sdata = cuda.shared.array(shape=(1024), dtype=float64)

    active = pos < n_particles
    if active:
        if pos == 0:
            weight_max_array[0] = 0.0
        for i in range(n_states):
            state_buffer[i, pos] = state_array[i, pos]
        sdata[tid] = weight[pos]
    else:
        # Tail threads of the last block must not leave their slot
        # uninitialised; 0.0 is neutral because the running maximum is
        # itself reset to 0.0.
        sdata[tid] = 0.0
    # Every thread of the block reaches the barriers below (they used to sit
    # inside the divergent `if`), and the first one makes the stores above
    # visible before the reduction starts.
    cuda.syncthreads()

    s = cuda.blockDim.x >> 1
    while s > 0:
        if tid < s:
            sdata[tid] = max(sdata[tid], sdata[tid + s])
        s = s >> 1
        cuda.syncthreads()

    if active:
        if tid == 0:
            cuda.atomic.max(weight_max_array, 0, sdata[0])

        j = pos
        u = xoroshiro128p_uniform_float64(rng, pos)
        counter = 0
        while u > weight[j] / weight_max_array[0] and counter < 1000:
            counter = counter + 1
            # The draw is in [0, 1), so truncation gives an index in
            # [0, n_particles - 1].  The previous ceil() produced
            # 1..n_particles, i.e. an out-of-bounds index and (almost)
            # never index 0.
            j = min(
                int(xoroshiro128p_uniform_float64(rng, pos) * n_particles),
                n_particles - 1)
            u = xoroshiro128p_uniform_float64(rng, pos)

        for i in range(n_states):
            state_array[i, pos] = state_buffer[i, j]
        weight[pos] = 1.0 / float64(n_particles)
    cuda.syncthreads()
def cuda_step_stick(positions, g_x, g_y, g_z, phases, rng_states, time_point,
                    n_of_spins, gamma, step_length, dt, orientation):
    """Kernel function for 1D diffusion.

    One thread moves one spin by ``step_length`` along ``+orientation`` or
    ``-orientation`` (chosen by a single uniform draw compared with 0.5),
    stores the new position and adds the resulting phase increment to every
    measurement column of ``phases``.

    ``positions`` is indexed ``[axis, spin]``, ``phases`` is indexed
    ``[measurement, spin]`` and the gradients are indexed
    ``[time_point, measurement]``.
    """
    spin = cuda.grid(1)
    if spin >= n_of_spins:
        return

    # Per-thread scratch for the displacement vector.
    displacement = cuda.local.array(3, numba.double)

    forward = xoroshiro128p_uniform_float64(rng_states, spin) > .5
    for axis in range(3):
        if forward:
            displacement[axis] = orientation[axis] * step_length
        else:
            displacement[axis] = -orientation[axis] * step_length

    for axis in range(3):
        positions[axis, spin] = positions[axis, spin] + displacement[axis]

    # Phase accumulated at the updated position for every measurement.
    for measurement in range(g_x.shape[1]):
        phases[measurement, spin] += gamma * dt * \
            (g_x[time_point, measurement] * positions[0, spin] +
             g_y[time_point, measurement] * positions[1, spin] +
             g_z[time_point, measurement] * positions[2, spin])
def cuda_step_plane(positions, g_x, g_y, g_z, phases, rng_states, time_point,
                    n_of_spins, gamma, step_length, dt, directions):
    """Kernel function for 2D diffusion.

    One thread moves one spin by ``step_length`` along a random direction in
    the plane spanned by the two vectors stored in ``directions``
    (``directions[0:3]`` and ``directions[3:6]``), then adds the phase
    increment at the new position to every measurement column of ``phases``.
    """
    spin = cuda.grid(1)
    if spin >= n_of_spins:
        return

    # Random in-plane angle in [0, 2*pi).
    angle = xoroshiro128p_uniform_float64(rng_states, spin) * 6.283185307179586
    cos_angle = math.cos(angle)
    sin_angle = math.sin(angle)

    for axis in range(3):
        in_plane = cos_angle * directions[axis] + sin_angle * directions[axis + 3]
        positions[axis, spin] = positions[axis, spin] + step_length * in_plane

    # Phase accumulated at the updated position for every measurement.
    for measurement in range(g_x.shape[1]):
        phases[measurement, spin] += gamma * dt * \
            (g_x[time_point, measurement] * positions[0, spin] +
             g_y[time_point, measurement] * positions[1, spin] +
             g_z[time_point, measurement] * positions[2, spin])
def integration_kernel(MCresult, domain, parameters, domain_range, total_size, batch_size, i_batch, rng_states, num_points, parameter_shape,parameter_off_set):
    """Monte Carlo kernel: one thread accumulates samples for one parameter set.

    Thread ``t`` of batch ``i_batch`` handles the flat parameter index
    ``t + i_batch * batch_size``.  That index is unravelled into one index per
    parameter, shifted into the flat ``parameters`` array with
    ``parameter_off_set`` and replaced by the parameter values.  Then
    ``num_points`` points are drawn in the domain and ``fun(point, values)``
    is added to ``MCresult[t]``.

    ``num_parameters``, ``dim``, ``fun`` and ``unravel`` are resolved from the
    enclosing scope, not passed as arguments.
    """
    thread_id = cuda.grid(1)
    if thread_id >= batch_size:
        return
    parameter_id = thread_id + i_batch * batch_size
    if parameter_id >= total_size:
        return

    # NOTE(review): this buffer is int32 but ends up holding entries of
    # `parameters`; if those are floats they are truncated - confirm dtype.
    grid_point = cuda.local.array(shape=num_parameters, dtype=nb.int32)
    for p in range(num_parameters):
        grid_point[p] = 0
    unravel(num_parameters, parameter_shape, parameter_id, grid_point)

    # Per-parameter indices -> positions in the flat `parameters` array
    # (the first parameter needs no offset).
    for p in range(1, num_parameters):
        grid_point[p] = grid_point[p] + parameter_off_set[p - 1]

    # Replace the positions by the parameter values themselves.
    for p in range(num_parameters):
        grid_point[p] = parameters[grid_point[p]]

    sample = cuda.local.array(shape=dim, dtype=nb.float64)
    for i_sample in range(num_points):
        for axis in range(dim):
            sample[axis] = xoroshiro128p_uniform_float64(rng_states, thread_id) * domain_range[axis] + domain[axis][0]
        # Add this point's contribution to the thread's own accumulator.
        cuda.atomic.add(MCresult, thread_id, fun(sample, grid_point))
def integration_kernel(MCresult, num_points_in_one_chunk, num_chunks_in_one_dimension, domain, domain_range, batch_size, i_batch, rng_states, num_chunks):
    """Monte Carlo kernel: one thread accumulates samples for one chunk.

    Thread ``t`` of batch ``i_batch`` handles chunk ``t + i_batch * batch_size``.
    The flat chunk id is converted to one index per dimension, which gives the
    chunk's lower corner.  ``num_points_in_one_chunk`` points are then drawn
    inside the chunk and ``fun(point)`` is added to ``MCresult[t]``.

    ``dim``, ``fun`` and ``oneD_to_nD`` are resolved from the enclosing scope.
    """
    thread_id = cuda.grid(1)
    if thread_id >= batch_size:
        return
    chunk_id = thread_id + i_batch * batch_size
    if chunk_id >= num_chunks:
        return

    # Per-dimension chunk index, filled in by oneD_to_nD.
    chunk_index = cuda.local.array(shape=dim, dtype=nb.int64)
    for axis in range(dim):
        chunk_index[axis] = 0
    oneD_to_nD(num_chunks_in_one_dimension, chunk_id, chunk_index)

    # Lower corner of this chunk.
    lower = cuda.local.array(shape=dim, dtype=nb.float64)
    for axis in range(dim):
        lower[axis] = domain[axis][0] + chunk_index[axis] * domain_range[axis]

    sample = cuda.local.array(shape=dim, dtype=nb.float64)
    for i_sample in range(num_points_in_one_chunk):
        for axis in range(dim):
            sample[axis] = xoroshiro128p_uniform_float64(rng_states, thread_id) * domain_range[axis] + lower[axis]
        # Sum the contributions of all points of the chunk.
        cuda.atomic.add(MCresult, thread_id, fun(sample))
def integration_kernel(num_loops, MCresult, chunk_size, n_chunk_x, domain, domain_range, batch_size, i_batch, rng_states, n_chunk):
    """Monte Carlo kernel: one thread accumulates ``chunk_size`` samples of a chunk.

    Thread ``t`` of batch ``i_batch`` handles chunk ``t + i_batch * batch_size``;
    the flat id is turned into a per-dimension index to locate the chunk's
    lower corner, and ``fun`` evaluated at random points of the chunk is
    added to ``MCresult[t]``.

    ``num_loops`` is not used by the body.  ``dim``, ``fun`` and
    ``oneD_to_nD`` are resolved from the enclosing scope.
    """
    thread_id = cuda.grid(1)
    if thread_id >= batch_size:
        return
    chunk_id = thread_id + i_batch * batch_size
    if chunk_id >= n_chunk:
        return

    # Per-dimension chunk index, filled in by oneD_to_nD.
    chunk_index = cuda.local.array(shape=dim, dtype=nb.int64)
    for axis in range(dim):
        chunk_index[axis] = 0
    oneD_to_nD(n_chunk_x, chunk_id, chunk_index)

    # Lower corner of the local domain.
    lower = cuda.local.array(dim, dtype=nb.float64)
    for axis in range(dim):
        lower[axis] = domain[axis][0] + chunk_index[axis] * domain_range[axis]

    sample = cuda.local.array(dim, dtype=nb.float64)
    for i_sample in range(chunk_size):
        for axis in range(dim):
            sample[axis] = xoroshiro128p_uniform_float64(rng_states, thread_id) * domain_range[axis] + lower[axis]
        # Feed the point to the user-defined function.
        cuda.atomic.add(MCresult, thread_id, fun(sample))
def em_discrete(previous_permutations: List[List[int]], next_permutations: List[List[int]],
                qap_values: List[int], weights: List[List[int]], distances: List[List[int]],
                max_hamming_distance: int, random_states, pmx_buffer):
    """One step of the discrete search: one thread updates one permutation.

    Each thread with ``threadIdx.x < len(previous_permutations)``:

    1. evaluates its permutation with ``qap_device`` and stores the value in
       ``qap_values``;
    2. finds the index of the smallest value (the best permutation is only
       copied to ``next_permutations``, never modified);
    3. copies its permutation to ``next_permutations`` and then, for every
       other permutation within ``max_hamming_distance``, either crosses over
       with it via ``pmx`` (when the other one has a smaller value) or moves
       away from it via ``repulsion``.

    :param previous_permutations: current population (read only)
    :param next_permutations: receives the updated population
    :param qap_values: per-permutation objective values (written here)
    :param weights: passed through to ``qap_device``
    :param distances: passed through to ``qap_device``
    :param max_hamming_distance: neighbourhood radius (exclusive)
    :param random_states: xoroshiro128p states, indexed by ``threadIdx.x``
    :param pmx_buffer: per-thread scratch rows receiving the ``pmx`` result

    NOTE(review): the barriers sit inside the ``thread_id < population``
    branch, so the block should not contain threads beyond the population.
    NOTE(review): the collapsed source leaves the pairing of the ``else``
    ambiguous; it is paired with the value comparison here - confirm.
    """
    thread_id = cuda.threadIdx.x
    population = len(previous_permutations)
    if thread_id < population:
        dimension = len(previous_permutations[thread_id])
        qap_values[thread_id] = qap_device(previous_permutations[thread_id], weights, distances)
        cuda.syncthreads()

        best_value_index = 0
        for i in range(population):
            if qap_values[best_value_index] > qap_values[i]:
                best_value_index = i
        cuda.syncthreads()

        # copy current permutation to next permutation
        for i in range(dimension):
            next_permutations[thread_id][i] = previous_permutations[thread_id][i]

        if thread_id == best_value_index:
            return

        # search surroundings
        for i in range(population):
            if i == thread_id:
                continue
            # Distance to the *other* permutation; comparing the thread's
            # permutation with itself always gave 0.
            if hamming_distance(previous_permutations[thread_id],
                                previous_permutations[i]) < max_hamming_distance:
                if qap_values[thread_id] > qap_values[i]:
                    # The draw is in [0, 1): scale by the dimension before
                    # truncating, otherwise both bounds are always 0.
                    first_bound = int(
                        xoroshiro128p_uniform_float64(random_states, thread_id) * dimension)
                    second_bound = int(
                        xoroshiro128p_uniform_float64(random_states, thread_id) * dimension)
                    lower_bound = min(first_bound, second_bound)
                    upper_bound = max(first_bound, second_bound)
                    pmx(previous_permutations[i], next_permutations[thread_id],
                        lower_bound, upper_bound, pmx_buffer[thread_id])
                    for j in range(dimension):
                        next_permutations[thread_id][j] = pmx_buffer[thread_id][j]
                else:
                    repulsion(previous_permutations[i], next_permutations[thread_id], random_states)
def rng_kernel_float64(states, out, count, distribution):
    """Fill ``out`` with ``count`` float64 random numbers per thread.

    Thread ``t`` writes the contiguous slots ``out[t * count : (t + 1) * count]``
    using its own generator state.  ``distribution`` selects uniform
    (``UNIFORM``) or normal (``NORMAL``) draws; any other value writes nothing.
    """
    thread_id = cuda.grid(1)
    first_slot = thread_id * count
    if distribution == UNIFORM:
        for offset in range(count):
            out[first_slot + offset] = xoroshiro128p_uniform_float64(states, thread_id)
    elif distribution == NORMAL:
        for offset in range(count):
            out[first_slot + offset] = xoroshiro128p_normal_float64(states, thread_id)
def random_flip(spins, shape_shifts, rng_states):
    """Flip one randomly chosen bit of the packed ``spins`` array.

    A flat bit index is assembled dimension by dimension (last dimension
    first): for each dimension the running index is shifted left by
    ``shape_shifts[a]`` bits and a random coordinate in
    ``[0, 2**shape_shifts[a])`` is added.  The bit at that index (8 bits per
    ``spins`` element) is then toggled.
    """
    thread = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    bit_index = 0
    axis = shape_shifts.size - 1
    while axis >= 0:
        bits = shape_shifts[axis]
        bit_index <<= bits
        bit_index += np.int64(
            ncrand.xoroshiro128p_uniform_float64(rng_states, thread) * (1 << bits))
        axis -= 1

    # High bits select the element, the low three bits the bit inside it.
    spins[bit_index >> 3] ^= 1 << (bit_index & 7)
def determine_scatterings_cuda(N_batch, batch_size, elec_Ntot,
                               nscatter_per_elec, nscatter_per_batch, random_states, dt,
                               elec_ux, elec_uy, elec_uz, elec_inv_gamma,
                               ratio_w_electron_photon, photon_n, photon_p,
                               photon_beta_x, photon_beta_y, photon_beta_z):
    """
    For each electron macroparticle, decide how many photon macroparticles
    it will emit during `dt`, using the integrated Klein-Nishina formula.

    Electrons are processed in batches of size `batch_size`, one thread per
    batch.  Each thread fills `nscatter_per_elec` for its electrons and
    stores their total in `nscatter_per_batch`, which allows quicker
    calculation of the total number of photons to be created.
    """
    i_batch = cuda.grid(1)
    if i_batch >= N_batch:
        return

    # Start the batch total from zero
    nscatter_per_batch[i_batch] = 0

    # The last batch may be shorter than `batch_size`
    first = i_batch * batch_size
    stop = min((i_batch + 1) * batch_size, elec_Ntot)
    for ip in range(first, stop):
        # Reset the count for this electron
        nscatter_per_elec[ip] = 0

        # Probability of scattering for this electron
        p = get_scattering_probability(dt, elec_ux[ip], elec_uy[ip],
                                       elec_uz[ip], elec_inv_gamma[ip], photon_n[ip],
                                       photon_p, photon_beta_x, photon_beta_y, photon_beta_z)

        # Stochastic rounding of the expected number of photons: since r is
        # in [0, 1), p = 0 always gives 0, and a very small p gives 1 with
        # probability p * ratio_w_electron_photon and 0 otherwise.
        r = xoroshiro128p_uniform_float64(random_states, i_batch)
        nscatter = int(p * ratio_w_electron_photon + r)

        nscatter_per_elec[ip] = nscatter
        nscatter_per_batch[i_batch] += nscatter
def weight_mutation(rng_states, out, mutation_rate):
    """Randomly perturb elements of the 2-D array ``out`` in place.

    One thread handles one element ``(x, y)`` of a 2-D launch grid.  It draws
    a uniform number with the generator state ``x * out.shape[1] + y``; when
    the draw is below ``mutation_rate`` the element is shifted by
    ``draw - 0.5``.

    NOTE(review): the same draw decides whether to mutate and sets the size
    of the shift, so the shift always lies in ``[-0.5, mutation_rate - 0.5)``
    - confirm this is intended.
    """
    # find the position of element to operate on
    x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    # Quit if (x, y) is outside of the array on *either* axis.  The previous
    # `and` only rejected threads outside on both axes, letting the others
    # read and write out of bounds.
    if x >= out.shape[0] or y >= out.shape[1]:
        return

    # generate the random number.
    rand = xoroshiro128p_uniform_float64(rng_states, x * out.shape[1] + y)
    if rand < mutation_rate:
        out[x, y] = out[x, y] + rand - 0.5
def biase_crossover(rng_states, out, mutation_rate, mother):
    """Randomly overwrite elements of ``out`` with those of ``mother``.

    One thread handles one element ``(x, y)`` of a 2-D launch grid.  It draws
    a uniform number with the generator state ``x * out.shape[1] + y``; when
    the draw is below ``mutation_rate`` the element of ``out`` is replaced by
    the element of ``mother`` at the same position.
    """
    # find the position of element to operate on
    x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    # Quit if (x, y) is outside of the array on *either* axis.  The previous
    # `and` only rejected threads outside on both axes, letting the others
    # read and write out of bounds.
    if x >= out.shape[0] or y >= out.shape[1]:
        return

    # generate random number
    rand = xoroshiro128p_uniform_float64(rng_states, x * out.shape[1] + y)

    # condition for crossover
    if rand < mutation_rate:
        out[x, y] = mother[x, y]
def integration_kernel(MCresult, domain, domain_range, rng_states, num_points):
    """Monte Carlo kernel: one thread evaluates ``fun`` at one random point.

    Every thread below ``num_points`` draws one point in the domain
    (``domain[j][0]`` is the lower bound and ``domain_range[j]`` the extent
    of dimension ``j``) and atomically adds ``fun(point)`` to ``MCresult[0]``.

    ``dim`` and ``fun`` are resolved from the enclosing scope.
    """
    thread_id = cuda.grid(1)
    if thread_id >= num_points:
        return

    # Per-thread scratch holding the coordinates of the sample.
    sample = cuda.local.array(shape=dim, dtype=nb.float64)
    for axis in range(dim):
        draw = xoroshiro128p_uniform_float64(rng_states, thread_id)
        sample[axis] = draw * domain_range[axis] + domain[axis][0]

    # All threads accumulate into the same global slot.
    cuda.atomic.add(MCresult, 0, fun(sample))
def metropolis_step(spins, shape_shifts, temperature, field, coupling_indices,
                    coupling_constants, block_shifts, offsets, rng_states):
    """
    Multi-spin Metropolis algorithm: each thread proposes one spin flip.

    The thread index is decomposed, dimension by dimension (last first), into
    a coarse coordinate; together with ``offsets`` and a random position
    inside the thread's block (low bits of the generator state ``s0``) it
    forms the flat bit index of the spin to update.  The flip is accepted
    with the Metropolis probability computed from the energy change.

    :param spins: spin configuration; stored as np.uint8 bytes
    :param shape_shifts: shape of lattice, as power of 2
    :param temperature: unitless temperature
    :param field: unitless applied field
    :param coupling_indices: forwarded to calc_single_interaction_energy
    :param coupling_constants: forwarded to calc_single_interaction_energy
    :param block_shifts: shape of subsubdivisions, as power of 2
    :param offsets: offsets for subdivisions
    :param rng_states: numba.cuda.random rng states
    :return: None; ``spins`` is updated in place
    """
    thread_index = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    remaining = thread_index
    spin_index = 0
    for a in range(shape_shifts.size - 1, -1, -1):
        block_bits = block_shifts[a]
        # Number of bits of the thread index consumed by this dimension.
        coarse_bits = shape_shifts[a] - block_bits - 1

        spin_index <<= shape_shifts[a]
        spin_index += (remaining & ((1 << coarse_bits) - 1)) << (block_bits + 1)
        spin_index += np.int64(offsets[a]) << block_bits
        # Random position inside the block, taken from the raw state and
        # followed by a state advance so the bits are not reused.
        spin_index += rng_states[thread_index]["s0"] & ((1 << block_bits) - 1)
        ncrand.xoroshiro128p_next(rng_states, thread_index)

        remaining >>= coarse_bits

    byte = spin_index >> 3
    bit = spin_index & 7

    delta_E = -2 * calc_single_interaction_energy(
        spin_index, spins, shape_shifts, coupling_indices, coupling_constants)
    this_spin = (spins[byte] >> bit) & 1
    delta_E -= 2 * field * this_spin

    draw = ncrand.xoroshiro128p_uniform_float64(rng_states, thread_index)
    if draw < math.exp(-delta_E / temperature):
        spins[byte] ^= 1 << bit
def fill_uniformly_cuda(positions, triangles, max, rng_states):
    """Cuda kernel function for calculating spin positions inside the
    triangular mesh.

    Each thread repeatedly draws a candidate point in the box
    ``[0, max[0]) x [0, max[1]) x [0, max[2])`` and a random unit direction,
    counts the triangles hit by the ray from the point along that direction,
    and keeps the first candidate with an odd number of hits.  The accepted
    point is written to column ``thread_id`` of ``positions``.

    ``triangles`` is a flat array with 9 values per triangle (three vertices
    of three coordinates each).
    """
    thread_id = cuda.grid(1)
    if thread_id >= positions.shape[1]:
        return

    origin = cuda.local.array(3, numba.double)
    direction = cuda.local.array(3, numba.double)

    while True:
        # Candidate point, then candidate ray direction.
        for axis in range(3):
            origin[axis] = xoroshiro128p_uniform_float64(rng_states, thread_id) * max[axis]
        for axis in range(3):
            direction[axis] = xoroshiro128p_uniform_float64(rng_states, thread_id) - .5

        length = math.sqrt(direction[0]**2 + direction[1]**2 + direction[2]**2)
        for axis in range(3):
            direction[axis] = direction[axis] / length

        crossings = 0
        for start in range(0, len(triangles), 9):
            A = triangles[start:start + 3]
            B = triangles[start + 3:start + 6]
            C = triangles[start + 6:start + 9]
            t = simulation.triangle_intersection_check(A, B, C, origin, direction)
            if t > 0:
                crossings = crossings + 1

        # Odd number of crossings: the point is accepted as inside.
        if crossings % 2 != 0:
            break

    for axis in range(3):
        positions[axis, thread_id] = origin[axis]
def mcmc_bench(X, Y, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation.

    Each thread owns one data point ``(X[idx], Y[idx])`` and each block owns
    one chain over ``(alpha, beta0, beta1, sigma)``.  Per iteration every
    thread evaluates the log-likelihood of its data point, the block sums the
    values in shared memory, the log-prior is added and a Metropolis
    accept/reject step is taken.  Thread 0 of each block writes the current
    sample to ``output[i, blockIdx.x, 0:4]``.

    The shared buffer has 2**9 slots and the reduction halves the stride
    starting from ``blockDim.x // 2``, so the block size is expected to be a
    power of two no larger than 512.

    NOTE(review): proposals and the acceptance draw use the per-thread state
    ``rng_states[idx]``, so threads of one block do not share the same
    proposal - confirm this is acceptable for the benchmark.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x   # Thread ID
    ty = cuda.blockIdx.x    # Block ID
    bw = cuda.blockDim.x    # Block Size
    idx = bw*ty+tx          # Global ID

    alpha, beta0, beta1, sigma = 0, 0, 0, 1
    x = X[idx]  # Fetch the data point
    y = Y[idx]

    mu = alpha + beta0*x[0] + beta1*x[1]
    # Log-likelihood of the data point
    logp_xy = -((y-mu)**2)/(2*(sigma**2)) - math.log(sigma)
    shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
    cuda.syncthreads()

    # Reduction using sequential addressing.
    # NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1

    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  # NOTE: Might cause some performance issues
    # Every thread must have read shared[0] before any thread overwrites the
    # buffer with the next iteration's values.
    cuda.syncthreads()

    # Add the log-prior
    log_prior = - ((alpha**2)/(2*(10**2)) + (beta0**2)/(2*(10**2)) +
                   (beta1**2)/(2*(10**2)) + (sigma**2)/2)
    logp += log_prior

    # Main MCMC Loop
    for i in range(n_iter):
        # Propose a new theta
        alpha_ = alpha + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta0_ = beta0 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta1_ = beta1 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        sigma_ = sigma + 0.1*xoroshiro128p_normal_float64(rng_states, idx)

        mu = alpha_ + beta0_*x[0] + beta1_*x[1]
        logp_xy = -((y-mu)**2)/(2*(sigma_**2)) - math.log(sigma_)
        shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
        cuda.syncthreads()

        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1

        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/
        logp_ = shared[0]
        # Same hazard as above: keep shared[0] intact until all threads of
        # the block have read it.
        cuda.syncthreads()

        # Add the log-prior
        log_prior = - ((alpha_**2)/(2*(10**2)) + (beta0_**2)/(2*(10**2)) +
                       (beta1_**2)/(2*(10**2)) + (sigma_**2)/2)
        logp_ += log_prior

        # Acceptance ratio
        gamma = math.exp(min(0,logp_-logp))
        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)
        # Accept/Reject?
        if u < gamma:
            alpha = alpha_
            beta0 = beta0_
            beta1 = beta1_
            sigma = sigma_
            logp = logp_

        # Write the sample to the memory
        if tx == 0:
            output[i,ty,0] = alpha
            output[i,ty,1] = beta0
            output[i,ty,2] = beta1
            output[i,ty,3] = sigma
def update(ST, rng_states, _obj_i):
    """Set ``ST['spike']`` to the outcome of a random threshold test.

    A uniform number is drawn with the generator state ``_obj_i`` and compared
    with ``freqs * dt`` (both resolved from the enclosing scope); the result
    of ``draw < freqs * dt`` is stored under the ``'spike'`` key.
    """
    draw = random.xoroshiro128p_uniform_float64(rng_states, _obj_i)
    fired = draw < freqs * dt
    ST['spike'] = fired
def mcmc(data, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation.

    Each thread owns one data point ``data[idx]`` and each block owns one
    chain over the two-component ``theta``.  Per iteration every thread
    evaluates the log-likelihood of its data point, the block sums the values
    in shared memory, half of the log-prior is added and a Metropolis
    accept/reject step is taken.  Thread 0 of each block writes the current
    sample to ``output[i, blockIdx.x]``.

    The shared buffer has 2**9 slots and the reduction halves the stride
    starting from ``blockDim.x // 2``, so the block size is expected to be a
    power of two no larger than 512.

    NOTE(review): proposals and the acceptance draw use the per-thread state
    ``rng_states[idx]``, so threads of one block do not share the same
    proposal - confirm this is intended.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x   # Thread ID
    ty = cuda.blockIdx.x    # Block ID
    bw = cuda.blockDim.x    # Block Size
    idx = bw*ty+tx          # Global ID

    theta = (0.,0.)  # Initialize theta
    x = data[idx]    # Fetch the data point

    # Log-likelihood of the data point
    logp_x = -(((theta[0]-x[0])**2)/(2*0.1) + ((theta[1]-x[1])**2)/(2*0.1))
    shared[tx] = logp_x  # Put the log-likelihood to the shared memory
    cuda.syncthreads()

    # Reduction using sequential addressing.
    # NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1

    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  # NOTE: Might cause some performance issues
    # Every thread must have read shared[0] before any thread overwrites the
    # buffer with the next iteration's values.
    cuda.syncthreads()

    # Add the log-prior
    log_prior = -(((theta[0]-1)**2)/2 + ((theta[1]-1)**2)/2)
    logp += log_prior/2

    # Main MCMC Loop
    for i in range(n_iter):
        # Propose a new theta
        theta_ = (theta[0] + 0.1*xoroshiro128p_normal_float64(rng_states, idx),
                  theta[1] + 0.1*xoroshiro128p_normal_float64(rng_states, idx))

        # Log-likelihood of the data point
        logp_x = -(((theta_[0]-x[0])**2)/(2*0.1) + ((theta_[1]-x[1])**2)/(2*0.1))
        shared[tx] = logp_x  # Put the log-likelihood to the shared memory
        cuda.syncthreads()

        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1

        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/
        logp_ = shared[0]
        # Same hazard as above: keep shared[0] intact until all threads of
        # the block have read it.
        cuda.syncthreads()

        # Add the log-prior
        log_prior = -(((theta_[0]-1)**2)/2 + ((theta_[1]-1)**2)/2)
        logp_ += log_prior/2

        # Acceptance ratio
        alpha = math.exp(min(0,logp_-logp))
        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)
        # Accept/Reject?
        if u < alpha:
            theta = theta_
            logp = logp_

        # Write the sample to the memory
        if tx == 0:
            output[i,ty] = theta
def _cuda_fill_mesh(
    points,
    rng_states,
    intra,
    vertices,
    faces,
    voxel_size,
    triangle_indices,
    subvoxel_indices,
    xs,
    ys,
    zs,
    n_sv,
):
    """Kernel function for efficiently sampling points from a uniform
    distribution inside or outside the surface defined by the triangular
    mesh.

    Each thread handles one row of ``points`` and only acts when the row's
    first coordinate is still ``math.inf``.  It draws one candidate point in
    the voxel, casts a ray along +x, counts the distinct triangles the ray
    hits (looking only at the subvoxels the ray overlaps) and stores the
    candidate when the parity of the count matches ``intra`` (odd: inside,
    even: outside).  A rejected candidate leaves the row untouched.

    At most 1000 distinct intersected triangles are tracked; when that limit
    is reached the thread returns without storing a point.  Increase the
    limit for very complex meshes.
    """
    thread_id = cuda.grid(1)
    if thread_id >= points.shape[0]:
        return
    if points[thread_id, 0] != math.inf:
        return

    candidate = cuda.local.array(3, numba.float64)
    for axis in range(3):
        candidate[axis] = xoroshiro128p_uniform_float64(rng_states, thread_id) * voxel_size[axis]

    # Fixed ray direction: +x.
    ray = cuda.local.array(3, numba.float64)
    ray[0] = 1.0
    ray[1] = 0.0
    ray[2] = 0.0

    # Range of subvoxels the ray intersects, per axis.
    lls = cuda.local.array(3, numba.int64)
    uls = cuda.local.array(3, numba.int64)
    lls[0] = _ll_subvoxel_overlap(xs, candidate[0], candidate[0] + ray[0])
    lls[1] = _ll_subvoxel_overlap(ys, candidate[1], candidate[1] + ray[1])
    lls[2] = _ll_subvoxel_overlap(zs, candidate[2], candidate[2] + ray[2])
    uls[0] = _ul_subvoxel_overlap(xs, candidate[0], candidate[0] + ray[0])
    uls[1] = _ul_subvoxel_overlap(ys, candidate[1], candidate[1] + ray[1])
    uls[2] = _ul_subvoxel_overlap(zs, candidate[2], candidate[2] + ray[2])

    # Distinct triangles hit so far (a triangle can be listed in several
    # subvoxels, hence the duplicate check below).
    n_hits = 0
    triangle = cuda.local.array((3, 3), numba.float64)
    hit_triangles = cuda.local.array(1000, numba.int64)

    for x in range(lls[0], uls[0]):
        for y in range(lls[1], uls[1]):
            for z in range(lls[2], uls[2]):
                sv = int(x * n_sv[1] * n_sv[2] + y * n_sv[2] + z)
                for i in range(subvoxel_indices[sv, 0], subvoxel_indices[sv, 1]):
                    if n_hits >= 1000:
                        return
                    _cuda_get_triangle(triangle_indices[i], vertices, faces, triangle)
                    d = _cuda_ray_triangle_intersection_check(triangle, candidate, ray)
                    if d > 0:
                        seen = False
                        for k in range(n_hits):
                            if hit_triangles[k] == triangle_indices[i]:
                                seen = True
                                break
                        if not seen:
                            hit_triangles[n_hits] = triangle_indices[i]
                            n_hits += 1

    odd = n_hits % 2 == 1
    # Odd parity means inside the surface, even parity means outside.
    accept = odd if intra else not odd
    if accept:
        for axis in range(3):
            points[thread_id, axis] = candidate[axis]
    return
def rand(rng_states):
    """Return one uniform float64 drawn with the calling thread's state.

    The state is selected by the thread's position in a 1-D grid.
    """
    thread_id = cuda.grid(1)
    return xoroshiro128p_uniform_float64(rng_states, thread_id)
def rhs_psi(ps, ph, U, ps_new, ph_new, U_new, zz, dpsi, intR, lT_tilde, t_cur, rng_states):
    """Advance psi (and phi = tanh(psi / sqrt2)) by one step on interior nodes.

    One thread of a 2-D grid handles one node ``(i, j)``; boundary nodes
    (first/last row or column) are left untouched.  For an interior node the
    kernel

    1. builds the four edge fluxes of the anisotropic diffusion term from
       cell-centre averages of ``ps``/``ph`` and the helpers ``atheta`` /
       ``aptheta``;
    2. adds the extra term ``-sqrt2 * atheta**2 * phi * |grad psi|^2``;
    3. adds the transformed double-well term, using ``U`` and the thermal
       term ``Up = (zz[j] - intR) / lT_tilde``;
    4. divides by the relaxation factor ``tau_psi`` and stores the result in
       ``dpsi``;
    5. writes ``ps_new`` (explicit step plus a uniform noise term) and
       ``ph_new``.

    :param ps: psi field, 2-D (read)
    :param ph: phi field, 2-D (read)
    :param U: 2-D field entering the double-well term (read)
    :param ps_new: receives the updated psi
    :param ph_new: receives the updated phi
    :param U_new: not used by this kernel
    :param zz: 1-D array indexed by ``j``
    :param dpsi: receives d(psi)/dt
    :param intR: offset subtracted from ``zz[j]``
    :param lT_tilde: scale dividing ``zz[j] - intR``
    :param t_cur: not used by this kernel
    :param rng_states: xoroshiro128p states, indexed by ``j * m + i``

    ``sqrt2``, ``hi``, ``lamd``, ``k``, ``dt``, ``dt_sqrt``, ``dxdz_in_sqrt``,
    ``eta``, ``atheta`` and ``aptheta`` are resolved from the enclosing scope.
    """
    # ps = psi, ph = phi
    i, j = cuda.grid(2)
    m, n = ps.shape

    # thread on interior points
    if 0 < i < m - 1 and 0 < j < n - 1:

        # =============================================================
        #
        # 1. ANISOTROPIC DIFFUSION
        #
        # =============================================================

        # these ps's are defined on cell centers
        # (each is the average of the four nodes around one corner of (i, j))
        psipjp = (ps[i + 1, j + 1] + ps[i + 0, j + 1] + ps[i + 0, j + 0] + ps[i + 1, j + 0]) * 0.25
        psipjm = (ps[i + 1, j + 0] + ps[i + 0, j + 0] + ps[i + 0, j - 1] + ps[i + 1, j - 1]) * 0.25
        psimjp = (ps[i + 0, j + 1] + ps[i - 1, j + 1] + ps[i - 1, j + 0] + ps[i + 0, j + 0]) * 0.25
        psimjm = (ps[i + 0, j + 0] + ps[i - 1, j + 0] + ps[i - 1, j - 1] + ps[i + 0, j - 1]) * 0.25

        phipjp = (ph[i + 1, j + 1] + ph[i + 0, j + 1] + ph[i + 0, j + 0] + ph[i + 1, j + 0]) * 0.25
        phipjm = (ph[i + 1, j + 0] + ph[i + 0, j + 0] + ph[i + 0, j - 1] + ph[i + 1, j - 1]) * 0.25
        phimjp = (ph[i + 0, j + 1] + ph[i - 1, j + 1] + ph[i - 1, j + 0] + ph[i + 0, j + 0]) * 0.25
        phimjm = (ph[i + 0, j + 0] + ph[i - 1, j + 0] + ph[i - 1, j - 1] + ph[i + 0, j - 1]) * 0.25

        # ============================
        # right edge flux
        # ============================
        psx = ps[i + 1, j + 0] - ps[i + 0, j + 0]
        psz = psipjp - psipjm
        phx = ph[i + 1, j + 0] - ph[i + 0, j + 0]
        phz = phipjp - phipjm

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JR = A * (A * psx - Ap * psz)

        # ============================
        # left edge flux
        # ============================
        psx = ps[i + 0, j + 0] - ps[i - 1, j + 0]
        psz = psimjp - psimjm
        phx = ph[i + 0, j + 0] - ph[i - 1, j + 0]
        phz = phimjp - phimjm

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JL = A * (A * psx - Ap * psz)

        # ============================
        # top edge flux
        # ============================
        psx = psipjp - psimjp
        psz = ps[i + 0, j + 1] - ps[i + 0, j + 0]
        phx = phipjp - phimjp
        phz = ph[i + 0, j + 1] - ph[i + 0, j + 0]

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JT = A * (A * psz + Ap * psx)

        # ============================
        # bottom edge flux
        # ============================
        psx = psipjm - psimjm
        psz = ps[i + 0, j + 0] - ps[i + 0, j - 1]
        phx = phipjm - phimjm
        phz = ph[i + 0, j + 0] - ph[i + 0, j - 1]

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JB = A * (A * psz + Ap * psx)

        # =============================================================
        #
        # 2. EXTRA TERM: sqrt2 * atheta**2 * phi * |grad psi|^2
        #
        # =============================================================

        # d(phi)/dx  d(psi)/dx d(phi)/dz  d(psi)/dz at nodes (i,j)
        # (central differences)
        phxn = (ph[i + 1, j + 0] - ph[i - 1, j + 0]) * 0.5
        phzn = (ph[i + 0, j + 1] - ph[i + 0, j - 1]) * 0.5
        psxn = (ps[i + 1, j + 0] - ps[i - 1, j + 0]) * 0.5
        pszn = (ps[i + 0, j + 1] - ps[i + 0, j - 1]) * 0.5

        A2 = atheta(phxn, phzn)**2
        gradps2 = (psxn)**2 + (pszn)**2
        extra = -sqrt2 * A2 * ph[i, j] * gradps2

        # =============================================================
        #
        # 3. double well (transformed): sqrt2 * phi + nonlinear terms
        #
        # =============================================================

        # Earlier variants, kept for reference:
        # print(lT_tilde)
        # Up = (zz[i,j] - R_tilde * (nt*dt) )/lT_tilde
        # Up = (zz[i,j]-z0 - R_tilde * (nt*dt) )/lT_tilde
        Up = (zz[j] - intR) / lT_tilde

        # The fluxes are scaled by hi**2 before the local terms are added.
        rhs_psi = ((JR-JL) + (JT-JB) + extra) * hi**2 + \
            sqrt2*ph[i,j] - lamd*(1-ph[i,j]**2)*sqrt2*(U[i,j] + Up)

        # =============================================================
        #
        # 4. dpsi/dt term
        #
        # =============================================================
        tp = (1 - (1 - k) * Up)
        # tp is floored at k before scaling by A2.
        tau_psi = tp * A2 if tp >= k else k * A2

        dpsi[i, j] = rhs_psi / tau_psi  # + eta*(random()-0.5)/dt_sr

        #x = xoroshiro128p_uniform_float64(rng_states, thread_id)
        # One generator state per node, addressed by a flat index.
        threadID = j * m + i
        beta_ij = xoroshiro128p_uniform_float64(
            rng_states, threadID) - 0.5  # rand from  [-0.5, 0.5]

        # update psi and phi
        ps_new[i, j] = ps[i, j] + dt * dpsi[i, j] + (dt_sqrt * dxdz_in_sqrt * eta * beta_ij)
        ph_new[i, j] = math.tanh(ps_new[i, j] / sqrt2)
def scatter_photons_electrons_cuda(
        N_batch, batch_size, photon_old_Ntot, elec_Ntot,
        cumul_nscatter_per_batch, nscatter_per_elec, random_states,
        photon_p, photon_px, photon_py, photon_pz,
        photon_x, photon_y, photon_z, photon_inv_gamma,
        photon_ux, photon_uy, photon_uz, photon_w,
        elec_x, elec_y, elec_z, elec_inv_gamma,
        elec_ux, elec_uy, elec_uz, elec_w,
        inv_ratio_w_elec_photon):
    """
    Given the number of photons that are emitted by each electron
    macroparticle, determine the properties (momentum, energy) of
    each scattered photon and fill the arrays `photon_*` accordingly.

    Also, apply a recoil on the electrons.

    One thread handles one batch of electrons.  New photons of batch
    `i_batch` are written contiguously, starting at index
    `photon_old_Ntot + cumul_nscatter_per_batch[i_batch]`.  All random
    numbers of a batch are drawn from `random_states[i_batch]`.

    `photon_p`, `photon_px`, `photon_py`, `photon_pz` are used as scalars
    (the momentum of the incoming photons), whereas the other `photon_*`
    arguments are per-photon output arrays.  `lorentz_transform` and
    `INV_MC` are resolved from the enclosing scope.
    """
    # Loop over batches of particles
    i_batch = cuda.grid(1)
    if i_batch < N_batch:

        # Photon index: this is incremented each time
        # a scattered photon is identified
        i_photon = photon_old_Ntot + cumul_nscatter_per_batch[i_batch]

        # Loop through the electrons in this batch
        # (the last batch may hold fewer than `batch_size` electrons)
        N_max = min((i_batch + 1) * batch_size, elec_Ntot)
        for i_elec in range(i_batch * batch_size, N_max):

            # Prepare calculation of scattered photons from this electron
            if nscatter_per_elec[i_elec] > 0:

                # Prepare Lorentz transformation to the electron rest frame
                elec_gamma = 1. / elec_inv_gamma[i_elec]
                elec_u = math.sqrt(elec_ux[i_elec]**2 + elec_uy[i_elec]**2 + elec_uz[i_elec]**2)
                elec_beta = elec_u * elec_inv_gamma[i_elec]
                if elec_u != 0:
                    elec_inv_u = 1. / elec_u
                    elec_nx = elec_inv_u * elec_ux[i_elec]
                    elec_ny = elec_inv_u * elec_uy[i_elec]
                    elec_nz = elec_inv_u * elec_uz[i_elec]
                else:
                    # Avoid division by 0; provide arbitrary direction
                    # for the Lorentz transform (since beta=0 anyway)
                    elec_nx = 0.
                    elec_ny = 0.
                    elec_nz = 1.

                # Transform momentum of photon to the electron rest frame
                photon_rest_p, photon_rest_px, \
                    photon_rest_py, photon_rest_pz = lorentz_transform(
                        photon_p, photon_px, photon_py, photon_pz,
                        elec_gamma, elec_beta, elec_nx, elec_ny, elec_nz
                    )

                # Find cos and sin of the spherical angle that represent
                # the direction of the incoming photon in the rest frame
                cos_theta = photon_rest_pz / photon_rest_p
                if cos_theta**2 < 1:
                    sin_theta = math.sqrt(1 - cos_theta**2)
                    inv_photon_rest_pxy = 1. / (sin_theta * photon_rest_p)
                    cos_phi = photon_rest_px * inv_photon_rest_pxy
                    sin_phi = photon_rest_py * inv_photon_rest_pxy
                else:
                    sin_theta = 0
                    # Avoid division by 0; provide arbitrary direction
                    # for the phi angle (since theta is 0 or pi anyway)
                    cos_phi = 1.
                    sin_phi = 0.

            # Loop through the number of scatterings for this electron
            # (runs zero times when the electron emits nothing, so the
            # quantities prepared above are only read when they were set)
            for i_scat in range(nscatter_per_elec[i_elec]):

                # Draw scattering angle in the rest frame, from the
                # Klein-Nishina cross-section (See Ozmutl, E. N.
                # "Sampling of Angular Distribution in Compton Scattering"
                # Appl. Radiat. Isot. 43, 6, pp. 713-715 (1992))
                k = photon_rest_p * INV_MC
                c0 = 2. * (2. * k**2 + 2. * k + 1.) / (2. * k + 1.)**3
                b = (2. + c0) / (2. - c0)
                a = 2. * b - 1.
                # Use rejection method to draw x
                reject = True
                while reject:
                    # - Draw x with an approximate probability distribution
                    r1 = xoroshiro128p_uniform_float64(random_states, i_batch)
                    x = b - (b + 1.) * (0.5 * c0)**r1
                    # - Calculate approximate probability distribution h
                    h = a / (b - x)
                    # - Calculate expected (exact) probability distribution f
                    factor = 1 + k * (1 - x)
                    f = ((1 + x**2) * factor + k**2 * (1 - x)**2) / factor**3
                    # - Keep x according to rejection rule
                    r2 = xoroshiro128p_uniform_float64(random_states, i_batch)
                    if r2 < f / h:
                        reject = False

                # Get scattered momentum in the rest frame
                new_photon_rest_p = photon_rest_p / (1 + k * (1 - x))
                # - First in a system of axes aligned with the incoming photon
                #   (x is the cosine of the scattering angle; the azimuth is
                #   drawn uniformly in [0, 2*pi))
                cos_theta_s = x
                sin_theta_s = math.sqrt(1 - x**2)
                r3 = xoroshiro128p_uniform_float64(random_states, i_batch)
                phi_s = 2 * math.pi * r3
                cos_phi_s = math.cos(phi_s)
                sin_phi_s = math.sin(phi_s)
                new_photon_rest_pX = new_photon_rest_p * sin_theta_s * cos_phi_s
                new_photon_rest_pY = new_photon_rest_p * sin_theta_s * sin_phi_s
                new_photon_rest_pZ = new_photon_rest_p * cos_theta_s
                # - Then rotate it to the original system of axes
                new_photon_rest_px = sin_theta * cos_phi * new_photon_rest_pZ \
                    + cos_theta * cos_phi * new_photon_rest_pX \
                    - sin_phi * new_photon_rest_pY
                new_photon_rest_py = sin_theta * sin_phi * new_photon_rest_pZ \
                    + cos_theta * sin_phi * new_photon_rest_pX \
                    + cos_phi * new_photon_rest_pY
                new_photon_rest_pz = cos_theta * new_photon_rest_pZ \
                    - sin_theta * new_photon_rest_pX

                # Transform momentum of photon back to the simulation frame
                # (i.e. Lorentz transform with opposite direction)
                new_photon_p, new_photon_px, new_photon_py, new_photon_pz = \
                    lorentz_transform(
                        new_photon_rest_p, new_photon_rest_px,
                        new_photon_rest_py, new_photon_rest_pz,
                        elec_gamma, elec_beta, -elec_nx, -elec_ny, -elec_nz)

                # Create the new photon by copying the electron position
                photon_x[i_photon] = elec_x[i_elec]
                photon_y[i_photon] = elec_y[i_elec]
                photon_z[i_photon] = elec_z[i_elec]
                photon_w[i_photon] = elec_w[i_elec] * inv_ratio_w_elec_photon
                # The photon's ux, uy, uz corresponds to the actual px, py, pz
                photon_ux[i_photon] = new_photon_px
                photon_uy[i_photon] = new_photon_py
                photon_uz[i_photon] = new_photon_pz
                # The photon's inv_gamma corresponds to 1./p (consistent
                # with the code for the particle pusher and for the
                # openPMD back-transformed diagnostics)
                photon_inv_gamma[i_photon] = 1. / new_photon_p

                # Update the photon index
                i_photon += 1

            # Add recoil to electrons
            # Note: In order to reproduce the right distribution of electron
            # momentum, the electrons should recoil with the momentum
            # of *one single* photon, with a probability p (calculated by
            # get_scattering_probability). Here we reuse the momentum of
            # the last photon generated above. This requires that at least one
            # photon be created for this electron, which occurs with a
            # probability p*ratio_w_elec_photon. Thus, given that at least one
            # photon has been created, we should add recoil to the corresponding
            # electron only with a probability inv_ratio_w_elec_photon.
            if nscatter_per_elec[i_elec] > 0:
                r = xoroshiro128p_uniform_float64(random_states, i_batch)
                if r < inv_ratio_w_elec_photon:
                    elec_ux[i_elec] += INV_MC * (photon_px - new_photon_px)
                    elec_uy[i_elec] += INV_MC * (photon_py - new_photon_py)
                    elec_uz[i_elec] += INV_MC * (photon_pz - new_photon_pz)