Example #1
def repulsion(first_permutation: List[int], second_permutation: List[int],
              random_states):
    dimension = len(first_permutation)
    thread_id = cuda.threadIdx.x
    distance = hamming_distance(first_permutation, second_permutation)
    new_distance = 0
    iterations = 0

    while distance >= new_distance:
        iterations += 1
        first_index = int(
            xoroshiro128p_uniform_float64(random_states, thread_id) *
            dimension)
        second_index = int(
            xoroshiro128p_uniform_float64(random_states, thread_id) *
            dimension)

        tmp = second_permutation[first_index]
        second_permutation[first_index] = second_permutation[second_index]
        second_permutation[second_index] = tmp

        new_distance = hamming_distance(first_permutation, second_permutation)

        if iterations == dimension:
            break
def rejection_resample(state_array, state_buffer, weight, rng,
                       weight_max_array):
    '''
    General resampling kernel for a particle filter (rejection resampling).
    Reference: "Parallel Resampling in the Particle Filter",
    Journal of Computational and Graphical Statistics.
    :param state_array: particle states, shape (state_dim, n_particles)
    :param state_buffer: buffer array with the same shape as state_array
    :param weight: particle weights, one per particle
    :param rng: random generator states (numba.cuda.random xoroshiro128p states)
    :param weight_max_array: buffer of the same size as weight; element 0 holds the running maximum weight
    :return:
    '''
    pos = cuda.grid(1)
    tid = cuda.threadIdx.x

    sdata = cuda.shared.array(shape=(1024), dtype=float64)
    if pos < state_array.shape[1]:
        if pos == 0:
            weight_max_array[0] = 0.0
        # state_buffer[:, pos] = state_array[:, pos]
        for i in range(state_buffer.shape[0]):
            state_buffer[i, pos] = state_array[i, pos]

        sdata[tid] = weight[pos]
        cuda.syncthreads()  # all weights must be written to shared memory before the reduction
        # block-wise max reduction (sequential addressing)
        s = cuda.blockDim.x >> 1
        while s > 0:
            if tid < s:
                sdata[tid] = max(sdata[tid], sdata[tid + s])
            s = s >> 1
            cuda.syncthreads()
        if tid == 0:
            cuda.atomic.max(weight_max_array, 0, sdata[0])

        j = pos
        u = xoroshiro128p_uniform_float64(rng, pos)
        counter = 0
        while u > weight[j] / weight_max_array[0] and counter < 1000:
            counter = counter + 1
            # draw a candidate index in [0, n_particles); truncation (not ceil)
            # keeps the index inside the array bounds
            j = int(
                xoroshiro128p_uniform_float64(rng, pos) *
                state_array.shape[1])
            u = xoroshiro128p_uniform_float64(rng, pos)
        # state_array[:, pos] = state_buffer[:, j]
        for i in range(state_buffer.shape[0]):
            state_array[i, pos] = state_buffer[i, j]
        weight[pos] = 1.0 / float64(state_array.shape[1])
        cuda.syncthreads()
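
For context, a minimal host-side sketch of how a kernel like rejection_resample might be launched, assuming it is compiled with @cuda.jit (decorators are not shown in these snippets). The sizes, seed, and data below are illustrative assumptions; the grid is sized so that every thread maps to a particle, which the in-kernel shared-memory reduction requires.

import numpy as np
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states

# Hypothetical sizes: 4 state dimensions, 4096 particles.
state_dim, n_particles = 4, 4096
threads_per_block = 256
blocks = n_particles // threads_per_block  # exact fit: every thread owns a particle

state_array = cuda.to_device(np.random.randn(state_dim, n_particles))
state_buffer = cuda.device_array_like(state_array)
weight = cuda.to_device(np.random.rand(n_particles))
weight_max_array = cuda.to_device(np.zeros(1))

# One xoroshiro128p state per thread.
rng = create_xoroshiro128p_states(blocks * threads_per_block, seed=1)

rejection_resample[blocks, threads_per_block](
    state_array, state_buffer, weight, rng, weight_max_array)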
Example #3
def cuda_step_stick(positions, g_x, g_y, g_z, phases, rng_states, time_point, n_of_spins, gamma, step_length, dt, orientation):
    """Kernel function for 1D diffusion"""
    
    # Global thread index on a 1D grid
    thread_id = cuda.grid(1)
    if thread_id >= n_of_spins:
        return

    # Allocate local memory
    step = cuda.local.array(3, numba.double)

    # Generate random step
    if xoroshiro128p_uniform_float64(rng_states, thread_id) > .5:
        step[0] = orientation[0] * step_length
        step[1] = orientation[1] * step_length
        step[2] = orientation[2] * step_length
    else:
        step[0] = -orientation[0] * step_length
        step[1] = -orientation[1] * step_length
        step[2] = -orientation[2] * step_length

    # Update positions
    positions[0, thread_id] = positions[0, thread_id] + step[0]
    positions[1, thread_id] = positions[1, thread_id] + step[1]
    positions[2, thread_id] = positions[2, thread_id] + step[2]
    
    # Calculate phase shift
    for measurement in range(g_x.shape[1]):
        phases[measurement, thread_id] += gamma * dt * \
                                          (g_x[time_point, measurement] * positions[0, thread_id] + \
                                           g_y[time_point, measurement] * positions[1, thread_id] + \
                                           g_z[time_point, measurement] * positions[2, thread_id])
Example #4
def cuda_step_plane(positions, g_x, g_y, g_z, phases, rng_states, time_point, n_of_spins, gamma, step_length, dt, directions):
    """Kernel function for 2D diffusion"""
    
    # Global thread index on a 1D grid
    thread_id = cuda.grid(1)
    if thread_id >= n_of_spins:
        return

    # Allocate local memory
    step = cuda.local.array(3, numba.double)

    # Generate random step
    phi = xoroshiro128p_uniform_float64(rng_states, thread_id) * 6.283185307179586
    step[0] = math.cos(phi) * directions[0] + math.sin(phi) * directions[3]
    step[1] = math.cos(phi) * directions[1] + math.sin(phi) * directions[4]
    step[2] = math.cos(phi) * directions[2] + math.sin(phi) * directions[5]
    step[0] = step_length * step[0]
    step[1] = step_length * step[1]
    step[2] = step_length * step[2]

    # Update positions
    positions[0, thread_id] = positions[0, thread_id] + step[0]
    positions[1, thread_id] = positions[1, thread_id] + step[1]
    positions[2, thread_id] = positions[2, thread_id] + step[2]
    
    # Calculate phase shift
    for measurement in range(g_x.shape[1]):
        phases[measurement, thread_id] += gamma * dt * \
                                          (g_x[time_point, measurement] * positions[0, thread_id] + \
                                           g_y[time_point, measurement] * positions[1, thread_id] + \
                                           g_z[time_point, measurement] * positions[2, thread_id])
Example #5
    def integration_kernel(MCresult, domain, parameters, domain_range, total_size, batch_size, 
                           i_batch, rng_states, num_points, parameter_shape,parameter_off_set):

        thread_id = cuda.grid(1)
        
        if thread_id < batch_size:
            parameter_id = thread_id + i_batch * batch_size

            if parameter_id < total_size:

                # local array to save current parameter grid value
                aa = cuda.local.array(shape=num_parameters, dtype=nb.int32)
                for i in range(num_parameters):
                    aa[i] = 0
                unravel(num_parameters,parameter_shape,parameter_id,aa)
                
                # offset each dimension's index so aa indexes into the flattened parameters array
                for i in range(num_parameters-1):
                    aa[i+1] = aa[i+1]+parameter_off_set[i]

                # replace the indices in aa with the corresponding parameter values
                for i in range(num_parameters):
                    aa[i] = parameters[aa[i]]

                for i_sample in range(num_points):
                    
                    x_tuple = cuda.local.array(shape=dim, dtype=nb.float64)
                    
                    for j_dim in range(dim):
                        x_tuple[j_dim] = xoroshiro128p_uniform_float64(rng_states, thread_id) * domain_range[j_dim] + domain[j_dim][0]

                    # feed in values to user defined function, 
                    # and add all points' corresponding results in one chunk
                    cuda.atomic.add(MCresult, thread_id, fun(x_tuple, aa))
    def integration_kernel(MCresult, num_points_in_one_chunk, num_chunks_in_one_dimension,
                           domain, domain_range, batch_size, i_batch, rng_states, num_chunks):

        thread_id = cuda.grid(1)
        if thread_id < batch_size:
            chunk_id = thread_id + i_batch * batch_size

            if chunk_id < num_chunks:

                # local digits index for each thread
                digit_store = cuda.local.array(shape=dim, dtype=nb.int64)
                for i_temp in range(dim):
                    digit_store[i_temp] = 0

                # convert one_dim index to n_dim index
                # result will be stored in digit_store
                oneD_to_nD(num_chunks_in_one_dimension, chunk_id, digit_store)

                # specify the local domain
                domain_left = cuda.local.array(shape=dim, dtype=nb.float64)
                for j_dim in range(dim):
                    domain_left[j_dim] = domain[j_dim][0] + digit_store[j_dim] * domain_range[j_dim]

                for i_sample in range(num_points_in_one_chunk):
                    # x_tuple: local axis values for each thread
                    x_tuple = cuda.local.array(shape=dim, dtype=nb.float64)

                    for j_dim in range(dim):
                        x_tuple[j_dim] = xoroshiro128p_uniform_float64(rng_states, thread_id) * domain_range[j_dim] + domain_left[j_dim]

                    # feed in values to user defined function, 
                    # and add all points' corresponding results in one chunk
                    cuda.atomic.add(MCresult, thread_id, fun(x_tuple))
Example #7
        def integration_kernel(num_loops, MCresult, chunk_size, n_chunk_x,
                               domain, domain_range, batch_size, i_batch,
                               rng_states, n_chunk):

            thread_id = cuda.grid(1)
            if thread_id < batch_size:
                chunk_id = thread_id + i_batch * batch_size

                if chunk_id < n_chunk:

                    # digit_store: local digits index for each thread
                    digit_store = cuda.local.array(shape=dim, dtype=nb.int64)
                    for i_temp in range(dim):
                        digit_store[i_temp] = 0

                    # convert one_d index to dim_d index
                    # result will be stored in digit_store
                    oneD_to_nD(n_chunk_x, chunk_id, digit_store)

                    # specify the local domain
                    domain_left = cuda.local.array(dim, dtype=nb.float64)
                    for j_dim in range(dim):
                        domain_left[j_dim] = domain[j_dim][0] + digit_store[j_dim] * domain_range[j_dim]

                    for i_sample in range(chunk_size):
                        # x_tuple: local axis values for each thread
                        x_tuple = cuda.local.array(dim, dtype=nb.float64)

                        for j_dim in range(dim):
                            x_tuple[j_dim] = xoroshiro128p_uniform_float64(rng_states, thread_id) * domain_range[j_dim] + domain_left[j_dim]

                        # feed in values to user defined function
                        cuda.atomic.add(MCresult, thread_id, fun(x_tuple))
Example #8
def em_discrete(previous_permutations: List[List[int]], next_permutations: List[List[int]], qap_values: List[int],
                weights: List[List[int]], distances: List[List[int]], max_hamming_distance: int, random_states,
                pmx_buffer):
    thread_id = cuda.threadIdx.x

    if thread_id < len(previous_permutations):
        dimension = len(previous_permutations[thread_id])
        qap_values[thread_id] = qap_device(previous_permutations[thread_id], weights, distances)

        cuda.syncthreads()

        best_value_index = 0

        for i in range(len(previous_permutations)):
            if qap_values[best_value_index] > qap_values[i]:
                best_value_index = i

        cuda.syncthreads()

        # copy current permutation to next permutation
        for i in range(dimension):
            next_permutations[thread_id][i] = previous_permutations[thread_id][i]

        if thread_id == best_value_index:
            return

        # search surroundings
        for i in range(len(previous_permutations)):
            if i == thread_id:
                continue

            # compare this thread's permutation with particle i, not with itself
            if hamming_distance(previous_permutations[thread_id], previous_permutations[i]) < max_hamming_distance:
                if qap_values[thread_id] > qap_values[i]:
                    # draw two random cut points in [0, dimension) for the PMX crossover
                    first_bound = int(xoroshiro128p_uniform_float64(random_states, thread_id) * dimension)
                    second_bound = int(xoroshiro128p_uniform_float64(random_states, thread_id) * dimension)
                    lower_bound = min(first_bound, second_bound)
                    upper_bound = max(first_bound, second_bound)

                    pmx(previous_permutations[i], next_permutations[thread_id], lower_bound, upper_bound,
                        pmx_buffer[thread_id])

                    for j in range(dimension):
                        next_permutations[thread_id][j] = pmx_buffer[thread_id][j]

                else:
                    repulsion(previous_permutations[i], next_permutations[thread_id], random_states)
Example #9
def rng_kernel_float64(states, out, count, distribution):
    thread_id = cuda.grid(1)

    for i in range(count):
        if distribution == UNIFORM:
            out[thread_id * count + i] = xoroshiro128p_uniform_float64(states, thread_id)
        elif distribution == NORMAL:
            out[thread_id * count + i] = xoroshiro128p_normal_float64(states, thread_id)
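
This kernel is essentially a demonstration of the numba.cuda.random API. Below is a sketch of the host-side setup it implies, restating the kernel with the @cuda.jit decorator and UNIFORM/NORMAL flags; those flag values and all sizes are assumptions, since they are not shown above.

import numpy as np
from numba import cuda
from numba.cuda.random import (create_xoroshiro128p_states,
                               xoroshiro128p_uniform_float64,
                               xoroshiro128p_normal_float64)

UNIFORM, NORMAL = 0, 1  # assumed flag values

@cuda.jit
def rng_kernel_float64(states, out, count, distribution):
    thread_id = cuda.grid(1)
    for i in range(count):
        if distribution == UNIFORM:
            out[thread_id * count + i] = xoroshiro128p_uniform_float64(states, thread_id)
        elif distribution == NORMAL:
            out[thread_id * count + i] = xoroshiro128p_normal_float64(states, thread_id)

threads_per_block, blocks, count = 64, 32, 16
n_threads = threads_per_block * blocks
states = create_xoroshiro128p_states(n_threads, seed=42)       # one state per thread
out = cuda.device_array(n_threads * count, dtype=np.float64)
rng_kernel_float64[blocks, threads_per_block](states, out, count, UNIFORM)
samples = out.copy_to_host()  # n_threads * count uniform samples in [0, 1)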
Example #10
def random_flip(spins, shape_shifts, rng_states):
    tindex = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    index = 0
    num_dim = shape_shifts.size
    for a in range(num_dim - 1, -1, -1):
        index <<= shape_shifts[a]
        index += np.int64(
            ncrand.xoroshiro128p_uniform_float64(rng_states, tindex) *
            (1 << shape_shifts[a]))
    spins[index >> 3] ^= 1 << (index & 7)
Example #11
def determine_scatterings_cuda(N_batch, batch_size, elec_Ntot,
                               nscatter_per_elec, nscatter_per_batch,
                               random_states, dt, elec_ux, elec_uy, elec_uz,
                               elec_inv_gamma, ratio_w_electron_photon,
                               photon_n, photon_p, photon_beta_x,
                               photon_beta_y, photon_beta_z):
    """
    For each electron macroparticle, decide how many photon macroparticles
    it will emit during `dt`, using the integrated Klein-Nishina formula.

    Electrons are processed in batches of size `batch_size`, with a parallel
    loop over batches. The batching allows quicker calculation of the
    total number of photons to be created.
    """
    # Loop over batches of particles
    i_batch = cuda.grid(1)
    if i_batch < N_batch:

        # Set the count of scattered particles in the batch to 0
        nscatter_per_batch[i_batch] = 0

        # Loop through the batch
        # (Note: a while loop is used here, because numba 0.34 does
        # not support nested prange and range loops)
        N_max = min((i_batch + 1) * batch_size, elec_Ntot)
        ip = i_batch * batch_size
        while ip < N_max:

            # Set the count of scattered photons for this electron to 0
            nscatter_per_elec[ip] = 0

            # For each electron, calculate the probability of scattering
            p = get_scattering_probability(dt, elec_ux[ip], elec_uy[ip],
                                           elec_uz[ip], elec_inv_gamma[ip],
                                           photon_n[ip], photon_p,
                                           photon_beta_x, photon_beta_y,
                                           photon_beta_z)

            # Determine the number of photons produced by this electron
            r = xoroshiro128p_uniform_float64(random_states, i_batch)
            nscatter = int(p * ratio_w_electron_photon + r)
            # Note: if p is 0, the above formula will return nscatter=0
            # since r is in [0, 1). Similarly, if p is very small,
            # nscatter will be 1 with probability p * ratio_w_electron_photon,
            # and 0 otherwise.
            nscatter_per_elec[ip] = nscatter
            nscatter_per_batch[i_batch] += nscatter

            # Increment ip
            ip = ip + 1
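
The batching described in the docstring pays off on the host: summing nscatter_per_batch gives the total number of photon macroparticles to allocate, and an exclusive prefix sum gives each batch its starting write index (the cumul_nscatter_per_batch array used by the scattering kernel in a later example). The following is only a hedged sketch of one way that host-side step could look; the function name and example input are made up.

import numpy as np

def accumulate_scatterings(nscatter_per_batch_host):
    """Exclusive prefix sum over the per-batch photon counts."""
    cumul = np.concatenate(([0], np.cumsum(nscatter_per_batch_host)))
    return cumul, int(cumul[-1])

# e.g. with counts copied back from the GPU via nscatter_per_batch.copy_to_host():
cumul_nscatter_per_batch, n_new_photons = accumulate_scatterings([2, 0, 3, 1])
# cumul_nscatter_per_batch -> [0, 2, 2, 5, 6]; batch i writes its photons
# starting at index photon_old_Ntot + cumul_nscatter_per_batch[i]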
Example #12
def weight_mutation(rng_states, out, mutation_rate):
    #find the position of element to operate on
    x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    if x >= out.shape[0] or y >= out.shape[1]:
        # Quit if (x, y) is outside the output array
        return
    #generate the random number.
    rand = xoroshiro128p_uniform_float64(rng_states, x * out.shape[1] + y)
    if (rand < mutation_rate):

        out[x][y] = out[x][y] + rand - 0.5
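
The kernel indexes the RNG states with x * out.shape[1] + y, so one state per matrix element is needed. A minimal launch sketch under that assumption, with illustrative sizes and assuming the kernel above is compiled with @cuda.jit:

import numpy as np
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states

rows, cols = 64, 32
out = cuda.to_device(np.random.randn(rows, cols))
# RNG index is x * cols + y, so allocate one state per element.
rng_states = create_xoroshiro128p_states(rows * cols, seed=11)

threads = (16, 16)
blocks = ((rows + threads[0] - 1) // threads[0],
          (cols + threads[1] - 1) // threads[1])
weight_mutation[blocks, threads](rng_states, out, 0.05)
mutated = out.copy_to_host()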
Example #13
def biase_crossover(rng_states, out, mutation_rate, mother):
    #find the position of element to operate on
    x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= out.shape[0] or y >= out.shape[1]:
        return
    #generate random number
    rand = xoroshiro128p_uniform_float64(rng_states, x * out.shape[1] + y)
    #temp[x]=rand
    #condition for crossover
    if (rand < mutation_rate):

        out[x][y] = mother[x][y]
Example #14
    def integration_kernel(MCresult, domain, domain_range, rng_states,
                           num_points):

        thread_id = cuda.grid(1)

        if thread_id < num_points:
            # local array to save random numbers
            x_tuple = cuda.local.array(shape=dim, dtype=nb.float64)

            for j_dim in range(dim):
                x_tuple[j_dim] = xoroshiro128p_uniform_float64(
                    rng_states,
                    thread_id) * domain_range[j_dim] + domain[j_dim][0]

            # accumulate the sampled results on global memory
            cuda.atomic.add(MCresult, 0, fun(x_tuple))
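
This kernel only accumulates the sum of fun over all samples into MCresult[0]; the integral estimate itself is a host-side step (sample mean times domain volume). Below is a minimal end-to-end sketch that restates the kernel with assumed values for the closure variables dim and fun, which are defined outside the snippet above; all names and sizes in the host code are illustrative.

import numpy as np
import numba as nb
from numba import cuda
from numba.cuda.random import (create_xoroshiro128p_states,
                               xoroshiro128p_uniform_float64)

dim = 2                # assumed dimensionality
num_points = 1 << 20   # assumed number of samples

@cuda.jit(device=True)
def fun(x_tuple):
    # example integrand: f(x, y) = x * y over the unit square
    return x_tuple[0] * x_tuple[1]

@cuda.jit
def integration_kernel(MCresult, domain, domain_range, rng_states, num_points):
    thread_id = cuda.grid(1)
    if thread_id < num_points:
        x_tuple = cuda.local.array(shape=dim, dtype=nb.float64)
        for j_dim in range(dim):
            x_tuple[j_dim] = xoroshiro128p_uniform_float64(
                rng_states, thread_id) * domain_range[j_dim] + domain[j_dim, 0]
        cuda.atomic.add(MCresult, 0, fun(x_tuple))

domain = np.array([[0.0, 1.0], [0.0, 1.0]])
domain_range = domain[:, 1] - domain[:, 0]
MCresult = cuda.to_device(np.zeros(1))
rng_states = create_xoroshiro128p_states(num_points, seed=0)
threads = 256
blocks = (num_points + threads - 1) // threads
integration_kernel[blocks, threads](MCresult, cuda.to_device(domain),
                                    cuda.to_device(domain_range),
                                    rng_states, num_points)
volume = float(np.prod(domain_range))
estimate = MCresult.copy_to_host()[0] / num_points * volume  # close to 0.25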
Example #15
def metropolis_step(spins, shape_shifts, temperature, field, coupling_indices,
                    coupling_constants, block_shifts, offsets, rng_states):
    """
    Multi-spin Metropolis algorithm

    :param spins: spin configuration; stored as np.uint8 bytes
    :param shape_shifts: shape of lattice, as power of 2
    :param temperature: unitless temperature
    :param field: unitless applied field
    :param coupling_indices:
    :param coupling_constants:
    :param block_shifts: shape of subdivisions, as power of 2
    :param offsets: offsets for subdivisions
    :param rng_states: numba.cuda.random rng states
    :return:
    """
    thread_index = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    temp_index = thread_index
    num_dim = shape_shifts.size
    shift = 0
    spin_index = 0
    for a in range(num_dim - 1, -1, -1):
        spin_index <<= shape_shifts[a]
        spin_index += ((temp_index &
                        ((1 <<
                          (shape_shifts[a] - block_shifts[a] - 1)) - 1)) <<
                       (block_shifts[a] + 1))
        spin_index += np.int64(offsets[a]) << block_shifts[a]
        spin_index += rng_states[thread_index]["s0"] & (
            (1 << block_shifts[a]) - 1)
        ncrand.xoroshiro128p_next(rng_states, thread_index)
        temp_index >>= shape_shifts[a] - block_shifts[a] - 1

    delta_E = -2 * calc_single_interaction_energy(
        spin_index, spins, shape_shifts, coupling_indices, coupling_constants)
    this_spin = (spins[spin_index >> 3] >> (spin_index & 7)) & 1
    delta_E -= 2 * field * this_spin
    if (ncrand.xoroshiro128p_uniform_float64(rng_states, thread_index) <
            math.exp(-delta_E / temperature)):
        spins[spin_index >> 3] ^= 1 << (spin_index & 7)
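
The spin lattice here is bit-packed: spin i lives in bit (i & 7) of byte (i >> 3), which is what the final XOR line flips. Below is a small host-side sketch of the same packing convention, useful when inspecting the spins array; the helper names are hypothetical and only illustrate the bit arithmetic used in the kernel.

import numpy as np

def get_spin(spins, index):
    """Read spin `index` (0 or 1) from the bit-packed uint8 array."""
    return (spins[index >> 3] >> (index & 7)) & 1

def flip_spin(spins, index):
    """Flip spin `index` in place, mirroring the kernel's XOR update."""
    spins[index >> 3] ^= np.uint8(1 << (index & 7))

spins = np.zeros(4, dtype=np.uint8)   # 32 spins packed into 4 bytes
flip_spin(spins, 10)
assert get_spin(spins, 10) == 1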
Example #16
def fill_uniformly_cuda(positions, triangles, max, rng_states):
    """Cuda kernel function for calculating spin positions inside the 
        triangular mesh."""
    thread_id = cuda.grid(1)
    if thread_id >= positions.shape[1]:
        return
    inside = False
    while not inside:
        intersections = 0
        r0 = cuda.local.array(3, numba.double)
        unit_step = cuda.local.array(3, numba.double)
        r0[0] = xoroshiro128p_uniform_float64(rng_states, thread_id) * max[0]
        r0[1] = xoroshiro128p_uniform_float64(rng_states, thread_id) * max[1]
        r0[2] = xoroshiro128p_uniform_float64(rng_states, thread_id) * max[2]
        unit_step[0] = xoroshiro128p_uniform_float64(rng_states,
                                                     thread_id) - .5
        unit_step[1] = xoroshiro128p_uniform_float64(rng_states,
                                                     thread_id) - .5
        unit_step[2] = xoroshiro128p_uniform_float64(rng_states,
                                                     thread_id) - .5
        normalizing_factor = math.sqrt(unit_step[0]**2 + unit_step[1]**2 +
                                       unit_step[2]**2)
        unit_step[0] = unit_step[0] / normalizing_factor
        unit_step[1] = unit_step[1] / normalizing_factor
        unit_step[2] = unit_step[2] / normalizing_factor
        for triangle_idx in range(0, len(triangles), 9):
            A = triangles[triangle_idx:triangle_idx + 3]
            B = triangles[triangle_idx + 3:triangle_idx + 6]
            C = triangles[triangle_idx + 6:triangle_idx + 9]
            t = simulation.triangle_intersection_check(A, B, C, r0, unit_step)
            if t > 0:
                intersections = intersections + 1
        if intersections % 2 != 0:
            inside = True
    positions[0, thread_id] = r0[0]
    positions[1, thread_id] = r0[1]
    positions[2, thread_id] = r0[2]
Example #17
def mcmc_bench(X, Y, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x  # Thread ID
    ty = cuda.blockIdx.x  # Block ID
    bw = cuda.blockDim.x  # Block Size
    idx = bw*ty+tx  # Global ID
    
    alpha, beta0, beta1, sigma = 0, 0, 0, 1
    x = X[idx]  # Fetch the data point
    y = Y[idx]
    mu = alpha + beta0*x[0] + beta1*x[1]
    logp_xy = -((y-mu)**2)/(2*(sigma**2)) - math.log(sigma)  # Log-likelihood of the data point
    shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
    cuda.syncthreads()
    
    # Reduction using sequential addressing. NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1
    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  #  NOTE: Might cause some performance issues
    
    # Add the log-prior
    log_prior = - ((alpha**2)/(2*(10**2)) + (beta0**2)/(2*(10**2)) + (beta1**2)/(2*(10**2)) + (sigma**2)/2)
    logp += log_prior
    
    # Main MCMC Loop
    for i in range(n_iter):
        # Propose a new theta
        alpha_ = alpha + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta0_ = beta0 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta1_ = beta1 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        sigma_ = sigma + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        
        mu = alpha_ + beta0_*x[0] + beta1_*x[1]
        logp_xy = -((y-mu)**2)/(2*(sigma_**2)) - math.log(sigma_)
        cuda.syncthreads()  # Ensure all threads have read shared[0] before it is overwritten
        shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
        cuda.syncthreads()
        
        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1
        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/   
        logp_ = shared[0]
        
        # Add the log-prior
        log_prior = - ((alpha_**2)/(2*(10**2)) + (beta0_**2)/(2*(10**2)) + (beta1_**2)/(2*(10**2)) + (sigma_**2)/2)
        logp_ += log_prior
        
        # Acceptance ratio
        gamma = math.exp(min(0,logp_-logp))
        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)
        # Accept/Reject?
        if u < gamma:
            alpha = alpha_
            beta0 = beta0_
            beta1 = beta1_
            sigma = sigma_
            logp = logp_
        
        # Write the sample to the memory
        if tx == 0:
            output[i,ty,0] = alpha
            output[i,ty,1] = beta0
            output[i,ty,2] = beta1
            output[i,ty,3] = sigma
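
A hedged host-side sketch for launching mcmc_bench, assuming the kernel above is compiled with @cuda.jit (decorators are not shown in these snippets). The block size has to respect what the kernel assumes: a power of two no larger than 2**9 (the shared-array size) and one data point per thread. The data and sizes below are illustrative.

import numpy as np
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states

threads_per_block = 256            # power of two, <= 2**9 (shared array size)
blocks = 8                         # one independent chain per block
n_points = threads_per_block * blocks
n_iter = 1000

# Synthetic regression data: y = 1 + 2*x0 - 1*x1 + noise, one point per thread.
X_host = np.random.randn(n_points, 2)
Y_host = 1.0 + 2.0 * X_host[:, 0] - 1.0 * X_host[:, 1] + 0.5 * np.random.randn(n_points)

X = cuda.to_device(X_host)
Y = cuda.to_device(Y_host)
output = cuda.device_array((n_iter, blocks, 4))   # alpha, beta0, beta1, sigma per chain
rng_states = create_xoroshiro128p_states(n_points, seed=7)

mcmc_bench[blocks, threads_per_block](X, Y, output, rng_states, n_iter)
samples = output.copy_to_host()                   # shape (n_iter, n_chains, 4)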
Example #18
def update(ST, rng_states, _obj_i):
    ST['spike'] = random.xoroshiro128p_uniform_float64(rng_states, _obj_i) < freqs * dt
Example #19
def mcmc(data, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x  # Thread ID
    ty = cuda.blockIdx.x  # Block ID
    bw = cuda.blockDim.x  # Block Size
    idx = bw*ty+tx  # Global ID
    
    theta = (0.,0.)  # Initialize theta
    x = data[idx]  # Fetch the data point
    logp_x = -(((theta[0]-x[0])**2)/(2*0.1) + ((theta[1]-x[1])**2)/(2*0.1))  # Log-likelihood of the data point
    shared[tx] = logp_x  # Put the log-likelihood to the shared memory
    cuda.syncthreads()
    
    # Reduction using sequential addressing. NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1
    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  #  NOTE: Might cause some performance issues
    
    # Add the log-prior
    log_prior = -(((theta[0]-1)**2)/2 + ((theta[1]-1)**2)/2) 
    logp += log_prior/2
    
    # Main MCMC Loop
    for i in range(n_iter):
        # Propose a new theta
        theta_ = (theta[0] + 0.1*xoroshiro128p_normal_float64(rng_states, idx), theta[1] + 0.1*xoroshiro128p_normal_float64(rng_states, idx))
        logp_x = -(((theta_[0]-x[0])**2)/(2*0.1) + ((theta_[1]-x[1])**2)/(2*0.1))  # Log-likelihood of the data point
        cuda.syncthreads()  # Ensure all threads have read shared[0] before it is overwritten
        shared[tx] = logp_x  # Put the log-likelihood to the shared memory
        cuda.syncthreads()
        
        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1
        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/   
        logp_ = shared[0]
        
        # Add the log-prior
        log_prior = -(((theta_[0]-1)**2)/2 + ((theta_[1]-1)**2)/2) 
        logp_ += log_prior/2
        
        # Acceptance ratio
        alpha = math.exp(min(0,logp_-logp))
        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)
        # Accept/Reject?
        if u < alpha:
            theta = theta_
            logp = logp_
        
        # Write the sample to the memory
        if tx == 0:
            output[i,ty] = theta
Example #20
def _cuda_fill_mesh(
    points,
    rng_states,
    intra,
    vertices,
    faces,
    voxel_size,
    triangle_indices,
    subvoxel_indices,
    xs,
    ys,
    zs,
    n_sv,
):
    """Kernel function for efficiently sampling points from a uniform
    distribution inside or outside the surface defined by the triangular
    mesh."""

    thread_id = cuda.grid(1)
    if thread_id >= points.shape[0] or points[thread_id, 0] != math.inf:
        return

    point = cuda.local.array(3, numba.float64)
    for i in range(3):
        point[i] = xoroshiro128p_uniform_float64(rng_states,
                                                 thread_id) * voxel_size[i]
    ray = cuda.local.array(3, numba.float64)
    ray[0] = 1.0
    ray[1] = 0.0
    ray[2] = 0.0

    # Find the subvoxels the ray intersects
    lls = cuda.local.array(3, numba.int64)
    uls = cuda.local.array(3, numba.int64)
    lls[0] = _ll_subvoxel_overlap(xs, point[0], point[0] + ray[0])
    lls[1] = _ll_subvoxel_overlap(ys, point[1], point[1] + ray[1])
    lls[2] = _ll_subvoxel_overlap(zs, point[2], point[2] + ray[2])
    uls[0] = _ul_subvoxel_overlap(xs, point[0], point[0] + ray[0])
    uls[1] = _ul_subvoxel_overlap(ys, point[1], point[1] + ray[1])
    uls[2] = _ul_subvoxel_overlap(zs, point[2], point[2] + ray[2])

    # Keep track of the number of intersections and the triangles. The max
    # number of intersections allowed is 1000. Increase this number for very
    # complex meshes.
    n_intersections = 0
    triangle = cuda.local.array((3, 3), numba.float64)
    triangles = cuda.local.array(1000, numba.int64)

    # Loop over the subvoxels
    for x in range(lls[0], uls[0]):
        for y in range(lls[1], uls[1]):
            for z in range(lls[2], uls[2]):
                sv = int(x * n_sv[1] * n_sv[2] + y * n_sv[2] + z)

                # Loop over the triangles
                for i in range(subvoxel_indices[sv, 0], subvoxel_indices[sv,
                                                                         1]):

                    if n_intersections >= 1000:
                        return

                    _cuda_get_triangle(triangle_indices[i], vertices, faces,
                                       triangle)
                    d = _cuda_ray_triangle_intersection_check(
                        triangle, point, ray)

                    if d > 0:
                        already_intersected = False
                        for j in triangles[0:n_intersections]:
                            if j == triangle_indices[i]:
                                already_intersected = True
                                break
                        if not already_intersected:
                            triangles[n_intersections] = triangle_indices[i]
                            n_intersections += 1

    if intra:
        if n_intersections % 2 == 1:  # Point is inside the surface
            for i in range(3):
                points[thread_id, i] = point[i]
    else:
        if n_intersections % 2 == 0:  # Point is outside the surface
            for i in range(3):
                points[thread_id, i] = point[i]
    return
Example #21
def rand(rng_states):
    return xoroshiro128p_uniform_float64(rng_states, cuda.grid(1))
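
This helper only makes sense as a device function. A minimal sketch of how it could be used, assuming it is declared with @cuda.jit(device=True) (decorators are not shown in these snippets); the calling kernel and sizes are hypothetical.

import numpy as np
from numba import cuda
from numba.cuda.random import (create_xoroshiro128p_states,
                               xoroshiro128p_uniform_float64)

@cuda.jit(device=True)
def rand(rng_states):
    # one RNG state per thread, indexed by the absolute thread id
    return xoroshiro128p_uniform_float64(rng_states, cuda.grid(1))

@cuda.jit
def fill_random(out, rng_states):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = rand(rng_states)

threads, blocks = 128, 64
out = cuda.device_array(threads * blocks)
rng_states = create_xoroshiro128p_states(threads * blocks, seed=3)
fill_random[blocks, threads](out, rng_states)
values = out.copy_to_host()   # uniform samples in [0, 1)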
Example #22
def rhs_psi(ps, ph, U, ps_new, ph_new, U_new, zz, dpsi, intR, lT_tilde, t_cur,
            rng_states):
    # ps = psi, ph = phi

    i, j = cuda.grid(2)
    m, n = ps.shape

    # thread on interior points
    if 0 < i < m - 1 and 0 < j < n - 1:

        # =============================================================
        #
        # 1. ANISOTROPIC DIFFUSION
        #
        # =============================================================

        # these ps's are defined on cell centers
        psipjp = (ps[i + 1, j + 1] + ps[i + 0, j + 1] + ps[i + 0, j + 0] +
                  ps[i + 1, j + 0]) * 0.25
        psipjm = (ps[i + 1, j + 0] + ps[i + 0, j + 0] + ps[i + 0, j - 1] +
                  ps[i + 1, j - 1]) * 0.25
        psimjp = (ps[i + 0, j + 1] + ps[i - 1, j + 1] + ps[i - 1, j + 0] +
                  ps[i + 0, j + 0]) * 0.25
        psimjm = (ps[i + 0, j + 0] + ps[i - 1, j + 0] + ps[i - 1, j - 1] +
                  ps[i + 0, j - 1]) * 0.25

        phipjp = (ph[i + 1, j + 1] + ph[i + 0, j + 1] + ph[i + 0, j + 0] +
                  ph[i + 1, j + 0]) * 0.25
        phipjm = (ph[i + 1, j + 0] + ph[i + 0, j + 0] + ph[i + 0, j - 1] +
                  ph[i + 1, j - 1]) * 0.25
        phimjp = (ph[i + 0, j + 1] + ph[i - 1, j + 1] + ph[i - 1, j + 0] +
                  ph[i + 0, j + 0]) * 0.25
        phimjm = (ph[i + 0, j + 0] + ph[i - 1, j + 0] + ph[i - 1, j - 1] +
                  ph[i + 0, j - 1]) * 0.25

        # ============================
        # right edge flux
        # ============================
        psx = ps[i + 1, j + 0] - ps[i + 0, j + 0]
        psz = psipjp - psipjm
        phx = ph[i + 1, j + 0] - ph[i + 0, j + 0]
        phz = phipjp - phipjm

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JR = A * (A * psx - Ap * psz)

        # ============================
        # left edge flux
        # ============================
        psx = ps[i + 0, j + 0] - ps[i - 1, j + 0]
        psz = psimjp - psimjm
        phx = ph[i + 0, j + 0] - ph[i - 1, j + 0]
        phz = phimjp - phimjm

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JL = A * (A * psx - Ap * psz)

        # ============================
        # top edge flux
        # ============================
        psx = psipjp - psimjp
        psz = ps[i + 0, j + 1] - ps[i + 0, j + 0]
        phx = phipjp - phimjp
        phz = ph[i + 0, j + 1] - ph[i + 0, j + 0]

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JT = A * (A * psz + Ap * psx)

        # ============================
        # bottom edge flux
        # ============================
        psx = psipjm - psimjm
        psz = ps[i + 0, j + 0] - ps[i + 0, j - 1]
        phx = phipjm - phimjm
        phz = ph[i + 0, j + 0] - ph[i + 0, j - 1]

        A = atheta(phx, phz)
        Ap = aptheta(phx, phz)
        JB = A * (A * psz + Ap * psx)

        # =============================================================
        #
        # 2. EXTRA TERM: sqrt2 * atheta**2 * phi * |grad psi|^2
        #
        # =============================================================

        # d(phi)/dx  d(psi)/dx d(phi)/dz  d(psi)/dz at nodes (i,j)
        phxn = (ph[i + 1, j + 0] - ph[i - 1, j + 0]) * 0.5
        phzn = (ph[i + 0, j + 1] - ph[i + 0, j - 1]) * 0.5
        psxn = (ps[i + 1, j + 0] - ps[i - 1, j + 0]) * 0.5
        pszn = (ps[i + 0, j + 1] - ps[i + 0, j - 1]) * 0.5

        A2 = atheta(phxn, phzn)**2
        gradps2 = (psxn)**2 + (pszn)**2
        extra = -sqrt2 * A2 * ph[i, j] * gradps2

        # =============================================================
        #
        # 3. double well (transformed): sqrt2 * phi + nonlinear terms
        #
        # =============================================================

        # print(lT_tilde)
        # Up = (zz[i,j] - R_tilde * (nt*dt) )/lT_tilde
        # Up = (zz[i,j]-z0 - R_tilde * (nt*dt) )/lT_tilde
        Up = (zz[j] - intR) / lT_tilde

        rhs_psi = ((JR-JL) + (JT-JB) + extra) * hi**2 + \
                   sqrt2*ph[i,j] - lamd*(1-ph[i,j]**2)*sqrt2*(U[i,j] + Up)

        # =============================================================
        #
        # 4. dpsi/dt term
        #
        # =============================================================
        tp = (1 - (1 - k) * Up)
        tau_psi = tp * A2 if tp >= k else k * A2

        dpsi[i, j] = rhs_psi / tau_psi  # + eta*(random()-0.5)/dt_sr

        #x = xoroshiro128p_uniform_float64(rng_states, thread_id)
        threadID = j * m + i
        beta_ij = xoroshiro128p_uniform_float64(
            rng_states, threadID) - 0.5  # rand from  [-0.5, 0.5]

        # update psi and phi
        ps_new[i, j] = ps[i, j] + dt * dpsi[i, j] + (dt_sqrt * dxdz_in_sqrt *
                                                     eta * beta_ij)
        ph_new[i, j] = math.tanh(ps_new[i, j] / sqrt2)
Example #23
def scatter_photons_electrons_cuda(
        N_batch, batch_size, photon_old_Ntot, elec_Ntot,
        cumul_nscatter_per_batch, nscatter_per_elec, random_states, photon_p,
        photon_px, photon_py, photon_pz, photon_x, photon_y, photon_z,
        photon_inv_gamma, photon_ux, photon_uy, photon_uz, photon_w, elec_x,
        elec_y, elec_z, elec_inv_gamma, elec_ux, elec_uy, elec_uz, elec_w,
        inv_ratio_w_elec_photon):
    """
    Given the number of photons that are emitted by each electron
    macroparticle, determine the properties (momentum, energy) of
    each scattered photon and fill the arrays `photon_*` accordingly.

    Also, apply a recoil on the electrons.
    """
    # Loop over batches of particles
    i_batch = cuda.grid(1)
    if i_batch < N_batch:

        # Photon index: this is incremented each time
        # a scattered photon is identified
        i_photon = photon_old_Ntot + cumul_nscatter_per_batch[i_batch]

        # Loop through the electrons in this batch
        N_max = min((i_batch + 1) * batch_size, elec_Ntot)
        for i_elec in range(i_batch * batch_size, N_max):

            # Prepare calculation of scattered photons from this electron
            if nscatter_per_elec[i_elec] > 0:

                # Prepare Lorentz transformation to the electron rest frame
                elec_gamma = 1. / elec_inv_gamma[i_elec]
                elec_u = math.sqrt(elec_ux[i_elec]**2 + elec_uy[i_elec]**2 +
                                   elec_uz[i_elec]**2)
                elec_beta = elec_u * elec_inv_gamma[i_elec]
                if elec_u != 0:
                    elec_inv_u = 1. / elec_u
                    elec_nx = elec_inv_u * elec_ux[i_elec]
                    elec_ny = elec_inv_u * elec_uy[i_elec]
                    elec_nz = elec_inv_u * elec_uz[i_elec]
                else:
                    # Avoid division by 0; provide arbitrary direction
                    # for the Lorentz transform (since beta=0 anyway)
                    elec_nx = 0.
                    elec_ny = 0.
                    elec_nz = 1.

                # Transform momentum of photon to the electron rest frame
                photon_rest_p, photon_rest_px, \
                    photon_rest_py, photon_rest_pz = lorentz_transform(
                            photon_p, photon_px, photon_py, photon_pz,
                            elec_gamma, elec_beta, elec_nx, elec_ny, elec_nz )
                # Find cos and sin of the spherical angle that represent
                # the direction of the incoming photon in the rest frame
                cos_theta = photon_rest_pz / photon_rest_p
                if cos_theta**2 < 1:
                    sin_theta = math.sqrt(1 - cos_theta**2)
                    inv_photon_rest_pxy = 1. / (sin_theta * photon_rest_p)
                    cos_phi = photon_rest_px * inv_photon_rest_pxy
                    sin_phi = photon_rest_py * inv_photon_rest_pxy
                else:
                    sin_theta = 0
                    # Avoid division by 0; provide arbitrary direction
                    # for the phi angle (since theta is 0 or pi anyway)
                    cos_phi = 1.
                    sin_phi = 0.

            # Loop through the number of scatterings for this electron
            for i_scat in range(nscatter_per_elec[i_elec]):

                # Draw scattering angle in the rest frame, from the
                # Klein-Nishina cross-section (See Ozmutl, E. N.
                # "Sampling of Angular Distribution in Compton Scattering"
                # Appl. Radiat. Isot. 43, 6, pp. 713-715 (1992))
                k = photon_rest_p * INV_MC
                c0 = 2. * (2. * k**2 + 2. * k + 1.) / (2. * k + 1.)**3
                b = (2. + c0) / (2. - c0)
                a = 2. * b - 1.
                # Use rejection method to draw x
                reject = True
                while reject:
                    # - Draw x with an approximate probability distribution
                    r1 = xoroshiro128p_uniform_float64(random_states, i_batch)
                    x = b - (b + 1.) * (0.5 * c0)**r1
                    # - Calculate approximate probability distribution h
                    h = a / (b - x)
                    # - Calculate expected (exact) probability distribution f
                    factor = 1 + k * (1 - x)
                    f = ((1 + x**2) * factor + k**2 * (1 - x)**2) / factor**3
                    # - Keep x according to rejection rule
                    r2 = xoroshiro128p_uniform_float64(random_states, i_batch)
                    if r2 < f / h:
                        reject = False

                # Get scattered momentum in the rest frame
                new_photon_rest_p = photon_rest_p / (1 + k * (1 - x))
                # - First in a system of axes aligned with the incoming photon
                cos_theta_s = x
                sin_theta_s = math.sqrt(1 - x**2)
                r3 = xoroshiro128p_uniform_float64(random_states, i_batch)
                phi_s = 2 * math.pi * r3
                cos_phi_s = math.cos(phi_s)
                sin_phi_s = math.sin(phi_s)
                new_photon_rest_pX = new_photon_rest_p * sin_theta_s * cos_phi_s
                new_photon_rest_pY = new_photon_rest_p * sin_theta_s * sin_phi_s
                new_photon_rest_pZ = new_photon_rest_p * cos_theta_s
                # - Then rotate it to the original system of axes
                new_photon_rest_px = sin_theta * cos_phi * new_photon_rest_pZ \
                                   + cos_theta * cos_phi * new_photon_rest_pX \
                                               - sin_phi * new_photon_rest_pY
                new_photon_rest_py = sin_theta * sin_phi * new_photon_rest_pZ \
                                   + cos_theta * sin_phi * new_photon_rest_pX \
                                               + cos_phi * new_photon_rest_pY
                new_photon_rest_pz = cos_theta * new_photon_rest_pZ \
                                   - sin_theta * new_photon_rest_pX

                # Transform momentum of photon back to the simulation frame
                # (i.e. Lorentz transform with opposite direction)
                new_photon_p, new_photon_px, new_photon_py, new_photon_pz = \
                    lorentz_transform(
                        new_photon_rest_p, new_photon_rest_px,
                        new_photon_rest_py, new_photon_rest_pz,
                        elec_gamma, elec_beta, -elec_nx, -elec_ny, -elec_nz)

                # Create the new photon by copying the electron position
                photon_x[i_photon] = elec_x[i_elec]
                photon_y[i_photon] = elec_y[i_elec]
                photon_z[i_photon] = elec_z[i_elec]
                photon_w[i_photon] = elec_w[i_elec] * inv_ratio_w_elec_photon
                # The photon's ux, uy, uz corresponds to the actual px, py, pz
                photon_ux[i_photon] = new_photon_px
                photon_uy[i_photon] = new_photon_py
                photon_uz[i_photon] = new_photon_pz
                # The photon's inv_gamma corresponds to 1./p (consistent
                # with the code for the particle pusher and for the
                # openPMD back-transformed diagnostics)
                photon_inv_gamma[i_photon] = 1. / new_photon_p

                # Update the photon index
                i_photon += 1

            # Add recoil to electrons
            # Note: In order to reproduce the right distribution of electron
            # momentum, the electrons should recoil with the momentum
            # of *one single* photon, with a probability p (calculated by
            # get_scattering_probability). Here we reuse the momentum of
            # the last photon generated above. This requires that at least one
            # photon be created for this electron, which occurs with a
            # probability p*ratio_w_elec_photon. Thus, given that at least one
            # photon has been created, we should add recoil to the corresponding
            # electron only with a probability inv_ratio_w_elec_photon.
            if nscatter_per_elec[i_elec] > 0:
                r = xoroshiro128p_uniform_float64(random_states, i_batch)
                if r < inv_ratio_w_elec_photon:
                    elec_ux[i_elec] += INV_MC * (photon_px - new_photon_px)
                    elec_uy[i_elec] += INV_MC * (photon_py - new_photon_py)
                    elec_uz[i_elec] += INV_MC * (photon_pz - new_photon_pz)
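
For reference, the rejection loop above can be read in isolation. The following is a plain-Python sketch of the same angle sampler (same approximate and exact distributions as the kernel, host-side RNG, hypothetical function name), which may help when checking or plotting the kernel's behaviour.

import math
import random

def sample_compton_cos_theta(k):
    """Draw cos(theta) of Compton scattering for incoming photon momentum
    k = p / (m_e c), using the same rejection scheme as the kernel above."""
    c0 = 2. * (2. * k**2 + 2. * k + 1.) / (2. * k + 1.)**3
    b = (2. + c0) / (2. - c0)
    a = 2. * b - 1.
    while True:
        # Draw x from the approximate distribution
        r1 = random.random()
        x = b - (b + 1.) * (0.5 * c0)**r1
        # Approximate density h and exact (Klein-Nishina) density f
        h = a / (b - x)
        factor = 1. + k * (1. - x)
        f = ((1. + x**2) * factor + k**2 * (1. - x)**2) / factor**3
        # Accept with probability f / h
        if random.random() < f / h:
            return x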