def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, weight=1.0):
    """Run the DAQ simulation over a range of photons on the GPU.

    Accumulates results into this object's per-channel device buffers
    (earliest_time_int_gpu, channel_q_int_gpu, channel_history_gpu).
    When ``self.ndaq > 1`` a strided multi-DAQ kernel is launched
    instead of the single-DAQ kernel.

    Args:
        gpuphotons: photon arrays already resident on the GPU.
        rng_states: CURAND per-thread RNG state array.
        start_photon, nphotons: optional sub-range; default is all
            photons in ``gpuphotons``.
        weight: per-photon weight forwarded to the kernel.
    """
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = len(gpuphotons.pos) - start_photon
    if self.ndaq == 1:
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            # 0x1 << 2 selects photons whose history has that bit set
            # (presumably the "detected" flag bit -- TODO confirm
            # against the photon history-bit definitions).
            self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2),
                                   np.int32(start_photon+first_photon),
                                   np.int32(photons_this_round),
                                   gpuphotons.t,
                                   gpuphotons.flags,
                                   gpuphotons.last_hit_triangles,
                                   gpuphotons.weights,
                                   self.solid_id_map_gpu,
                                   self.detector_gpu,
                                   self.earliest_time_int_gpu,
                                   self.channel_q_int_gpu,
                                   self.channel_history_gpu,
                                   np.float32(weight),
                                   block=(nthreads_per_block,1,1),
                                   grid=(blocks,1))
    else:
        # Multi-DAQ path: chunked one element per grid block (note the
        # chunk size of 1), while each CUDA block still carries
        # nthreads_per_block threads -- presumably the extra threads
        # cover the ndaq copies; confirm against run_daq_many.
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, 1, max_blocks):
            self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2),
                                        np.int32(start_photon+first_photon),
                                        np.int32(photons_this_round),
                                        gpuphotons.t,
                                        gpuphotons.flags,
                                        gpuphotons.last_hit_triangles,
                                        gpuphotons.weights,
                                        self.solid_id_map_gpu,
                                        self.detector_gpu,
                                        self.earliest_time_int_gpu,
                                        self.channel_q_int_gpu,
                                        self.channel_history_gpu,
                                        np.int32(self.ndaq),
                                        np.int32(self.stride),
                                        np.float32(weight),
                                        block=(nthreads_per_block,1,1),
                                        grid=(blocks,1))
    # Block until all queued DAQ kernels have finished.
    cuda.Context.get_current().synchronize()
def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
           start_photon=None, nphotons=None):
    '''Return a new GPUPhoton object containing only photons that
    have a particular bit set in their history word.

    Two-pass GPU compaction: first count the matching photons to size
    the output arrays, then copy them over, using an atomic counter
    (``index_counter_gpu``) to assign output slots.
    '''
    cuda.Context.get_current().synchronize()
    index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
    cuda.Context.get_current().synchronize()
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = self.pos.size - start_photon

    # First count how much space we need
    for first_photon, photons_this_round, blocks in \
            chunk_iterator(nphotons, nthreads_per_block, max_blocks):
        self.gpu_funcs.count_photons(np.int32(start_photon+first_photon),
                                     np.int32(photons_this_round),
                                     np.uint32(target_flag),
                                     index_counter_gpu, self.flags,
                                     block=(nthreads_per_block,1,1),
                                     grid=(blocks, 1))
    cuda.Context.get_current().synchronize()
    reduced_nphotons = int(index_counter_gpu.get()[0])

    # Then allocate new storage space
    pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
    flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
    weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)

    # And finally copy photons, if there are any
    if reduced_nphotons > 0:
        # Reset the counter so the copy kernel reuses it to hand out
        # output indices.
        index_counter_gpu.fill(0)
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.copy_photons(np.int32(start_photon+first_photon),
                                        np.int32(photons_this_round),
                                        np.uint32(target_flag),
                                        index_counter_gpu,
                                        self.pos, self.dir,
                                        self.wavelengths, self.pol,
                                        self.t, self.flags,
                                        self.last_hit_triangles,
                                        self.weights, self.evidx,
                                        pos, dir, wavelengths, pol, t,
                                        flags, last_hit_triangles,
                                        weights, evidx,
                                        block=(nthreads_per_block,1,1),
                                        grid=(blocks, 1))
        # Sanity check: the copy pass must find the same number of
        # matches as the count pass.
        assert index_counter_gpu.get()[0] == reduced_nphotons

    return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                           last_hit_triangles, flags, weights, evidx)
def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
           start_photon=None, nphotons=None):
    '''Return a new GPUPhoton object containing only photons that
    have a particular bit set in their history word.

    Two-pass GPU compaction: count the matching photons to size the
    output arrays, then copy them, using an atomic counter
    (``index_counter_gpu``) to assign output slots.  This variant does
    not carry an event-index (evidx) array.
    '''
    cuda.Context.get_current().synchronize()
    index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
    cuda.Context.get_current().synchronize()
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = self.pos.size - start_photon

    # First count how much space we need
    for first_photon, photons_this_round, blocks in \
            chunk_iterator(nphotons, nthreads_per_block, max_blocks):
        self.gpu_funcs.count_photons(np.int32(start_photon+first_photon),
                                     np.int32(photons_this_round),
                                     np.uint32(target_flag),
                                     index_counter_gpu, self.flags,
                                     block=(nthreads_per_block,1,1),
                                     grid=(blocks, 1))
    cuda.Context.get_current().synchronize()
    reduced_nphotons = int(index_counter_gpu.get()[0])

    # Then allocate new storage space
    pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
    flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
    weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

    # And finally copy photons, if there are any
    if reduced_nphotons > 0:
        # Reset the counter so the copy kernel reuses it to hand out
        # output indices.
        index_counter_gpu.fill(0)
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.copy_photons(np.int32(start_photon+first_photon),
                                        np.int32(photons_this_round),
                                        np.uint32(target_flag),
                                        index_counter_gpu,
                                        self.pos, self.dir,
                                        self.wavelengths, self.pol,
                                        self.t, self.flags,
                                        self.last_hit_triangles,
                                        self.weights,
                                        pos, dir, wavelengths, pol, t,
                                        flags, last_hit_triangles,
                                        weights,
                                        block=(nthreads_per_block,1,1),
                                        grid=(blocks, 1))
        # Sanity check: the copy pass must find the same number of
        # matches as the count pass.
        assert index_counter_gpu.get()[0] == reduced_nphotons

    return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                           last_hit_triangles, flags, weights)
def concatenate_layers(layers):
    """Concatenate per-layer BVH node arrays into one device array.

    Child indices of every non-leaf layer are offset by the start of
    the following layer in the concatenated array; the final (leaf)
    layer needs no offset.

    Args:
        layers: sequence of host ndarrays of uint4 nodes, one per
            BVH layer, ordered root-first (leaves last).

    Returns:
        (nodes, layer_bounds): the concatenated host ndarray of nodes,
        and the cumulative start index of each layer (0 prepended,
        total node count last).
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list.  Materialize the lengths in a list:
    # on Python 3, map() returns an iterator that np.cumsum cannot
    # consume (it would raise instead of summing).
    layer_bounds = np.insert(np.cumsum([len(layer) for layer in layers]),
                             0, 0)
    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:],
                                             layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end - layer_start, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter, 1))

    return nodes.get(), layer_bounds
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using the
    provided arrays to determine the index of the first child of each
    parent, and how many children there are.'''
    module = get_cu_module('bvh.cu', options=cuda_options,
                           include_source_directory=True)
    kernels = GPUFuncs(module)

    # Push the child nodes and the parent layout onto the device.
    nodes_dev = ga.to_gpu(nodes)
    first_child_dev = ga.to_gpu(first_child.astype(np.int32))
    nchild_dev = ga.to_gpu(nchild.astype(np.int32))

    nparent = len(first_child)
    parents_dev = ga.empty(shape=nparent, dtype=ga.vec.uint4)

    # Launch one thread per parent node, in chunks.
    threads = 256
    for offset, count, nblocks in chunk_iterator(nparent, threads,
                                                 max_blocks=10000):
        kernels.make_parents_detailed(np.uint32(offset),
                                      np.uint32(count),
                                      nodes_dev,
                                      parents_dev,
                                      first_child_dev,
                                      nchild_dev,
                                      block=(threads, 1, 1),
                                      grid=(nblocks, 1))

    return parents_dev.get()
def copy_queue(self, queue_gpu, nphotons, nthreads_per_block=64,
               max_blocks=1024, start_photon=0):
    """Copy ``nphotons`` photons selected by an index queue into a new
    GPUPhotonsSlice.

    Args:
        queue_gpu: device array of photon indices to copy (presumably
            filled by an earlier propagation pass -- confirm against
            the copy_photon_queue kernel).
        nphotons: number of entries in the queue to copy.
        start_photon: offset into the queue at which to begin.
    """
    # Allocate new storage space
    pos = ga.empty(shape=nphotons, dtype=ga.vec.float3)
    dir = ga.empty(shape=nphotons, dtype=ga.vec.float3)
    pol = ga.empty(shape=nphotons, dtype=ga.vec.float3)
    wavelengths = ga.empty(shape=nphotons, dtype=np.float32)
    t = ga.empty(shape=nphotons, dtype=np.float32)
    last_hit_triangles = ga.empty(shape=nphotons, dtype=np.int32)
    flags = ga.empty(shape=nphotons, dtype=np.uint32)
    weights = ga.empty(shape=nphotons, dtype=np.float32)
    evidx = ga.empty(shape=nphotons, dtype=np.uint32)

    # And finally copy photons, if there are any
    if nphotons > 0:
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.copy_photon_queue(np.int32(start_photon+first_photon),
                                             np.int32(photons_this_round),
                                             queue_gpu,
                                             self.pos, self.dir,
                                             self.wavelengths, self.pol,
                                             self.t, self.flags,
                                             self.last_hit_triangles,
                                             self.weights, self.evidx,
                                             pos, dir, wavelengths, pol, t,
                                             flags, last_hit_triangles,
                                             weights, evidx,
                                             block=(nthreads_per_block,1,1),
                                             grid=(blocks, 1))
    return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                           last_hit_triangles, flags, weights, evidx)
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using the
    provided arrays to determine the index of the first child of each
    parent, and how many children there are.

    Args:
        nodes: host ndarray of uint4 BVH child nodes.
        first_child: per-parent index of its first child in ``nodes``.
        nchild: per-parent child count.

    Returns a host ndarray of uint4 parent nodes, one per entry in
    ``first_child``.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Upload children and parent layout to the device.
    gpu_nodes = ga.to_gpu(nodes)
    gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
    gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

    nparent = len(first_child)
    gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)

    # One thread per parent node, launched in chunks.
    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                        np.uint32(elements_this_iter),
                                        gpu_nodes,
                                        gpu_parent_nodes,
                                        gpu_first_child,
                                        gpu_nchild,
                                        block=(nthreads_per_block,1,1),
                                        grid=(nblocks_this_iter,1))

    return gpu_parent_nodes.get()
def concatenate_layers(layers):
    """Concatenate per-layer BVH node arrays into one device array.

    Child indices of every non-leaf layer are offset by the start of
    the following layer in the concatenated array; the final (leaf)
    layer needs no offset.

    Args:
        layers: sequence of host ndarrays of uint4 nodes, one per
            BVH layer, ordered root-first (leaves last).

    Returns:
        (nodes, layer_bounds): the concatenated host ndarray of nodes,
        and the cumulative start index of each layer (0 prepended,
        total node count last).
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list.  Materialize the lengths in a list:
    # on Python 3, map() returns an iterator that np.cumsum cannot
    # consume (it would raise instead of summing).
    layer_bounds = np.insert(np.cumsum([len(layer) for layer in layers]),
                             0, 0)
    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:],
                                             layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end - layer_start, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block,1,1),
                                      grid=(nblocks_this_iter,1))

    return nodes.get(), layer_bounds
def __init__(self, photons, ncopies=1):
    """Load ``photons`` onto the GPU, replicating as requested.

       Args:
           - photons: chroma.Event.Photons
               Photon state information to load onto GPU
           - ncopies: int, *optional*
               Number of times to replicate the photons
               on the GPU.  This is used if you want
               to propagate the same event many times,
               for example in a likelihood calculation.

               The amount of GPU storage will be proportionally
               larger if ncopies > 1, so be careful.
    """
    nphotons = len(photons)
    # Allocate space for all replicas up front.
    self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
    self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
    self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
    self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
    self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
    self.last_hit_triangles = ga.empty(shape=nphotons*ncopies,
                                       dtype=np.int32)
    self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
    self.weights = ga.empty(shape=nphotons*ncopies, dtype=np.float32)

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1)
    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
    self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    self.weights[:nphotons].set(photons.weights.astype(np.float32))

    module = get_cu_module('propagate.cu', options=cuda_options)
    self.gpu_funcs = GPUFuncs(module)

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                            np.int32(photons_this_round),
                                            self.pos, self.dir,
                                            self.wavelengths, self.pol,
                                            self.t, self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            np.int32(ncopies-1),
                                            np.int32(nphotons),
                                            block=(nthreads_per_block,1,1),
                                            grid=(blocks, 1))

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def marshall_photons(self, photons, ncopies):
    """Allocate GPU arrays sized for ``ncopies`` replicas of ``photons``
    and upload the photon state into the first slot.

    Assign the provided photons to the beginning (possibly the entire
    array if ncopies is 1).  Remaining replica slots are filled by the
    photon_duplicate kernel.
    """
    nphotons = len(photons)
    self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
    self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
    self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
    self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
    self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
    self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                       dtype=np.int32)
    self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
    self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)

    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    self.last_hit_triangles[:nphotons].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    self.weights[:nphotons].set(photons.weights.astype(np.float32))

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        block = (nthreads_per_block, 1, 1)
        for first_photon, photons_this_round, blocks in chunk_iterator(
                nphotons, nthreads_per_block, max_blocks):
            # BUG FIX: the kernel must launch once per chunk.  The
            # previous code had an empty (`pass`) loop body and issued
            # a single launch after the loop with the final chunk's
            # indices, so only the last chunk of photons was ever
            # duplicated.
            grid = (blocks, 1)
            args = (
                np.int32(first_photon),
                np.int32(photons_this_round),
                self.pos,
                self.dir,
                self.wavelengths,
                self.pol,
                self.t,
                self.flags,
                self.last_hit_triangles,
                self.weights,
                np.int32(ncopies - 1),
                np.int32(nphotons),
            )
            self.gpu_funcs.photon_duplicate(*args, block=block, grid=grid)
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
              max_blocks=1024, max_steps=10, use_weights=False,
              scatter_first=0):
    """Propagate photons on GPU to termination or max_steps, whichever
    comes first.

    May be called repeatedly without reloading photon information if
    single-stepping through photon history.

    ..warning::
        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
        number of curandStates.
    """
    nphotons = self.pos.size
    step = 0
    # Input/output queues hold a counter in slot 0 followed by the
    # indices of still-alive photons; the two queues are ping-ponged
    # between steps.
    input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
    input_queue[0] = 0
    # Order photons initially in the queue to put the clones next to each other
    for copy in xrange(self.ncopies):
        input_queue[1+copy::self.ncopies] = \
            np.arange(self.true_nphotons, dtype=np.uint32) \
            + copy * self.true_nphotons
    input_queue_gpu = ga.to_gpu(input_queue)
    output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
    output_queue[0] = 1
    output_queue_gpu = ga.to_gpu(output_queue)

    while step < max_steps:
        # Just finish the rest of the steps if the # of photons is low
        if nphotons < nthreads_per_block * 16 * 8 or use_weights:
            nsteps = max_steps - step
        else:
            nsteps = 1

        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.propagate(np.int32(first_photon),
                                     np.int32(photons_this_round),
                                     input_queue_gpu[1:],
                                     output_queue_gpu,
                                     rng_states,
                                     self.pos, self.dir,
                                     self.wavelengths, self.pol,
                                     self.t, self.flags,
                                     self.last_hit_triangles,
                                     self.weights,
                                     np.int32(nsteps),
                                     np.int32(use_weights),
                                     np.int32(scatter_first),
                                     gpu_geometry.gpudata,
                                     block=(nthreads_per_block,1,1),
                                     grid=(blocks, 1))

        step += nsteps
        scatter_first = 0 # Only allow non-zero in first pass

        if step < max_steps:
            # Swap queues: last step's output becomes next step's input.
            temp = input_queue_gpu
            input_queue_gpu = output_queue_gpu
            output_queue_gpu = temp
            # Assign with a numpy array of length 1 to silence
            # warning from PyCUDA about setting array with different
            # strides/storage orders.
            output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
            # Slot 0 holds (next free index), i.e. surviving count + 1.
            nphotons = input_queue_gpu[:1].get()[0] - 1

    # Bit 31 is set by the kernel on aborted photons (too many steps).
    if ga.max(self.flags).get() & (1 << 31):
        print >>sys.stderr, "WARNING: ABORTED PHOTONS"
    cuda.Context.get_current().synchronize()
def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
                 max_blocks=1024):
    """Recolor the triangles of the solids flagged in ``solid_hit``.

    Args:
        solid_hit: boolean sequence with one entry per solid; True
            marks a solid whose triangles should be recolored.
        colors: uint32 color per solid (parallel to ``solid_hit``).
    """
    # np.bool was a deprecated alias for the builtin bool and was
    # removed in NumPy 1.24; use the builtin directly.
    solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=bool))
    solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))

    module = get_cu_module('mesh.h', options=cuda_options)
    color_solids = module.get_function('color_solids')

    for first_triangle, triangles_this_round, blocks in \
            chunk_iterator(self.triangles.size, nblocks_per_thread,
                           max_blocks):
        color_solids(np.int32(first_triangle),
                     np.int32(triangles_this_round),
                     self.solid_id_map,
                     solid_hit_gpu,
                     solid_colors_gpu,
                     self.gpudata,
                     block=(nblocks_per_thread,1,1),
                     grid=(blocks,1))
def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
                 max_blocks=1024):
    """Recolor the triangles of the solids flagged in ``solid_hit``.

    Args:
        solid_hit: boolean sequence with one entry per solid; True
            marks a solid whose triangles should be recolored.
        colors: uint32 color per solid (parallel to ``solid_hit``).
    """
    # np.bool was a deprecated alias for the builtin bool and was
    # removed in NumPy 1.24; use the builtin directly.
    solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=bool))
    solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))

    module = get_cu_module('mesh.h', options=cuda_options)
    color_solids = module.get_function('color_solids')

    for first_triangle, triangles_this_round, blocks in \
            chunk_iterator(self.triangles.size, nblocks_per_thread,
                           max_blocks):
        color_solids(np.int32(first_triangle),
                     np.int32(triangles_this_round),
                     self.solid_id_map,
                     solid_hit_gpu,
                     solid_colors_gpu,
                     self.gpudata,
                     block=(nblocks_per_thread,1,1),
                     grid=(blocks,1))
def compare_sampling(self, hist, reps=10): queue = cl.CommandQueue(self.context) # make cdf histogram nbins = hist.GetNbinsX() xaxis = hist.GetXaxis() intg = hist.GetIntegral() cdf_y = np.empty(nbins + 1, dtype=float) cdf_x = np.empty_like(cdf_y) cdf_x[0] = xaxis.GetBinLowEdge(1) cdf_y[0] = 0.0 for i in xrange(1, len(cdf_x)): cdf_y[i] = intg[i] cdf_x[i] = xaxis.GetBinUpEdge(i) cdf_x_gpu = cl.array.to_device(queue, cdf_x.astype(np.float32)) cdf_y_gpu = cl.array.to_device(queue, cdf_y.astype(np.float32)) block = (self.nthreads_per_block, 1, 1) grid = (1, 1) out_gpu = cl.array.empty(queue, shape=self.nthreads_per_block, dtype=np.float32) out_h = rt.TH1D('out_h', '', hist.GetNbinsX(), xaxis.GetXmin(), xaxis.GetXmax()) out_h.SetLineColor(rt.kGreen) for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(reps, self.nthreads_per_block, max_blocks=1): self.funcs.test_sample_cdf(queue, (elements_this_iter, 1, 1), None, self.rng_states.data, np.int32(len(cdf_x_gpu)), cdf_x_gpu.data, cdf_y_gpu.data, out_gpu.data) out = out_gpu.get() for v in out[:elements_this_iter]: out_h.Fill(v) prob = out_h.KolmogorovTest(hist) out_h.Write() return prob, out_h
def __init__(self, photons, ncopies=1, cl_context=None):
    """Load ``photons`` onto the GPU, replicating as requested.

       Args:
           - photons: chroma.Event.Photons
               Photon state information to load onto GPU
           - ncopies: int, *optional*
               Number of times to replicate the photons
               on the GPU.  This is used if you want
               to propagate the same event many times,
               for example in a likelihood calculation.

               The amount of GPU storage will be proportionally
               larger if ncopies > 1, so be careful.
           - cl_context: OpenCL context, used only when the active
               API is OpenCL.
    """
    nphotons = len(photons)

    # Allocate GPU memory for photon info and push to device
    if api.is_gpu_api_cuda():
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                    dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                           dtype=np.uint32)  # deprecated
        self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.uint32)  # deprecated
    elif api.is_gpu_api_opencl():
        queue = cl.CommandQueue(cl_context)
        self.pos = ga.empty(queue, shape=nphotons * ncopies,
                            dtype=ga.vec.float3)
        self.dir = ga.empty(queue, shape=nphotons * ncopies,
                            dtype=ga.vec.float3)
        self.pol = ga.empty(queue, shape=nphotons * ncopies,
                            dtype=ga.vec.float3)
        self.wavelengths = ga.empty(queue, shape=nphotons * ncopies,
                                    dtype=np.float32)
        self.t = ga.empty(queue, shape=nphotons * ncopies,
                          dtype=np.float32)
        self.last_hit_triangles = ga.empty(queue, shape=nphotons * ncopies,
                                           dtype=np.int32)
        self.flags = ga.empty(queue, shape=nphotons * ncopies,
                              dtype=np.uint32)
        self.weights = ga.empty(queue, shape=nphotons * ncopies,
                                dtype=np.float32)
        self.current_node_index = ga.zeros(queue, shape=nphotons * ncopies,
                                           dtype=np.uint32)  # deprecated
        self.requested_workcode = ga.empty(queue, shape=nphotons * ncopies,
                                           dtype=np.uint32)  # deprecated

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1)
    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    self.last_hit_triangles[:nphotons].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    self.weights[:nphotons].set(photons.weights.astype(np.float32))

    if api.is_gpu_api_cuda():
        self.module = get_module('propagate.cu', options=api_options,
                                 include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = get_module('propagate.cl', cl_context,
                                 options=api_options,
                                 include_source_directory=True)
    # define the texture references
    self.define_texture_references()
    # get kernel functions
    self.gpu_funcs = GPUFuncs(self.module)

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                            np.int32(photons_this_round),
                                            self.pos, self.dir,
                                            self.wavelengths, self.pol,
                                            self.t, self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            np.int32(ncopies - 1),
                                            np.int32(nphotons),
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def concatenate_layers(layers):
    """Concatenate per-layer BVH node arrays into one device array,
    offsetting child indices of non-leaf layers (dual CUDA/OpenCL
    implementation).

    Returns (nodes, layer_bounds): the concatenated host ndarray and
    the cumulative layer start indices (0 prepended).
    """
    nthreads_per_block = 1024
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu', options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl', cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        # OpenCL path builds one flat host array and uploads it whole.
        totsize = 0
        layer_pos = []
        print layer_bounds[-1]
        for n, layer in enumerate(layers):
            layer_pos.append(totsize)
            print "LAYER ", n, " size=", len(layer), "start=", totsize
            totsize += len(layer)
        print "totsize: ", totsize
        nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4)
        nodes_iter_gpu = ga.to_device(queue, nodes_iter_np)
        nodeset_np = []
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')

    ilayer = 0
    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:],
                                             layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end
        #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset

        nmax_blocks = 10000
        if gpuapi.is_gpu_api_opencl():
            # OpenCL path launches one chunk at a time.
            nthreads_per_block = 256
            nmax_blocks = 1

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,
                               max_blocks=nmax_blocks):
            #print "  ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start
            if gpuapi.is_gpu_api_cuda():
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            elif gpuapi.is_gpu_api_opencl():
                layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes_iter_gpu.data,
                                          g_times_l=True).wait()
            else:
                raise RuntimeError('API neither CUDA nor OpenCL?!')
        ilayer += 1

    if gpuapi.is_gpu_api_cuda():
        return nodes.get(), layer_bounds
    elif gpuapi.is_gpu_api_opencl():
        return nodes_iter_gpu.get(), layer_bounds
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates: fixed-point scale chosen so the mesh
    # extent fits in 16 bits per axis
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin,
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles),
                             dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices),
                            dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block,
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles),
                              Mapped(vertices),
                              world_origin,
                              world_scale,
                              nodes,
                              morton_codes,
                              block=(nthreads_per_block, 1, 1),
                              grid=(nblocks_this_iter, 1))

    # Shift Morton codes down to the requested precision per axis.
    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
def __init__(self, photons, ncopies=1, copy_flags=True,
             copy_triangles=True, copy_weights=True):
    """Load ``photons`` onto the GPU, replicating as requested.

       Args:
           - photons: chroma.Event.Photons
               Photon state information to load onto GPU
           - ncopies: int, *optional*
               Number of times to replicate the photons
               on the GPU.  This is used if you want
               to propagate the same event many times,
               for example in a likelihood calculation.

               The amount of GPU storage will be proportionally
               larger if ncopies > 1, so be careful.
           - copy_flags, copy_triangles, copy_weights: bool, *optional*
               When False, skip uploading that field and initialize it
               to a default (flags=0, last_hit_triangles=-1, weights=1)
               instead.
    """
    nphotons = len(photons)
    self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
    self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
    self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
    self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
    self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
    self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                       dtype=np.int32)
    if not copy_triangles:
        self.last_hit_triangles.fill(-1)
    if not copy_flags:
        self.flags = ga.zeros(shape=nphotons * ncopies, dtype=np.uint32)
    else:
        self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
    if not copy_weights:
        self.weights = ga.ones_like(self.last_hit_triangles,
                                    dtype=np.float32)
    else:
        self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
    # BUG FIX: evidx must span all replicas like every other per-photon
    # array here; the photon_duplicate kernel below is passed self.evidx
    # and fills the replica slots, which would write past an
    # nphotons-long buffer whenever ncopies > 1.
    self.evidx = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1)
    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    if copy_triangles:
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
    if copy_flags:
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    if copy_weights:
        self.weights[:nphotons].set(photons.weights.astype(np.float32))
    self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

    module = get_cu_module('propagate.cu', options=cuda_options)
    self.gpu_funcs = GPUFuncs(module)

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                            np.int32(photons_this_round),
                                            self.pos, self.dir,
                                            self.wavelengths, self.pol,
                                            self.t, self.flags,
                                            self.last_hit_triangles,
                                            self.weights, self.evidx,
                                            np.int32(ncopies - 1),
                                            np.int32(nphotons),
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def get_hits(self, gpu_detector, target_flag=(0x1<<2), nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None):
    '''Return a map of GPUPhoton objects containing only photons that
    have a particular bit set in their history word and were detected by
    a channel.

    The result is a dict mapping channel id -> chroma.event.Photons
    holding the hit photons on that channel.  ``target_flag`` defaults to
    bit 2 of the history word (presumably SURFACE_DETECT -- confirm
    against propagate.cu).  ``start_photon``/``nphotons`` select a
    sub-range of the stored photons; they default to the full array.
    '''
    cuda.Context.get_current().synchronize()
    # Single device counter used by both kernels below to tally/slot hits.
    index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
    cuda.Context.get_current().synchronize()
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = self.pos.size - start_photon

    # First count how much space we need
    for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, nthreads_per_block, max_blocks):
        self.gpu_funcs.count_photon_hits(np.int32(start_photon+first_photon),
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         self.flags,
                                         gpu_detector.solid_id_map,
                                         self.last_hit_triangles,
                                         gpu_detector.detector_gpu,
                                         index_counter_gpu,
                                         block=(nthreads_per_block,1,1),
                                         grid=(blocks, 1))
    cuda.Context.get_current().synchronize()
    reduced_nphotons = int(index_counter_gpu.get()[0])
    # Then allocate new storage space
    pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
    flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
    weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

    # And finaly copy hits, if there are any
    if reduced_nphotons > 0:
        # Reuse the counter as the output write cursor for copy_photon_hits.
        index_counter_gpu.fill(0)
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            # NOTE: kernel argument order (inputs then outputs) must match
            # copy_photon_hits() in the CUDA source exactly.
            self.gpu_funcs.copy_photon_hits(np.int32(start_photon+first_photon),
                                            np.int32(photons_this_round),
                                            np.uint32(target_flag),
                                            gpu_detector.solid_id_map,
                                            gpu_detector.detector_gpu,
                                            index_counter_gpu,
                                            self.pos, self.dir,
                                            self.wavelengths, self.pol,
                                            self.t, self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            pos, dir, wavelengths, pol, t,
                                            flags, last_hit_triangles,
                                            weights, channels,
                                            block=(nthreads_per_block,1,1),
                                            grid=(blocks, 1))
        # The copy pass must have filled exactly the counted number of slots.
        assert index_counter_gpu.get()[0] == reduced_nphotons

    # Pull everything back to host; float3 arrays become (N, 3) float32.
    pos = pos.get().view(np.float32).reshape((len(pos),3))
    dir = dir.get().view(np.float32).reshape((len(dir),3))
    pol = pol.get().view(np.float32).reshape((len(pol),3))
    wavelengths = wavelengths.get()
    t = t.get()
    last_hit_triangles = last_hit_triangles.get()
    flags = flags.get()
    weights = weights.get()
    channels = channels.get()
    # Group the flat hit arrays by channel id.
    hitmap = {}
    for chan in np.unique(channels):
        mask = (channels == chan).astype(bool)
        hitmap[chan] = event.Photons(pos[mask], dir[mask], pol[mask],
                                     wavelengths[mask], t[mask],
                                     last_hit_triangles[mask], flags[mask],
                                     weights[mask])
    return hitmap
def optimize_layer(orig_nodes):
    """Greedily reorder one BVH layer to reduce the total parent-pair area.

    Walks node pairs, querying the GPU for a swap partner that lowers the
    paired bounding-box area, and swaps when one is found.  Returns the
    reordered copy of ``orig_nodes`` as a host array; the input is not
    modified in place (it is copied to the GPU first).

    Fixes relative to the original: Python-2 ``print`` statements converted
    to ``print()`` calls and ``n/2`` changed to ``n//2`` -- under Python 3
    true division yields a float, which is invalid for array shapes and
    drifts the loop bound.
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    # One area entry per node pair (integer division required on Python 3).
    areas = ga.empty(shape=n//2, dtype=np.uint64)
    nthreads_per_block = 128

    # Per-block reduction buffers for the min-distance search.
    min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000  # progress-report interval, in pair indices

    skip_size = 1
    # Host-mapped flag the kernel sets when a better partner exists.
    flag = mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n//2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n//2, nthreads_per_block, max_blocks=10000):
                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block,1,1),
                                    grid=(nblocks_this_iter,1))
            areas_host = areas.get()
            print('Area of parent layer so far (%d): %1.12e'
                  % (i*2, areas_host.astype(float).sum()))
            print('Skips: %d, Swaps: %d' % (skips, swaps))

        test_index = i * 2

        blocks = 0
        # Limit how far ahead we search for a swap partner.
        look_forward = min(8192*50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      Mapped(flag),
                                      block=(nthreads_per_block,1,1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            # No improvement found for this pair; advance past all
            # positions the kernel cleared.
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        # Pick the globally best candidate over all launched blocks.
        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
                       nodes, block=(1,1,1), grid=(1,1))
        cuda.Context.get_current().synchronize()
        i += 1

    # Final area report over the whole layer.
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n//2, nthreads_per_block, max_blocks=10000):
        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block,1,1),
                            grid=(nblocks_this_iter,1))
    areas_host = areas.get()
    print('Final area of parent layer: %1.12e' % areas_host.sum())
    print('Skips: %d, Swaps: %d' % (skips, swaps))

    return nodes.get()
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

    ``mesh``: chroma.geometry.Mesh
        Triangles to box
    ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
    ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number.
        Extra nodes will be all zero.

    Returns (world_coords, nodes, morton_codes), where
    ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
    ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
    ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # Compile the BVH kernels.
    module = get_cu_module('bvh.cu', options=cuda_options,
                           include_source_directory=True)
    funcs = GPUFuncs(module)

    # Fixed-point world coordinate system spanning the mesh bounds.
    origin = mesh.vertices.min(axis=0)
    scale = np.max((mesh.vertices.max(axis=0) - origin)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=origin, world_scale=scale)

    # Stage triangles and vertices in mapped (zero-copy) host memory.
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Device outputs: one leaf per triangle (padded), one Morton code each.
    ntriangles = len(triangles)
    nodes = ga.zeros(shape=round_up_to_multiple(ntriangles, round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=ntriangles, dtype=np.uint64)

    # GPU-friendly forms of the coordinate system parameters.
    origin_gpu = ga.vec.make_float3(*origin)
    scale_gpu = np.float32(scale)

    threads = 256
    for start, count, nblocks in chunk_iterator(ntriangles, threads,
                                                max_blocks=30000):
        funcs.make_leaves(np.uint32(start),
                          np.uint32(count),
                          Mapped(triangles), Mapped(vertices),
                          origin_gpu, scale_gpu,
                          nodes, morton_codes,
                          block=(threads,1,1),
                          grid=(nblocks,1))

    # Drop the unused low-order bits of each axis' Morton component.
    morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
def merge_nodes(nodes, degree, max_ratio=None):
    """Merge ``nodes`` into parent nodes with branching factor ``degree``.

    Runs the ``make_parents`` kernel to box groups of ``degree`` children,
    then (when ``max_ratio`` is not None) optionally flattens parents whose
    bounding box is much larger than their children's total area.  Returns
    the parent node array (host ndarray of uint4).

    Fixes relative to the original: ``len(nodes) / degree`` changed to
    ``//`` (float under Python 3, breaking shapes/indexing), Python-2
    ``print`` statements converted to ``print()`` calls, and ``xrange``
    replaced with ``range`` (NameError on Python 3).
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Ceiling division: one parent per full-or-partial group of children.
    nparent = len(nodes) // degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  # was: degree
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block,1,1),
                               grid=(nblocks_this_iter,1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        # Mark parents whose area dwarfs the sum of their children's areas.
        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index+nchild].sum()
            if child_area/parent_area < 0.3:
                excessive_area[i] = True

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print('Extra slots:', extra_slots)

        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        # Replace each flagged parent with its children promoted in place,
        # sliding later entries over to make room.
        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            # Rebase the promoted node's child offset into the child layer.
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = \
                tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            new_parent_nodes[index+nchild:] = \
                new_parent_nodes[index+1:-nchild+1]
            offset += nchild - 1
            for sibling in range(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = \
                    nodes[child_index + sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = \
                        new_parent_nodes['w'][new_parent_index] >> CHILD_BITS
                    tmp_child_index = \
                        new_parent_nodes['w'][new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][new_parent_index] = \
                        tmp_nchild << CHILD_BITS | \
                        (tmp_child_index + len(nodes))

        print('old: %e' % node_areas(parent_nodes).max())
        print('new: %e' % node_areas(new_parent_nodes).max())
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
def get_flat_hits(self, gpu_detector, target_flag=(0x1 << 2), nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, no_map=False):
    '''GPUPhoton objects containing only photons that have a particular
    bit set in their history word and were detected by a channel.

    Unlike get_hits(), this returns a single flat chroma.event.Photons
    (with per-photon channel ids) instead of a per-channel dict.
    ``no_map`` is currently unused in this implementation.
    '''
    cuda.Context.get_current().synchronize()
    # Device counter shared by the count and copy passes.
    index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
    cuda.Context.get_current().synchronize()
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = self.pos.size - start_photon

    # First count how much space we need
    for first_photon, photons_this_round, blocks in chunk_iterator(
            nphotons, nthreads_per_block, max_blocks):
        self.gpu_funcs.count_photon_hits(np.int32(start_photon + first_photon),
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         self.flags,
                                         gpu_detector.solid_id_map,
                                         self.last_hit_triangles,
                                         gpu_detector.detector_gpu,
                                         index_counter_gpu,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
    cuda.Context.get_current().synchronize()
    reduced_nphotons = int(index_counter_gpu.get()[0])
    # Then allocate new storage space
    pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
    flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
    weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
    channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

    # And finaly copy hits, if there are any
    if reduced_nphotons > 0:
        # Reuse the counter as the output write cursor.
        index_counter_gpu.fill(0)
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            # NOTE: argument order (inputs then outputs) must match the
            # copy_photon_hits kernel signature exactly.
            self.gpu_funcs.copy_photon_hits(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                gpu_detector.solid_id_map,
                gpu_detector.detector_gpu,
                index_counter_gpu,
                self.pos, self.dir, self.wavelengths, self.pol, self.t,
                self.flags, self.last_hit_triangles, self.weights,
                self.evidx,
                pos, dir, wavelengths, pol, t, flags, last_hit_triangles,
                weights, evidx, channels,
                block=(nthreads_per_block, 1, 1),
                grid=(blocks, 1))
        # Copy pass must fill exactly the counted number of slots.
        assert index_counter_gpu.get()[0] == reduced_nphotons

    # Pull everything back to host; float3 arrays become (N, 3) float32.
    pos = pos.get().view(np.float32).reshape((len(pos), 3))
    dir = dir.get().view(np.float32).reshape((len(dir), 3))
    pol = pol.get().view(np.float32).reshape((len(pol), 3))
    wavelengths = wavelengths.get()
    t = t.get()
    last_hit_triangles = last_hit_triangles.get()
    flags = flags.get()
    weights = weights.get()
    evidx = evidx.get()
    channels = channels.get()
    # NOTE(review): hitmap is never used below -- leftover from the
    # per-channel-map variant of this method.
    hitmap = {}
    return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                         flags, weights, evidx, channels)
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1, nthreads_per_block=32, max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

    ``mesh``: chroma.geometry.Mesh
        Triangles to box
    ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
    ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number.
        Extra nodes will be all zero.
    ``nthreads_per_block``, ``max_blocks``: int
        Launch configuration (the CUDA path overrides max_blocks with
        30000, matching the original behavior).

    Returns (world_coords, nodes, morton_codes), where
    ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
    ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
    ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.

    Fixes relative to the original: the Python-2 ``print`` statement in
    the OpenCL loop is now a ``print()`` call (SyntaxError on Python 3),
    and an explicit error is raised when neither GPU API is active, for
    consistency with merge_nodes_detailed().
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu', options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl', cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory.
    # Unfortunately, OpenCL and CUDA have different memory-management
    # methods here, so the code diverges per API.
    if gpuapi.is_gpu_api_cuda():
        # CUDA supports mapped host memory: host and device views of the
        # same allocation, so no explicit transfers are needed.
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes, morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    elif gpuapi.is_gpu_api_opencl():
        # Here we allocate a host buffer, then a device buffer, and push
        # the data across explicitly.
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)

        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks):
            # was a Python-2 print statement
            print(first_index, elements_this_iter, nblocks_this_iter)
            bvh_funcs.make_leaves(queue,
                                  (nblocks_this_iter, 1, 1),
                                  (nthreads_per_block, 1, 1),
                                  np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  triangles_dev.data,
                                  vertices_dev.data,
                                  world_origin,
                                  world_scale,
                                  nodes.data, morton_codes.data,
                                  g_times_l=True).wait()

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10, use_weights=False, scatter_first=0, track=False):
    """Propagate photons on GPU to termination or max_steps, whichever
    comes first.

    May be called repeatedly without reloading photon information if
    single-stepping through photon history.

    When ``track`` is True, returns (step_photon_ids, step_photons): the
    queued photon ids and a snapshot of their state at every step.
    Otherwise returns None.

    ..warning::
        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
        number of curandStates.
    """
    nphotons = self.pos.size
    step = 0
    # Work queues: element 0 is the queue length/cursor, elements 1..N
    # are photon indices.  Active photons ping-pong between the two.
    input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
    input_queue[0] = 0
    # Order photons initially in the queue to put the clones next to each other
    for copy in range(self.ncopies):
        input_queue[1 + copy::self.ncopies] = np.arange(
            self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
    input_queue_gpu = ga.to_gpu(input_queue)
    output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
    output_queue[0] = 1
    output_queue_gpu = ga.to_gpu(output_queue)

    if track:
        step_photon_ids = []
        step_photons = []
        #save the first step for all photons in the input queue
        step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
        step_photons.append(
            self.copy_queue(input_queue_gpu[1:], nphotons).get())

    while step < max_steps:
        # Just finish the rest of the steps if the # of photons is low and not tracking
        if not track and (nphotons < nthreads_per_block * 16 * 8 or use_weights):
            nsteps = max_steps - step
        else:
            nsteps = 1

        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.propagate(np.int32(first_photon),
                                     np.int32(photons_this_round),
                                     input_queue_gpu[1:],
                                     output_queue_gpu,
                                     rng_states,
                                     self.pos, self.dir,
                                     self.wavelengths, self.pol,
                                     self.t, self.flags,
                                     self.last_hit_triangles,
                                     self.weights, self.evidx,
                                     np.int32(nsteps),
                                     np.int32(use_weights),
                                     np.int32(scatter_first),
                                     gpu_geometry.gpudata,
                                     block=(nthreads_per_block, 1, 1),
                                     grid=(blocks, 1))

        if track:
            #save the next step for all photons in the input queue
            step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
            step_photons.append(
                self.copy_queue(input_queue_gpu[1:], nphotons).get())

        step += nsteps
        scatter_first = 0  # Only allow non-zero in first pass
        if step < max_steps:
            # Swap queues: last round's output becomes the next input.
            temp = input_queue_gpu
            input_queue_gpu = output_queue_gpu
            output_queue_gpu = temp
            # Assign with a numpy array of length 1 to silence
            # warning from PyCUDA about setting array with different strides/storage orders.
            output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
            # Element 0 holds the count of queued entries (cursor starts
            # at 1), so the number of still-active photons is count - 1.
            nphotons = input_queue_gpu[:1].get()[0] - 1
            if nphotons == 0:
                break

    # Bit 31 of the history word is set by the kernel on aborted photons.
    if ga.max(self.flags).get() & (1 << 31):
        print("WARNING: ABORTED PHOTONS", file=sys.stderr)
    cuda.Context.get_current().synchronize()

    if track:
        return step_photon_ids, step_photons
def merge_nodes(nodes, degree, max_ratio=None):
    """Merge ``nodes`` into parent nodes with branching factor ``degree``.

    Runs the ``make_parents`` kernel to box groups of ``degree`` children,
    then (when ``max_ratio`` is not None) optionally flattens parents whose
    bounding box is much larger than their children's total area.  Returns
    the parent node array (host ndarray of uint4).

    Fixes relative to the original: ``len(nodes) / degree`` changed to
    ``//`` (float under Python 3, breaking shapes/indexing), Python-2
    ``print`` statements converted to ``print()`` calls, and ``xrange``
    replaced with ``range`` (NameError on Python 3).
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Ceiling division: one parent per full-or-partial group of children.
    nparent = len(nodes) // degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  # was: degree
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block, 1, 1),
                               grid=(nblocks_this_iter, 1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        # Mark parents whose area dwarfs the sum of their children's areas.
        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            if child_area / parent_area < 0.3:
                excessive_area[i] = True

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print('Extra slots:', extra_slots)

        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        # Replace each flagged parent with its children promoted in place,
        # sliding later entries over to make room.
        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            # Rebase the promoted node's child offset into the child layer.
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            new_parent_nodes[index + nchild:] = \
                new_parent_nodes[index + 1:-nchild + 1]
            offset += nchild - 1
            for sibling in range(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = \
                    nodes[child_index + sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

        print('old: %e' % node_areas(parent_nodes).max())
        print('new: %e' % node_areas(new_parent_nodes).max())
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using the provided
    arrays to determine the index of the first child of each parent, and
    how many children there are.

    Works on either GPU API: compiles the matching BVH kernel source and
    launches ``make_parents_detailed`` over all parents in chunks.
    Returns the parent node array pulled back to the host.
    '''
    threads_per_block = 256

    # OpenCL needs a context and command queue up front.
    ocl_context = None
    ocl_queue = None
    if gpuapi.is_gpu_api_opencl():
        ocl_context = cltools.get_last_context()
        ocl_queue = cl.CommandQueue(ocl_context)

    # Compile the BVH kernels for whichever API is active.
    if gpuapi.is_gpu_api_cuda():
        module = get_module('bvh.cu', options=api_options,
                            include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        module = get_module('bvh.cl', ocl_context, options=api_options,
                            include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    kernels = GPUFuncs(module)

    # Stage the inputs on the device; one output slot per parent.
    nparent = len(first_child)
    if gpuapi.is_gpu_api_cuda():
        nodes_dev = ga.to_gpu(nodes)
        first_child_dev = ga.to_gpu(first_child.astype(np.int32))
        nchild_dev = ga.to_gpu(nchild.astype(np.int32))
        parents_dev = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        nodes_dev = ga.to_device(ocl_queue, nodes)
        first_child_dev = ga.to_device(ocl_queue,
                                       first_child.astype(np.int32))
        nchild_dev = ga.to_device(ocl_queue, nchild.astype(np.int32))
        parents_dev = ga.to_device(ocl_queue,
                                   np.zeros(shape=nparent,
                                            dtype=ga.vec.uint4))
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Launch the kernel over all parents, chunk by chunk.
    for start, count, nblocks in chunk_iterator(nparent, threads_per_block,
                                                max_blocks=10000):
        if gpuapi.is_gpu_api_cuda():
            kernels.make_parents_detailed(np.uint32(start),
                                          np.uint32(count),
                                          nodes_dev,
                                          parents_dev,
                                          first_child_dev,
                                          nchild_dev,
                                          block=(threads_per_block, 1, 1),
                                          grid=(nblocks, 1))
        elif gpuapi.is_gpu_api_opencl():
            kernels.make_parents_detailed(ocl_queue,
                                          (count, 1, 1),
                                          None,
                                          np.uint32(start),
                                          np.uint32(count),
                                          nodes_dev.data,
                                          parents_dev.data,
                                          first_child_dev.data,
                                          nchild_dev.data).wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return parents_dev.get()
def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, weight=1.0, cl_context=None):
    """run UBooNE DAQ acquire kernels

    Bins detected photons into per-channel ADC/time histograms via the
    run_daq kernel (CUDA or OpenCL).  ``start_photon``/``nphotons``
    select a sub-range of ``gpuphotons``; ``cl_context`` is required for
    the OpenCL path.  The multi-DAQ path (self.ndaq != 1) is not built
    and raises RuntimeError.

    Fix relative to the original: ``raise RunTimeError`` corrected to
    ``RuntimeError`` (the misspelling was itself a NameError at runtime).
    """
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = len(gpuphotons.pos) - start_photon

    if api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        clmaxblocks = max_blocks

    # We loop over all photons and bin them essentially
    if self.ndaq == 1:
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.run_daq(rng_states,
                                       np.uint32(event.SURFACE_DETECT),
                                       np.int32(start_photon + first_photon),
                                       np.int32(photons_this_round),
                                       gpuphotons.t,
                                       gpuphotons.flags,
                                       gpuphotons.last_hit_triangles,
                                       gpuphotons.weights,
                                       self.solid_id_map_gpu,
                                       self.detector_gpu,
                                       self.adc_gpu,
                                       np.int32(self.nchannels),
                                       np.int32(self.ntdcs),
                                       np.float32(self.ns_per_tdc),
                                       np.float32(100.0),
                                       self.channel_history_gpu,
                                       np.float32(weight),
                                       block=(nthreads_per_block, 1, 1),
                                       grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                # NOTE(review): this passes np.int32(nphotons) where the
                # CUDA path passes photons_this_round -- confirm intended.
                self.gpu_funcs.run_daq(
                    comqueue, (photons_this_round, 1, 1), None,
                    rng_states.data,
                    np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(nphotons),
                    gpuphotons.t.data,
                    gpuphotons.pos.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu.data,
                    # -- Detector struct --
                    self.solid_id_to_channel_index_gpu.data,
                    # ---------------------
                    self.uint_adc_gpu.data,
                    np.int32(self.nchannels),
                    np.int32(self.ntdcs),
                    np.float32(self.ns_per_tdc),
                    np.float32(100.0),
                    self.channel_history_gpu.data,
                    # -- Channel transforms --
                    self.channel_inverse_rot_gpu.data,
                    self.channel_inverse_trans_gpu.data,
                    # ------------------------
                    np.float32(weight),
                    g_times_l=False).wait()
        # if opencl, need to convert ADC from uint to float
        if api.is_gpu_api_opencl():
            self.gpu_funcs.convert_adc(comqueue,
                                       (int(self.nchannels), 1, 1),
                                       None,
                                       self.uint_adc_gpu.data,
                                       self.adc_gpu.data,
                                       np.int32(self.nchannels),
                                       np.int32(self.ntdcs),
                                       g_times_l=False).wait()
    else:
        # BUG FIX: was `RunTimeError`, which is undefined and would
        # itself raise NameError.
        raise RuntimeError("Multi-DAQ not built")
        # NOTE(review): everything below is unreachable because of the
        # unconditional raise above; preserved verbatim from the original.
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, 1, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.run_daq_many(
                    rng_states, np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t,
                    gpuphotons.flags,
                    gpuphotons.last_hit_triangles,
                    gpuphotons.weights,
                    self.solid_id_map_gpu,
                    self.detector_gpu,
                    self.earliest_time_int_gpu,
                    self.channel_q_int_gpu,
                    self.channel_history_gpu,
                    np.int32(self.ndaq),
                    np.int32(self.stride),
                    np.float32(weight),
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                self.gpu_funcs.run_daq_many(
                    comqueue, (nthreads_per_block, 1, 1), (blocks, 1),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu,
                    # -- Detector Struct --
                    self.solid_id_to_channel_index_gpu.data,
                    self.detector_gpu.time_cdf_x_gpu.data,
                    self.detector_gpu.time_cdf_y_gpu.data,
                    self.detector_gpu.charge_cdf_x_gpu.data,
                    self.detector_gpu.charge_cdf_y_gpu.data,
                    self.detector_gpu.nchannels,
                    self.detector_gpu.time_cdf_len,
                    self.detector_gpu.charge_cdf_len,
                    self.detector_gpu.charge_unit,
                    # ---------------------
                    self.earliest_time_int_gpu.data,
                    self.channel_q_int_gpu.data,
                    self.channel_history_gpu.data,
                    np.int32(self.ndaq),
                    np.int32(self.stride),
                    np.float32(weight),
                    g_times_l=True).wait()

    # Make sure all device work has finished before returning.
    if api.is_gpu_api_cuda():
        cuda.Context.get_current().synchronize()
    elif api.is_gpu_api_opencl():
        cl.enqueue_barrier(comqueue)
def merge_nodes(nodes, degree, max_ratio=None):
    """Merge a BVH layer's nodes into parent nodes, `degree` children per parent.

    Runs the make_parents GPU kernel over `nodes`, then (optionally) splits
    parents whose bounding area greatly exceeds their children's combined
    area, promoting the children up one level.

    Parameters
    ----------
    nodes : numpy structured array of uint4 BVH nodes (child count/index
        packed in the 'w' field via CHILD_BITS / NCHILD_MASK)
    degree : int branching factor
    max_ratio : if not None, enable the excessive-area splitting pass

    Returns
    -------
    numpy array of parent nodes (possibly with promoted children appended).
    """
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu', options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl', context, options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)
    # determine number of parents
    # (Python 2 integer division: ceil(len(nodes)/degree) via remainder check)
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1
    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree
    # allocate memory
    # NOTE(review): the OpenCL branch allocates nparent entries while the CUDA
    # branch allocates nparent_pad -- confirm this asymmetry is intentional
    # (with multiple=1 they are currently equal).
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes,
                                   cuda.In(nodes),
                                   np.uint32(0),
                                   np.uint32(len(nodes)),
                                   block=(nthreads_per_block, 1, 1),
                                   grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None,
                                   np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes.data,
                                   gpu_nodes.data,
                                   np.uint32(0),
                                   np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    parent_nodes = gpu_parent_nodes.get()
    if max_ratio is not None:
        # Mark parents whose area dwarfs their children's combined area.
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)
        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            # unpack child count and first-child index from the packed 'w' word
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            # NOTE(review): threshold 0.3 is hard-coded; max_ratio is only
            # used as an on/off switch -- confirm that is intended.
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
        # Each split parent is replaced by its (up to degree) children,
        # so we need (degree-1) extra slots per split.
        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes
        offset = 0  # running shift caused by earlier splits
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            # promote the first child into the parent's slot, re-pointing its
            # child index into the grandchild layer (offset by len(nodes))
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))
            if nchild == 1:
                continue
            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = \
                new_parent_nodes[index + 1:-nchild + 1]
            offset += nchild - 1
            # copy the remaining siblings into the newly opened slots
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = \
                    nodes[child_index + sibling + 1]
                # non-empty node ('x' != 0): re-point its children as above
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))
                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)
        #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes
    return parent_nodes
def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, weight=1.0, cl_context=None):
    """Run the standard DAQ kernels: record earliest hit time, charge, and
    history bits per channel for detected photons.

    Parameters
    ----------
    gpuphotons : GPU photon struct-of-arrays (t, flags, last_hit_triangles, weights, ...)
    rng_states : device-side RNG state array
    nthreads_per_block, max_blocks : kernel launch configuration
    start_photon, nphotons : slice of the photon arrays to process
        (defaults: all photons)
    weight : float scale applied to each photon's contribution
    cl_context : OpenCL context (required only for the OpenCL API path)
    """
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = len(gpuphotons.pos) - start_photon
    if api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        clmaxblocks = max_blocks
    if self.ndaq == 1:
        # single DAQ: one pass over all photons, chunked for the launch limits
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                # 0x1 << 2 is the SURFACE_DETECT history bit
                self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2),
                                       np.int32(start_photon + first_photon),
                                       np.int32(photons_this_round),
                                       gpuphotons.t,
                                       gpuphotons.flags,
                                       gpuphotons.last_hit_triangles,
                                       gpuphotons.weights,
                                       self.solid_id_map_gpu,
                                       self.detector_gpu,
                                       self.earliest_time_int_gpu,
                                       self.channel_q_int_gpu,
                                       self.channel_history_gpu,
                                       np.float32(weight),
                                       block=(nthreads_per_block, 1, 1),
                                       grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                #print "daq: ",start_photon,first_photon,start_photon+first_photon,(photons_this_round/nthreads_per_block,1,1), (nthreads_per_block,1,1)
                # NOTE(review): global size photons_this_round/nthreads_per_block
                # with g_times_l=True relies on photons_this_round being a
                # multiple of nthreads_per_block -- confirm chunk_iterator
                # guarantees this, else trailing photons are dropped.
                self.gpu_funcs.run_daq(
                    comqueue,
                    (photons_this_round / nthreads_per_block, 1, 1),
                    (nthreads_per_block, 1, 1),
                    rng_states.data,
                    np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu.data,
                    # -- Detector struct --
                    self.solid_id_to_channel_index_gpu.data,
                    self.detector_gpu.time_cdf_x_gpu.data,
                    self.detector_gpu.time_cdf_y_gpu.data,
                    self.detector_gpu.charge_cdf_x_gpu.data,
                    self.detector_gpu.charge_cdf_y_gpu.data,
                    self.detector_gpu.nchannels,
                    self.detector_gpu.time_cdf_len,
                    self.detector_gpu.charge_cdf_len,
                    self.detector_gpu.charge_unit,
                    # ---------------------
                    self.earliest_time_int_gpu.data,
                    self.channel_q_int_gpu.data,
                    self.channel_history_gpu.data,
                    np.float32(weight),
                    g_times_l=True).wait()
    else:
        # multiple DAQs: strided kernel, one photon per work-item chunk
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, 1, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.run_daq_many(
                    rng_states, np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t,
                    gpuphotons.flags,
                    gpuphotons.last_hit_triangles,
                    gpuphotons.weights,
                    self.solid_id_map_gpu,
                    self.detector_gpu,
                    self.earliest_time_int_gpu,
                    self.channel_q_int_gpu,
                    self.channel_history_gpu,
                    np.int32(self.ndaq),
                    np.int32(self.stride),
                    np.float32(weight),
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                # NOTE(review): unlike the CUDA path, no rng_states argument is
                # passed here, and solid_id_map_gpu is passed without .data --
                # both look like defects; verify against the .cl kernel
                # signature before relying on this path.
                self.gpu_funcs.run_daq_many(
                    comqueue, (nthreads_per_block, 1, 1), (blocks, 1),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu,
                    # -- Detector Struct --
                    self.solid_id_to_channel_index_gpu.data,
                    self.detector_gpu.time_cdf_x_gpu.data,
                    self.detector_gpu.time_cdf_y_gpu.data,
                    self.detector_gpu.charge_cdf_x_gpu.data,
                    self.detector_gpu.charge_cdf_y_gpu.data,
                    self.detector_gpu.nchannels,
                    self.detector_gpu.time_cdf_len,
                    self.detector_gpu.charge_cdf_len,
                    self.detector_gpu.charge_unit,
                    # ---------------------
                    self.earliest_time_int_gpu.data,
                    self.channel_q_int_gpu.data,
                    self.channel_history_gpu.data,
                    np.int32(self.ndaq),
                    np.int32(self.stride),
                    np.float32(weight),
                    g_times_l=True).wait()
    # block until all queued kernels have finished
    if api.is_gpu_api_cuda():
        cuda.Context.get_current().synchronize()
    elif api.is_gpu_api_opencl():
        cl.enqueue_barrier(comqueue)
def optimize_layer(orig_nodes):
    """Greedily reorder a BVH layer's nodes to reduce total parent-pair area.

    CUDA-only. Walks the layer in pairs; for each pair it searches forward
    (min_distance_to kernel) for a node that would make a smaller-area pair,
    and swaps it in if found. Progress is reported every `update` pairs.

    Parameters
    ----------
    orig_nodes : numpy array of BVH nodes for one layer

    Returns
    -------
    numpy array: the reordered copy of the layer (host side).
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n / 2, dtype=np.uint64)
    nthreads_per_block = 128
    # one min-area/min-index slot per block of the search kernel
    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)
    update = 10000  # progress-report interval, in pairs
    skip_size = 1
    # host-mapped flag array written by the kernel (zero-copy readback)
    flag = cutools.mapped_empty(shape=skip_size, dtype=np.uint32)
    i = 0
    skips = 0
    swaps = 0
    while i < n / 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))
            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)
        test_index = i * 2
        blocks = 0
        # bound the forward search window so indexing stays in range
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      cutools.Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()
        # flag[0] == 0 means no improving candidate found for this pair:
        # advance past the run of already-good pairs
        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue
        # pick the globally best candidate over all kernel blocks
        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]
        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1), np.uint32(better_i),
                       nodes,
                       block=(1, 1, 1),
                       grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1
    # final area report over the whole layer
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))
    areas_host = areas.get()
    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)
    return nodes.get()
def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1, seed=None, cl_context=None): """ Generates photons from information in the steps_arr Parameters ---------- steps_arr : numpy.array with shape=(N,10) dtype=np.float contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constatn, slow_time_constatn ] in the future could generalize this to many different time components. developed for liquid argon TPCs. multiple : float scale up the number of photons generated (not implemented yet) """ self.steps_array = steps_arr self.nsteps = self.steps_array.shape[0] if multiple!=1.0: raise RuntimeError('Have not implemented scaling of the number of photons generated.') # =========================== # GEN PHOTONS tstart_genphotons = time.time() # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here) # on the CPU, we scan the steps to determine the total number of photons using poisson statistics # we assume the user has seeded the random number generator to her liking tstart_nphotons = time.time() self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 ) #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int ) self.nphotons_per_step = self.steps_array[ self._nphotons, : ] self.nphotons = reduce( lambda x, y : x + y, self.nphotons_per_step.ravel() ) print "NSTEPS: ",self.nsteps print "NPHOTONS: ",self.nphotons," (time to determine per step=",time.time()-tstart_nphotons # now we make an index array for which step we need to get info from self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 ) current_index=0 for n, n_per_step in enumerate( self.nphotons_per_step ): self.source_step_index[current_index:current_index+n_per_step] = n current_index += n_per_step # push everything to the GPU tstart_transfer = time.time() if api.is_gpu_api_cuda(): # step info self.step_pos1_gpu = 
ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio ) self.source_step_index_gpu = ga.to_gpu( self.source_step_index ) # photon info self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) ) self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) elif api.is_gpu_api_opencl(): cl_queue = cl.CommandQueue( cl_context ) # step info self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_device( cl_queue, self.step_fsratio ) self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index ) # photon info self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) ) self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) ) self.t.set( self.steps_array[:,3] ) self.ncopies = ncopies self.true_nphotons = 
self.nphotons if self.ncopies!=1: raise ValueError('support for multiple copies not supported') if api.is_gpu_api_cuda(): self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True ) elif api.is_gpu_api_opencl(): self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True ) self.gpufuncs = GPUFuncs( self.gpumod ) print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer # need random numbers tgpu = time.time() if seed==None: seed = 5 rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context) for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks): if api.is_gpu_api_cuda(): self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu, self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states, self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights, block=(nthreads_per_block,1,1), grid=(blocks, 1) ) elif api.is_gpu_api_opencl(): self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None, np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data, self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states.data, self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait() else: raise RuntimeError("GPU API is neither CUDA nor OpenCL!") if api.is_gpu_api_cuda(): cuda.Context.get_current().synchronize() tend_genphotons = time.time() print 
"GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")" # Now load modules if api.is_gpu_api_cuda(): self.module = get_module('propagate.cu', options=api_options, include_source_directory=True) elif api.is_gpu_api_opencl(): self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True) # define the texture references self.define_texture_references() # get kernel functions self.gpu_funcs = GPUFuncs(self.module)
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10, use_weights=False, scatter_first=0, cl_context=None):
    """Propagate photons on GPU to termination or max_steps, whichever
    comes first.

    May be called repeatedly without reloading photon information if
    single-stepping through photon history.

    ..warning::
        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
        number of curandStates.
    """
    nphotons = self.pos.size
    # bind node texture reference (first CUDA call only)
    if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
        # we have to unroll, as pycuda doesn't seem to support vector types
        # right now for binding
        self.unrolled_nodes = ga.to_gpu(
            gpu_geometry.nodes.get().ravel().view(np.uint32))
        self.unrolled_extra_nodes = ga.to_gpu(
            gpu_geometry.extra_nodes.ravel().view(np.uint32))
        self.unrolled_triangles = ga.to_gpu(
            gpu_geometry.triangles.get().ravel().view(np.uint32))
        self.unrolled_triangles4 = ga.to_gpu(
            gpu_geometry.triangles4.ravel().view(np.uint32))
        self.unrolled_vertices = ga.to_gpu(
            gpu_geometry.vertices.get().ravel().view(np.float32))
        self.unrolled_vertices4 = ga.to_gpu(
            gpu_geometry.vertices4.ravel().view(np.float32))
        self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                          self.unrolled_nodes.nbytes)
        self.extra_node_texture_ref.set_address(
            self.unrolled_extra_nodes.gpudata,
            self.unrolled_extra_nodes.nbytes)
        #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
        #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
        #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
        self.triangles_texture_ref.set_address(
            self.unrolled_triangles4.gpudata,
            self.unrolled_triangles4.nbytes)
        #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
        self.vertices_texture_ref.set_address(
            self.unrolled_vertices4.gpudata,
            self.unrolled_vertices4.nbytes)
        print "[BOUND TO TEXTURE MEMORY]"
        print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
        print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
        print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
        print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
        print "Total: ", (self.unrolled_nodes.nbytes +
                          self.unrolled_extra_nodes.nbytes +
                          self.unrolled_triangles4.nbytes +
                          self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
        self.node_texture_ref_bound = True

    # setup queue: slot 0 holds a counter, slots 1.. hold photon indices
    maxqueue = nphotons
    step = 0
    input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
    input_queue[0] = 0
    # Order photons initially in the queue to put the clones next to each other
    for copy in xrange(self.ncopies):
        input_queue[1 + copy::self.ncopies] = np.arange(
            self.true_nphotons,
            dtype=np.uint32) + copy * self.true_nphotons
    if api.is_gpu_api_cuda():
        input_queue_gpu = ga.to_gpu(input_queue)
    elif api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        # NOTE(review): OpenCL uploads input_queue WITHOUT the counter slot
        # (the original author also questioned this offset) -- the requeue
        # code below re-sets it from temp_out[1:], which is consistent, but
        # verify against the .cl kernel's expectations.
        input_queue_gpu = ga.to_device(comqueue,
                                       input_queue[1:])  # why the offset?
    output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
    output_queue[0] = 1
    if api.is_gpu_api_cuda():
        output_queue_gpu = ga.to_gpu(output_queue)
    elif api.is_gpu_api_opencl():
        output_queue_gpu = ga.to_device(comqueue, output_queue)

    if use_weights:
        iuse_weights = 1
    else:
        iuse_weights = 0

    adapt_factor = 1.0
    start_prop = time.time()
    while step < max_steps:
        # Just finish the rest of the steps if the # of photons is low
        #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
        #    nsteps = max_steps - step
        #else:
        #    nsteps = 1
        nsteps = 1
        start_step = time.time()
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block,
                               max( int(adapt_factor*max_blocks), 1 )):
            #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
            start_chunk = time.time()
            if api.is_gpu_api_cuda():
                self.gpu_funcs.propagate(np.int32(first_photon),
                                         np.int32(photons_this_round),
                                         input_queue_gpu[1:],
                                         output_queue_gpu,
                                         rng_states,
                                         self.pos,
                                         self.dir,
                                         self.wavelengths,
                                         self.pol,
                                         self.t,
                                         self.flags,
                                         self.last_hit_triangles,
                                         self.weights,
                                         np.int32(nsteps),
                                         np.int32(iuse_weights),
                                         np.int32(scatter_first),
                                         gpu_geometry.gpudata,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
                #cuda.Context.get_current().synchronize()
            elif api.is_gpu_api_opencl():
                # OpenCL has no device-side struct upload here, so the whole
                # flattened geometry is passed argument-by-argument.
                self.gpu_funcs.propagate(
                    comqueue, (photons_this_round, 1, 1), None,
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    input_queue_gpu.data,
                    output_queue_gpu.data,
                    rng_states.data,
                    self.pos.data,
                    self.dir.data,
                    self.wavelengths.data,
                    self.pol.data,
                    self.t.data,
                    self.flags.data,
                    self.last_hit_triangles.data,
                    self.weights.data,
                    np.int32(nsteps),
                    np.int32(iuse_weights),
                    np.int32(scatter_first),
                    gpu_geometry.world_scale,
                    gpu_geometry.world_origin.data,
                    np.int32(len(gpu_geometry.nodes)),
                    gpu_geometry.material_data['n'],
                    gpu_geometry.material_data['step'],
                    gpu_geometry.material_data["wavelength0"],
                    gpu_geometry.vertices.data,
                    gpu_geometry.triangles.data,
                    gpu_geometry.material_codes.data,
                    gpu_geometry.colors.data,
                    gpu_geometry.nodes.data,
                    gpu_geometry.extra_nodes.data,
                    gpu_geometry.material_data["nmaterials"],
                    gpu_geometry.material_data['refractive_index'].data,
                    gpu_geometry.material_data['absorption_length'].data,
                    gpu_geometry.material_data['scattering_length'].data,
                    gpu_geometry.material_data['reemission_prob'].data,
                    gpu_geometry.material_data['reemission_cdf'].data,
                    gpu_geometry.surface_data['nsurfaces'],
                    gpu_geometry.surface_data['detect'].data,
                    gpu_geometry.surface_data['absorb'].data,
                    gpu_geometry.surface_data['reemit'].data,
                    gpu_geometry.surface_data['reflect_diffuse'].data,
                    gpu_geometry.surface_data['reflect_specular'].data,
                    gpu_geometry.surface_data['eta'].data,
                    gpu_geometry.surface_data['k'].data,
                    gpu_geometry.surface_data['reemission_cdf'].data,
                    gpu_geometry.surface_data['model'].data,
                    gpu_geometry.surface_data['transmissive'].data,
                    gpu_geometry.surface_data['thickness'].data,
                    gpu_geometry.surface_data['nplanes'].data,
                    gpu_geometry.surface_data['wire_diameter'].data,
                    gpu_geometry.surface_data['wire_pitch'].data,
                    g_times_l=True).wait()
            end_chunk = time.time()
            chunk_time = end_chunk - start_chunk
            #print "chunk time: ",chunk_time
            #if chunk_time>2.5:
            #    adapt_factor *= 0.5
        step += nsteps
        scatter_first = 0  # Only allow non-zero in first pass
        end_step = time.time()
        #print "step time: ",end_step-start_step
        if step < max_steps:
            # swap queues: photons still alive (listed in the output queue)
            # become next step's input; reset the output counter to 1
            start_requeue = time.time()
            #print "reset photon queues"
            if api.is_gpu_api_cuda():
                cuda.Context.get_current().synchronize(
                )  # ensure all threads done
                #temp = input_queue_gpu
                #input_queue_gpu = output_queue_gpu
                #output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                #nphotons = input_queue_gpu[:1].get()[0] - 1
                # new style
                output_queue_gpu.get(output_queue)
                nphotons = output_queue[0] - 1
                input_queue_gpu.set(output_queue)
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
            elif api.is_gpu_api_opencl():
                temp_out = output_queue_gpu.get()
                nphotons = temp_out[0]
                input_queue_gpu.set(
                    temp_out[1:], queue=comqueue
                )  # set the input queue to have index of photons still need to be run
                output_queue_gpu[:1].set(
                    np.ones(shape=1, dtype=np.uint32),
                    queue=comqueue)  # reset first instance to be one
            end_requeue = time.time()
            #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
            if nphotons == 0:
                break

    end_prop = time.time()
    print "propagation time: ", end_prop - start_prop, " secs"
    end_flags = self.flags.get()
    end_flag = np.max(end_flags)
    # bit 31 is the kernel's abort flag
    if end_flag & (1 << 31):
        print >> sys.stderr, "WARNING: ABORTED PHOTONS"
    if api.is_gpu_api_cuda():
        cuda.Context.get_current().synchronize()
    elif api.is_gpu_api_opencl():
        cl.enqueue_barrier(comqueue)
def propagate_hit(self, gpu_geometry, rng_states, parameters):
    """Propagate photons on GPU to termination or max_steps, whichever
    comes first.

    May be called repeatedly without reloading photon information if
    single-stepping through photon history.

    Returns a dict of run statistics (photon counts, pass/abort/launch
    counts, kernel timings) with a 'COLUMNS' key describing the fields.

    ..warning::
        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
        number of curandStates.

    got one abort::

        In [1]: a = ph("hhMOCK")
        In [9]: f = a[:,3,2].view(np.uint32)
        In [12]: np.where( f & 1<<31 )
        Out[12]: (array([279]),)

    failed to just mock that one::

        RANGE=279:280 MockNuWa MOCK
    """
    nphotons = self.pos.size
    nwork = nphotons
    # launch configuration comes from the parameters dict
    nthreads_per_block = parameters['threads_per_block']
    max_blocks = parameters['max_blocks']
    max_steps = parameters['max_steps']
    use_weights = False
    scatter_first = 0
    self.upload_queues(nwork)
    solid_id_map_gpu = gpu_geometry.solid_id_map
    solid_id_to_channel_id_gpu = gpu_geometry.solid_id_to_channel_id_gpu
    # below this many live photons, finish all remaining steps in one launch
    small_remainder = nthreads_per_block * 16 * 8
    block = (nthreads_per_block, 1, 1)
    results = {}
    results['name'] = "propagate_hit"
    results['nphotons'] = nphotons
    results['nwork'] = nwork
    results['nsmall'] = small_remainder
    results['COLUMNS'] = "name:s,nphotons:i,nwork:i,nsmall:i"
    step = 0
    times = []
    npass = 0
    nabort = 0
    while step < max_steps:
        npass += 1
        if nwork < small_remainder or use_weights:
            nsteps = max_steps - step  # Just finish the rest of the steps if the # of photons is low
            log.debug(
                "increase nsteps for stragglers: small_remainder %s nwork %s nsteps %s max_steps %s "
                % (small_remainder, nwork, nsteps, max_steps))
        else:
            nsteps = 1
        pass
        log.info("nphotons %s nwork %s step %s max_steps %s nsteps %s " %
                 (nphotons, nwork, step, max_steps, nsteps))
        abort = False
        for first_photon, photons_this_round, blocks in chunk_iterator(
                nwork, nthreads_per_block, max_blocks):
            if abort:
                # once one launch exceeds max_time, count (but skip) the
                # remaining chunks of this step
                nabort += 1
            else:
                grid = (blocks, 1)
                args = (
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.input_queue_gpu[1:].gpudata,
                    self.output_queue_gpu.gpudata,
                    rng_states,
                    self.pos.gpudata,
                    self.dir.gpudata,
                    self.wavelengths.gpudata,
                    self.pol.gpudata,
                    self.t.gpudata,
                    self.flags.gpudata,
                    self.last_hit_triangles.gpudata,
                    self.weights.gpudata,
                    np.int32(nsteps),
                    np.int32(use_weights),
                    np.int32(scatter_first),
                    gpu_geometry.gpudata,
                    solid_id_map_gpu.gpudata,
                    solid_id_to_channel_id_gpu.gpudata,
                )
                log.info(
                    "propagate_hit_kernel.prepared_timed_call grid %s block %s first_photon %s photons_this_round %s "
                    % (repr(grid), repr(block), first_photon,
                       photons_this_round))
                get_time = self.propagate_hit_kernel.prepared_timed_call(
                    grid, block, *args)
                t = get_time()
                times.append(t)
                if t > self.max_time:
                    abort = True
                    log.warn(
                        "kernel launch time %s > max_time %s : ABORTING " %
                        (t, self.max_time))
                pass
            pass
        pass
        log.info("step %s propagate_hit_kernel times %s " %
                 (step, repr(times)))
        pass
        step += nsteps
        scatter_first = 0  # Only allow non-zero in first pass
        if step < max_steps:
            nwork = self.swap_queues()
        pass
    pass
    log.info("calling max ")
    # bit 31 of the history word is the kernel's abort flag
    if ga.max(self.flags).get() & (1 << 31):
        log.warn("ABORTED PHOTONS")
    log.info("done calling max ")
    cuda.Context.get_current().synchronize()
    results['npass'] = npass
    results['nabort'] = nabort
    results['nlaunch'] = len(times)
    results['tottime'] = sum(times)
    results['maxtime'] = max(times)
    results['mintime'] = min(times)
    results[
        'COLUMNS'] += ",npass:i,nabort:i,nlaunch:i,tottime:f,maxtime:f,mintime:f"
    return results