def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merge ``nodes`` into ``len(first_child)`` parent nodes on the GPU.

    ``first_child`` gives, for each parent, the index of its first child and
    ``nchild`` gives how many children it has.  Both arrays are converted to
    int32 before being uploaded.  The parent nodes are copied back to the
    host and returned.
    '''
    funcs = GPUFuncs(get_cu_module('bvh.cu', options=cuda_options,
                                   include_source_directory=True))

    nodes_dev = ga.to_gpu(nodes)
    first_child_dev = ga.to_gpu(first_child.astype(np.int32))
    nchild_dev = ga.to_gpu(nchild.astype(np.int32))

    parent_count = len(first_child)
    parents_dev = ga.empty(shape=parent_count, dtype=ga.vec.uint4)

    threads = 256
    # chunk_iterator is asked to keep each launch at or below 10000 blocks.
    chunks = chunk_iterator(parent_count, threads, max_blocks=10000)
    for offset, count, nblocks in chunks:
        funcs.make_parents_detailed(np.uint32(offset),
                                    np.uint32(count),
                                    nodes_dev,
                                    parents_dev,
                                    first_child_dev,
                                    nchild_dev,
                                    block=(threads, 1, 1),
                                    grid=(nblocks, 1))

    return parents_dev.get()
def __init__(self, photons, ncopies=1, max_time=4.):
    """Load ``photons`` onto the GPU, replicating as requested.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons on the GPU.
            This is used if you want to propagate the same event
            many times, for example in a likelihood calculation.

            The amount of GPU storage will be proportionally
            larger if ncopies > 1, so be careful.
        - max_time: float, *optional*
            Stored as ``self.max_time``; this constructor does not use
            it otherwise.
    """
    hit_module = get_cu_module('propagate_hit.cu', options=cuda_options)

    # Keep a prepared handle to the 'propagate_hit' kernel next to the
    # generic GPUFuncs wrapper around the same module.
    kernel = hit_module.get_function('propagate_hit')
    kernel.prepare('iiPPPPPPPPPPPiiiPPP')
    self.propagate_hit_kernel = kernel
    self.gpu_funcs = GPUFuncs(hit_module)

    self.max_time = max_time
    self.ncopies = ncopies
    self.true_nphotons = len(photons)

    # The actual upload/replication is delegated to marshall_photons().
    self.marshall_photons(photons, ncopies)
def concatenate_layers(layers):
    '''Copy every layer of nodes into one contiguous GPU array.

    ``layers`` is a sequence of node arrays.  Each layer is copied into the
    output by the ``copy_and_offset`` kernel together with a child offset:
    0 for the final layer (leaf nodes need no offset), and for every other
    layer the index at which the following layer starts in the output.

    Returns ``(nodes, layer_bounds)`` where ``nodes`` is the concatenated
    array fetched back to the host and ``layer_bounds`` holds the
    cumulative layer sizes with a leading 0 (``len(layers) + 1`` entries).
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list.  A list comprehension is used instead of
    # map() so np.cumsum always receives a real sequence, not an iterator.
    layer_sizes = [len(layer) for layer in layers]
    layer_bounds = np.insert(np.cumsum(layer_sizes), 0, 0)
    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:],
                                             layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end - layer_start, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter, 1))

    return nodes.get(), layer_bounds
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Create ``len(first_child)`` parent nodes from ``nodes`` on the GPU.

    For parent ``i``, ``first_child[i]`` is the index of its first child
    and ``nchild[i]`` is its number of children; both arrays are uploaded
    as int32.  Returns the parent nodes copied back to the host.
    '''
    module = get_cu_module('bvh.cu', options=cuda_options,
                           include_source_directory=True)
    kernels = GPUFuncs(module)

    d_nodes = ga.to_gpu(nodes)
    d_first_child = ga.to_gpu(first_child.astype(np.int32))
    d_nchild = ga.to_gpu(nchild.astype(np.int32))

    nparent = len(first_child)
    d_parents = ga.empty(shape=nparent, dtype=ga.vec.uint4)

    block_dim = (256, 1, 1)
    for first, count, nblocks in chunk_iterator(nparent, block_dim[0],
                                                max_blocks=10000):
        kernels.make_parents_detailed(
            np.uint32(first), np.uint32(count),
            d_nodes, d_parents, d_first_child, d_nchild,
            block=block_dim, grid=(nblocks, 1))

    return d_parents.get()
def concatenate_layers(layers):
    '''Concatenate a sequence of node layers into a single GPU node array.

    Every layer is copied by the ``copy_and_offset`` kernel, which also
    receives a child offset: 0 for the last layer (leaf nodes need no
    offset) and, for any other layer, the position where the next layer
    begins in the concatenated array.

    Returns a tuple ``(nodes, layer_bounds)``: the concatenated nodes as a
    host array, and the cumulative layer sizes prefixed with 0.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list.  The sizes are materialised as a list
    # (rather than passed as a map object) so np.cumsum gets a sequence on
    # every Python version.
    sizes = [len(layer) for layer in layers]
    layer_bounds = np.insert(np.cumsum(sizes), 0, 0)
    total_nodes = layer_bounds[-1]
    nodes = ga.empty(shape=int(total_nodes), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    layer_spans = zip(layer_bounds[:-1], layer_bounds[1:], layers)
    for layer_start, layer_end, layer in layer_spans:
        # leaf nodes need no offset
        child_offset = 0 if layer_end == total_nodes else layer_end

        chunks = chunk_iterator(layer_end - layer_start, nthreads_per_block,
                                max_blocks=10000)
        for first_index, elements_this_iter, nblocks_this_iter in chunks:
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter, 1))

    return nodes.get(), layer_bounds
class GPURays(object):
    """Holds arrays of ray positions and directions on the GPU that are
    used to render a geometry."""

    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        """Upload ``pos`` and ``dir`` (converted with to_float3) and
        allocate the per-ray work buffers.

        ``nblocks`` is used as the thread-block size of every kernel launch
        made by this class.
        """
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth
        self.nblocks = nblocks

        self.transform_funcs = GPUFuncs(
            get_cu_module('transform.cu', options=cuda_options))
        self.render_funcs = GPUFuncs(
            get_cu_module('render.cu', options=cuda_options))

        # max_alpha_depth entries of dx/color per ray, one dxlen per ray.
        nrays = self.pos.size
        self.dx = ga.empty(max_alpha_depth*nrays, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(nrays, dtype=np.uint32)

    def rotate(self, phi, n):
        "Rotate by an angle phi around the axis `n`."
        # Positions first, then directions, with the same angle and axis.
        for vectors in (self.pos, self.dir):
            self.transform_funcs.rotate(
                np.int32(vectors.size), vectors,
                np.float32(phi), ga.vec.make_float3(*n),
                block=(self.nblocks, 1, 1),
                grid=(vectors.size//self.nblocks + 1, 1))

    def rotate_around_point(self, phi, n, point):
        """Rotate by an angle phi around the axis `n` passing through
        the point `point`."""
        block_dim = (self.nblocks, 1, 1)

        # Positions rotate about the axis through `point` ...
        self.transform_funcs.rotate_around_point(
            np.int32(self.pos.size), self.pos,
            np.float32(phi), ga.vec.make_float3(*n),
            ga.vec.make_float3(*point),
            block=block_dim,
            grid=(self.pos.size//self.nblocks + 1, 1))

        # ... while directions use the plain rotate kernel.
        self.transform_funcs.rotate(
            np.int32(self.dir.size), self.dir,
            np.float32(phi), ga.vec.make_float3(*n),
            block=block_dim,
            grid=(self.dir.size//self.nblocks + 1, 1))

    def translate(self, v):
        "Translate the ray positions by the vector `v`."
        nrays = self.pos.size
        self.transform_funcs.translate(
            np.int32(nrays), self.pos, ga.vec.make_float3(*v),
            block=(self.nblocks, 1, 1),
            grid=(nrays//self.nblocks + 1, 1))

    def render(self, gpu_geometry, pixels, alpha_depth=10,
               keep_last_render=False):
        """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
        colors.

        Unless `keep_last_render` is true, `self.dxlen` is zeroed first.
        Raises Exception if `alpha_depth` exceeds `self.max_alpha_depth`,
        TypeError if `pixels` is not a GPUArray, and ValueError if its size
        differs from the number of rays.
        """
        if not keep_last_render:
            self.dxlen.fill(0)

        if alpha_depth > self.max_alpha_depth:
            raise Exception('alpha_depth > max_alpha_depth')

        if not isinstance(pixels, ga.GPUArray):
            raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)

        nrays = self.pos.size
        if pixels.size != nrays:
            raise ValueError('`pixels`.size != number of rays')

        self.render_funcs.render(
            np.int32(nrays), self.pos, self.dir, gpu_geometry.gpudata,
            np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color,
            block=(self.nblocks, 1, 1),
            grid=(nrays//self.nblocks + 1, 1))

    def snapshot(self, gpu_geometry, alpha_depth=10):
        "Render `gpu_geometry` and return a numpy array of pixel colors."
        pixels = ga.empty(self.pos.size, dtype=np.uint32)
        self.render(gpu_geometry, pixels, alpha_depth)
        return pixels.get()
def __init__(self, gpu_detector, ndaq=1):
    """Allocate per-channel GPU buffers for ``ndaq`` copies of the detector.

    Every buffer has ``gpu_detector.nchannels * ndaq`` entries;
    ``self.stride`` records ``gpu_detector.nchannels``.
    """
    nslots = gpu_detector.nchannels*ndaq

    self.earliest_time_gpu = ga.empty(nslots, dtype=np.float32)
    self.earliest_time_int_gpu = ga.empty(nslots, dtype=np.uint32)

    # The remaining buffers are sized after earliest_time_int_gpu and
    # start out zeroed.
    int_template = self.earliest_time_int_gpu
    self.channel_history_gpu = ga.zeros_like(int_template)
    self.channel_q_int_gpu = ga.zeros_like(int_template)
    self.channel_q_gpu = ga.zeros(len(int_template), dtype=np.float32)

    # References to detector data already held by gpu_detector.
    self.detector_gpu = gpu_detector.detector_gpu
    self.solid_id_map_gpu = gpu_detector.solid_id_map
    self.solid_id_to_channel_index_gpu = \
        gpu_detector.solid_id_to_channel_index_gpu

    self.module = get_cu_module('daq.cu', options=cuda_options,
                                include_source_directory=True)
    self.gpu_funcs = GPUFuncs(self.module)

    self.ndaq = ndaq
    self.stride = gpu_detector.nchannels
def area_sort_nodes(gpu_geometry, layer_bounds):
    '''Run the ``area_sort_child`` kernel over each non-final layer.

    ``layer_bounds`` holds the start index of each layer followed by the
    total node count.  Consecutive pairs give each layer's (start, end)
    range; the final layer is skipped and the rest are visited from the
    last one back to the first.

    Returns ``gpu_geometry.nodes`` fetched back to the host.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # list() makes the zip result sliceable on every Python version; the
    # trailing [:-1] drops the final layer.
    bounds = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]
    nthreads_per_block = 256

    for start, end in reversed(bounds):
        bvh_funcs.area_sort_child(np.uint32(start),
                                  np.uint32(end),
                                  gpu_geometry,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(120, 1))

    return gpu_geometry.nodes.get()
def area_sort_nodes(gpu_geometry, layer_bounds):
    '''Apply the ``area_sort_child`` kernel to every layer but the last.

    Each layer's (start, end) range comes from consecutive entries of
    ``layer_bounds``.  The final range is skipped and the remaining ones
    are processed in reverse order.

    Returns the host copy of ``gpu_geometry.nodes`` after the kernels ran.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Materialise the pairs explicitly: a bare zip object cannot be sliced
    # or reversed in place on Python 3.
    layer_ranges = list(zip(layer_bounds[:-1], layer_bounds[1:]))
    non_final_ranges = layer_ranges[:-1]
    nthreads_per_block = 256

    for start, end in reversed(non_final_ranges):
        bvh_funcs.area_sort_child(np.uint32(start),
                                  np.uint32(end),
                                  gpu_geometry,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(120, 1))

    return gpu_geometry.nodes.get()
def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
    """Upload ray positions and directions and allocate render buffers.

    ``pos`` and ``dir`` are converted with to_float3 before upload.
    ``max_alpha_depth`` sets how many dx/color entries are allocated per
    ray; ``nblocks`` is stored for later kernel launches.
    """
    self.pos = ga.to_gpu(to_float3(pos))
    self.dir = ga.to_gpu(to_float3(dir))

    self.max_alpha_depth = max_alpha_depth
    self.nblocks = nblocks

    self.transform_funcs = GPUFuncs(
        get_cu_module('transform.cu', options=cuda_options))
    self.render_funcs = GPUFuncs(
        get_cu_module('render.cu', options=cuda_options))

    nrays = self.pos.size
    self.dx = ga.empty(max_alpha_depth * nrays, dtype=np.float32)
    self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
    self.dxlen = ga.zeros(nrays, dtype=np.uint32)
def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
    """Put the rays on the GPU and set up the buffers used for rendering.

    ``pos``/``dir`` go through to_float3 and are uploaded.  ``self.dx`` and
    ``self.color`` get ``max_alpha_depth`` entries per ray; ``self.dxlen``
    gets one zero-initialised counter per ray.
    """
    ray_positions = to_float3(pos)
    self.pos = ga.to_gpu(ray_positions)
    ray_directions = to_float3(dir)
    self.dir = ga.to_gpu(ray_directions)

    self.max_alpha_depth = max_alpha_depth
    self.nblocks = nblocks

    module = get_cu_module('transform.cu', options=cuda_options)
    self.transform_funcs = GPUFuncs(module)
    module = get_cu_module('render.cu', options=cuda_options)
    self.render_funcs = GPUFuncs(module)

    ray_count = self.pos.size
    self.dx = ga.empty(max_alpha_depth*ray_count, dtype=np.float32)
    self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
    self.dxlen = ga.zeros(ray_count, dtype=np.uint32)
def collapse_chains(nodes, layer_bounds):
    '''Run the ``collapse_child`` kernel over each non-final layer.

    ``nodes`` is uploaded to the GPU.  Consecutive entries of
    ``layer_bounds`` give each layer's (start, end) range; the final layer
    is skipped and the others are processed from the last one back to the
    first.

    Returns the modified nodes copied back to the host.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    gpu_nodes = ga.to_gpu(nodes)

    # list() makes the zip result sliceable on every Python version; the
    # trailing [:-1] drops the final layer.
    bounds = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]
    nthreads_per_block = 256

    for start, end in reversed(bounds):
        bvh_funcs.collapse_child(np.uint32(start),
                                 np.uint32(end),
                                 gpu_nodes,
                                 block=(nthreads_per_block, 1, 1),
                                 grid=(120, 1))

    return gpu_nodes.get()
def collapse_chains(nodes, layer_bounds):
    '''Apply the ``collapse_child`` kernel to every layer but the last.

    A copy of ``nodes`` is placed on the GPU, the kernel is launched once
    per (start, end) layer range taken from ``layer_bounds`` (final range
    excluded, remaining ranges in reverse order), and the result is
    fetched back and returned.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    gpu_nodes = ga.to_gpu(nodes)

    # Materialise the pairs explicitly: a bare zip object cannot be sliced
    # or reversed in place on Python 3.
    layer_ranges = list(zip(layer_bounds[:-1], layer_bounds[1:]))
    non_final_ranges = layer_ranges[:-1]
    nthreads_per_block = 256

    for start, end in reversed(non_final_ranges):
        bvh_funcs.collapse_child(np.uint32(start),
                                 np.uint32(end),
                                 gpu_nodes,
                                 block=(nthreads_per_block, 1, 1),
                                 grid=(120, 1))

    return gpu_nodes.get()
def __init__(self, photons, ncopies=1):
    """Load ``photons`` onto the GPU, replicating as requested.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons on the GPU.
            This is used if you want to propagate the same event
            many times, for example in a likelihood calculation.

            The amount of GPU storage will be proportionally
            larger if ncopies > 1, so be careful.
    """
    nphotons = len(photons)
    total = nphotons*ncopies

    # One slot per photon per copy in every state array.
    self.pos = ga.empty(shape=total, dtype=ga.vec.float3)
    self.dir = ga.empty(shape=total, dtype=ga.vec.float3)
    self.pol = ga.empty(shape=total, dtype=ga.vec.float3)
    self.wavelengths = ga.empty(shape=total, dtype=np.float32)
    self.t = ga.empty(shape=total, dtype=np.float32)
    self.last_hit_triangles = ga.empty(shape=total, dtype=np.int32)
    self.flags = ga.empty(shape=total, dtype=np.uint32)
    self.weights = ga.empty(shape=total, dtype=np.float32)

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1)
    head = slice(None, nphotons)
    self.pos[head].set(to_float3(photons.pos))
    self.dir[head].set(to_float3(photons.dir))
    self.pol[head].set(to_float3(photons.pol))
    self.wavelengths[head].set(photons.wavelengths.astype(np.float32))
    self.t[head].set(photons.t.astype(np.float32))
    self.last_hit_triangles[head].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[head].set(photons.flags.astype(np.uint32))
    self.weights[head].set(photons.weights.astype(np.float32))

    self.gpu_funcs = GPUFuncs(
        get_cu_module('propagate.cu', options=cuda_options))

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        chunks = chunk_iterator(nphotons, nthreads_per_block, max_blocks)
        for first_photon, photons_this_round, blocks in chunks:
            self.gpu_funcs.photon_duplicate(
                np.int32(first_photon), np.int32(photons_this_round),
                self.pos, self.dir, self.wavelengths, self.pol, self.t,
                self.flags, self.last_hit_triangles, self.weights,
                np.int32(ncopies-1),
                np.int32(nphotons),
                block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
             flags, weights):
    '''Wrap slices of the GPUArrays of a GPUPhotons instance.

    NOTE THESE ARE NOT CPU ARRAYS!  The arguments are stored as given,
    without copying.  The resulting object reports ``len(pos)`` photons and
    a single copy.
    '''
    (self.pos, self.dir, self.pol) = (pos, dir, pol)
    (self.wavelengths, self.t) = (wavelengths, t)
    (self.last_hit_triangles, self.flags, self.weights) = \
        (last_hit_triangles, flags, weights)

    self.gpu_funcs = GPUFuncs(
        get_cu_module('propagate.cu', options=cuda_options))

    self.true_nphotons = len(pos)
    self.ncopies = 1
class GPUPDF(object):
    """Accumulates per-channel hit histograms and single-point PDF
    estimates on the GPU, using the kernels in ``pdf.cu``."""

    def __init__(self):
        self.module = get_cu_module('pdf.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
        """Setup GPU arrays to hold PDF information.

           nchannels: int, number of channels
           tbins: number of time bins
           trange: tuple of (min, max) time in PDF
           qbins: number of charge bins
           qrange: tuple of (min, max) charge in PDF
        """
        self.events_in_histogram = 0
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.pdf_gpu = ga.zeros(shape=(nchannels, tbins, qbins),
                                dtype=np.uint32)
        self.tbins = tbins
        self.trange = trange
        self.qbins = qbins
        self.qrange = qrange

    def clear_pdf(self):
        """Rezero the PDF counters."""
        self.hitcount_gpu.fill(0)
        self.pdf_gpu.fill(0)

    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
        """Bin the hits in ``gpuchannels`` (its ``q`` and ``t`` arrays) into
        the histogram and count one more accumulated event."""
        self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
                                gpuchannels.q,
                                gpuchannels.t,
                                self.hitcount_gpu,
                                np.int32(self.tbins),
                                np.float32(self.trange[0]),
                                np.float32(self.trange[1]),
                                np.int32(self.qbins),
                                np.float32(self.qrange[0]),
                                np.float32(self.qrange[1]),
                                self.pdf_gpu,
                                block=(nthreads_per_block, 1, 1),
                                grid=(len(gpuchannels.t)//nthreads_per_block+1, 1))

        self.events_in_histogram += 1

    def get_pdfs(self):
        """Returns the 1D hitcount array and the 3D [channel, time, charge]
        histogram."""
        return self.hitcount_gpu.get(), self.pdf_gpu.get()

    def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
                       trange, min_qwidth, qrange, min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.

            min_twidth: float
              Minimum bin size in the time dimension
            trange: (float, float)
              Range of time dimension in PDF
            min_qwidth: float
              Minimum bin size in charge dimension
            qrange: (float, float)
              Range of charge dimension in PDF
            min_bin_content: int
              The bin will be expanded to include at least this many events
            time_only: bool
              If True, only the time observable will be used in the PDF.
              Only True is accepted at present (AssertionError otherwise).
        """
        self.event_nhit = count_nonzero(event_hit)

        # Define a mapping from an array of len(event_hit) to an array of
        # length event_nhit
        self.map_hit_offset_to_channel_id = \
            np.where(event_hit)[0].astype(np.uint32)
        self.map_hit_offset_to_channel_id_gpu = \
            ga.to_gpu(self.map_hit_offset_to_channel_id)
        self.map_channel_id_to_hit_offset = \
            np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
        self.map_channel_id_to_hit_offset_gpu = \
            ga.to_gpu(self.map_channel_id_to_hit_offset)

        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        # min_bin_content slots per hit channel; 1e9 marks an unused slot.
        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content,
                                       dtype=np.float32)
        self.nearest_mc_gpu.fill(1e9)

        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content

        assert time_only  # Only support time right now
        self.time_only = time_only

    def clear_pdf_eval(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.eval_hitcount_gpu.fill(0)
        self.eval_bincount_gpu.fill(0)
        self.nearest_mc_gpu.fill(1e9)

    @profile_if_possible
    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64,
                            max_blocks=10000):
        """Add the most recent results of run_daq() to the PDF evaluation.

        Launches ``accumulate_bincount`` followed by
        ``accumulate_nearest_neighbor_block``, synchronising the CUDA
        context after each.  ``max_blocks`` is accepted but not used here.
        """
        # Freshly allocated on every call and filled with 1 before the
        # kernels run.
        self.work_queues = ga.empty(
            shape=self.event_nhit * (gpuchannels.ndaq+1), dtype=np.uint32)
        self.work_queues.fill(1)

        self.gpu_funcs.accumulate_bincount(
            np.int32(self.event_hit_gpu.size),
            np.int32(gpuchannels.ndaq),
            self.event_hit_gpu,
            self.event_time_gpu,
            gpuchannels.t,
            self.eval_hitcount_gpu,
            self.eval_bincount_gpu,
            np.float32(self.min_twidth),
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.int32(self.min_bin_content),
            self.map_channel_id_to_hit_offset_gpu,
            self.work_queues,
            block=(nthreads_per_block, 1, 1),
            grid=(self.event_hit_gpu.size//nthreads_per_block+1, 1))
        cuda.Context.get_current().synchronize()

        # One thread block per hit channel.
        self.gpu_funcs.accumulate_nearest_neighbor_block(
            np.int32(self.event_nhit),
            np.int32(gpuchannels.ndaq),
            self.map_hit_offset_to_channel_id_gpu,
            self.work_queues,
            self.event_time_gpu,
            gpuchannels.t,
            self.nearest_mc_gpu,
            np.int32(self.min_bin_content),
            block=(nthreads_per_block, 1, 1),
            grid=(self.event_nhit, 1))
        cuda.Context.get_current().synchronize()

    def get_pdf_eval(self):
        """Return ``(hitcount, pdf_value, pdf_uncertainty)`` per channel.

        Channels whose bin count reached ``min_bin_content`` use the bin
        count; hit channels below that threshold with at least one MC hit
        use the stored nearest-MC values; all other channels stay at zero.

        Raises NotImplementedError if ``self.time_only`` is false and any
        channel needs a value (the 2D time/charge mode is unimplemented).
        """
        evhit = self.event_hit_gpu.get().astype(bool)
        hitcount = self.eval_hitcount_gpu.get()
        bincount = self.eval_bincount_gpu.get()

        pdf_value = np.zeros(len(hitcount), dtype=float)
        pdf_frac_uncert = np.zeros_like(pdf_value)

        # PDF value for high stats bins
        high_stats = (bincount >= self.min_bin_content)
        if high_stats.any():
            if self.time_only:
                pdf_value[high_stats] = (bincount[high_stats].astype(float)
                                         / hitcount[high_stats]
                                         / self.min_twidth)
            else:
                # Was `assert Exception(...)`, which can never fail.
                raise NotImplementedError(
                    'Unimplemented 2D (time,charge) mode!')
            pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats])

        # PDF value for low stats bins
        low_stats = ~high_stats & (hitcount > 0) & evhit

        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape(
            (self.event_nhit, self.min_bin_content))
        # Expand from one row per hit channel to one row per channel;
        # rows of channels that were not hit keep the 1e9 filler.
        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content),
                              dtype=np.float32)
        nearest_mc.fill(1e9)
        nearest_mc[self.map_hit_offset_to_channel_id, :] = nearest_mc_by_hit

        # Deal with the case where we did not even get min_bin_content events
        # in the PDF but also clamp the lower range to ensure we don't index
        # by a negative number in 2 lines
        last_valid_entry = np.maximum(
            0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
        distance = nearest_mc[np.arange(len(last_valid_entry)),
                              last_valid_entry]
        if low_stats.any():
            if self.time_only:
                pdf_value[low_stats] = (
                    (last_valid_entry[low_stats] + 1).astype(float)
                    / hitcount[low_stats] / distance[low_stats] / 2.0)
            else:
                # Was `assert Exception(...)`, which can never fail.
                raise NotImplementedError(
                    'Unimplemented 2D (time,charge) mode!')
            pdf_frac_uncert[low_stats] = \
                1.0/np.sqrt(last_valid_entry[low_stats] + 1)

        # PDFs with no stats got zero by default during array creation

        # Single-argument print: same output as the old print statement on
        # Python 2, and valid syntax on Python 3.
        print('high_stats: %s low_stats %s'
              % (high_stats.sum(), low_stats.sum()))

        return hitcount, pdf_value, pdf_value * pdf_frac_uncert
class GPUDaq(object):
    """Drives the ``daq.cu`` kernels: reset the channel buffers, accumulate
    photon hits into them, and convert the results into a GPUChannels."""

    def __init__(self, gpu_detector, ndaq=1):
        """Allocate ``gpu_detector.nchannels * ndaq`` entries per buffer."""
        nslots = gpu_detector.nchannels*ndaq
        self.earliest_time_gpu = ga.empty(nslots, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(nslots, dtype=np.uint32)
        self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu),
                                      dtype=np.float32)

        self.detector_gpu = gpu_detector.detector_gpu
        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = \
            gpu_detector.solid_id_to_channel_index_gpu

        self.module = get_cu_module('daq.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels

    def begin_acquire(self, nthreads_per_block=64):
        """Reset the earliest-time buffer (via the kernel, with 1e9) and zero
        the charge and history buffers."""
        nslots = len(self.earliest_time_int_gpu)
        self.gpu_funcs.reset_earliest_time_int(
            np.float32(1e9), np.int32(nslots), self.earliest_time_int_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(nslots//nthreads_per_block+1, 1))

        for buf in (self.channel_q_int_gpu,
                    self.channel_q_gpu,
                    self.channel_history_gpu):
            buf.fill(0)

    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64,
                max_blocks=1024, start_photon=None, nphotons=None,
                weight=1.0):
        """Feed photons ``[start_photon, start_photon + nphotons)`` of
        ``gpuphotons`` to the DAQ kernel, then synchronise.

        ``start_photon`` defaults to 0 and ``nphotons`` to everything from
        ``start_photon`` to the end.  ``run_daq`` is used when
        ``self.ndaq == 1``, ``run_daq_many`` otherwise.
        """
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        block_dim = (nthreads_per_block, 1, 1)

        if self.ndaq != 1:
            # Multi-DAQ path: chunked with 1 element per block.
            # NOTE(review): presumably one block per photon - confirm
            # against run_daq_many in daq.cu.
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                self.gpu_funcs.run_daq_many(
                    rng_states, np.uint32(0x1 << 2),
                    np.int32(start_photon+first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t, gpuphotons.flags,
                    gpuphotons.last_hit_triangles, gpuphotons.weights,
                    self.solid_id_map_gpu, self.detector_gpu,
                    self.earliest_time_int_gpu, self.channel_q_int_gpu,
                    self.channel_history_gpu,
                    np.int32(self.ndaq), np.int32(self.stride),
                    np.float32(weight),
                    block=block_dim, grid=(blocks, 1))
        else:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.run_daq(
                    rng_states, np.uint32(0x1 << 2),
                    np.int32(start_photon+first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t, gpuphotons.flags,
                    gpuphotons.last_hit_triangles, gpuphotons.weights,
                    self.solid_id_map_gpu, self.detector_gpu,
                    self.earliest_time_int_gpu, self.channel_q_int_gpu,
                    self.channel_history_gpu,
                    np.float32(weight),
                    block=block_dim, grid=(blocks, 1))

        cuda.Context.get_current().synchronize()

    def end_acquire(self, nthreads_per_block=64):
        """Convert the integer time and charge buffers to float and return
        them wrapped in a GPUChannels."""
        block_dim = (nthreads_per_block, 1, 1)

        ntimes = len(self.earliest_time_int_gpu)
        self.gpu_funcs.convert_sortable_int_to_float(
            np.int32(ntimes),
            self.earliest_time_int_gpu,
            self.earliest_time_gpu,
            block=block_dim,
            grid=(ntimes//nthreads_per_block+1, 1))

        ncharges = len(self.channel_q_int_gpu)
        self.gpu_funcs.convert_charge_int_to_float(
            self.detector_gpu,
            self.channel_q_int_gpu,
            self.channel_q_gpu,
            block=block_dim,
            grid=(ncharges//nthreads_per_block+1, 1))

        cuda.Context.get_current().synchronize()

        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)
class GPURays(object):
    """GPU-resident ray positions and directions used to render a
    geometry."""

    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        """Upload the rays and allocate the buffers used by render().

        ``nblocks`` is the thread-block size passed to every kernel launch.
        """
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth
        self.nblocks = nblocks

        module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(module)
        module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(module)

        ray_count = self.pos.size
        self.dx = ga.empty(max_alpha_depth * ray_count, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(ray_count, dtype=np.uint32)

    def rotate(self, phi, n):
        "Rotate by an angle phi around the axis `n`."
        block = (self.nblocks, 1, 1)

        pos_grid = (self.pos.size // self.nblocks + 1, 1)
        self.transform_funcs.rotate(np.int32(self.pos.size), self.pos,
                                    np.float32(phi), ga.vec.make_float3(*n),
                                    block=block, grid=pos_grid)

        dir_grid = (self.dir.size // self.nblocks + 1, 1)
        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir,
                                    np.float32(phi), ga.vec.make_float3(*n),
                                    block=block, grid=dir_grid)

    def rotate_around_point(self, phi, n, point):
        """Rotate by an angle phi around the axis `n` passing through
        the point `point`."""
        block = (self.nblocks, 1, 1)

        # Positions are rotated about the axis through `point`.
        pos_grid = (self.pos.size // self.nblocks + 1, 1)
        self.transform_funcs.rotate_around_point(
            np.int32(self.pos.size), self.pos, np.float32(phi),
            ga.vec.make_float3(*n), ga.vec.make_float3(*point),
            block=block, grid=pos_grid)

        # Directions go through the plain rotate kernel.
        dir_grid = (self.dir.size // self.nblocks + 1, 1)
        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir,
                                    np.float32(phi), ga.vec.make_float3(*n),
                                    block=block, grid=dir_grid)

    def translate(self, v):
        "Translate the ray positions by the vector `v`."
        grid = (self.pos.size // self.nblocks + 1, 1)
        self.transform_funcs.translate(np.int32(self.pos.size), self.pos,
                                       ga.vec.make_float3(*v),
                                       block=(self.nblocks, 1, 1),
                                       grid=grid)

    def render(self, gpu_geometry, pixels, alpha_depth=10,
               keep_last_render=False):
        """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
        colors.

        `self.dxlen` is cleared first unless `keep_last_render` is set.
        Raises Exception when `alpha_depth > self.max_alpha_depth`,
        TypeError when `pixels` is not a GPUArray, ValueError when
        `pixels` does not have one entry per ray.
        """
        if not keep_last_render:
            self.dxlen.fill(0)

        if alpha_depth > self.max_alpha_depth:
            raise Exception('alpha_depth > max_alpha_depth')

        if not isinstance(pixels, ga.GPUArray):
            raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)

        if pixels.size != self.pos.size:
            raise ValueError('`pixels`.size != number of rays')

        grid = (self.pos.size // self.nblocks + 1, 1)
        self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir,
                                 gpu_geometry.gpudata, np.uint32(alpha_depth),
                                 pixels, self.dx, self.dxlen, self.color,
                                 block=(self.nblocks, 1, 1), grid=grid)

    def snapshot(self, gpu_geometry, alpha_depth=10):
        "Render `gpu_geometry` and return a numpy array of pixel colors."
        pixels = ga.empty(self.pos.size, dtype=np.uint32)
        self.render(gpu_geometry, pixels, alpha_depth)
        return pixels.get()
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
        Must be <= 16.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.

    Raises ValueError if ``morton_bits`` is greater than 16.
    '''
    # Fail fast: the codes are right-shifted by (16 - morton_bits) below,
    # and a negative shift count is not meaningful.
    if morton_bits > 16:
        raise ValueError('morton_bits must be <= 16, got %r' % (morton_bits,))

    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates: origin at the minimum vertex, scale chosen
    # so the largest axis extent spans 2**16 - 2 units
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin,
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Call GPU to compute nodes.  `nodes` is zero-initialised so padding
    # entries beyond len(triangles) stay all zero.
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block,
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles), Mapped(vertices),
                              world_origin, world_scale,
                              nodes, morton_codes,
                              block=(nthreads_per_block, 1, 1),
                              grid=(nblocks_this_iter, 1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
def optimize_layer(orig_nodes):
    '''Reorder the nodes of one layer on the GPU and return the result.

    ``orig_nodes`` is uploaded to the GPU.  The loop walks the layer two
    nodes at a time (``test_index = i * 2``).  For each position it launches
    ``min_distance_to`` over up to ``8192 * 50`` following nodes, then reads
    a mapped ``flag``.  If ``flag[0]`` is 0 the position is skipped.
    Otherwise the node at ``test_index + 1`` is swapped with the candidate
    that has the smallest reported area.

    Every ``update`` iterations, and once at the end, ``pair_area`` is run
    and the summed area is printed together with the skip/swap counters.

    Returns the reordered nodes copied back to the host.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    # One area per pair of nodes.
    # NOTE(review): relies on Python 2 integer division for n / 2.
    areas = ga.empty(shape=n / 2, dtype=np.uint64)
    nthreads_per_block = 128

    # One slot per thread block: each block launched by min_distance_to is
    # expected to write its result at its global block index - TODO confirm
    # against bvh.cu.
    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    # Progress is reported every `update` iterations.
    update = 10000

    skip_size = 1
    # Mapped host memory the kernel can write and the host can read after
    # synchronize().
    flag = mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n / 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))

            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        # `blocks` counts thread blocks launched so far for this test_index;
        # it is passed to the kernel and later bounds the result slices.
        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block,
                               max_blocks=10000):
            # Candidates start two past test_index, i.e. after the pair
            # (test_index, test_index + 1).
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        # Wait for the kernels so `flag` can be read on the host.
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            # flag[0] == 0 is treated as "no swap required here".  With
            # skip_size == 1 flag has a single element, so flag_nonzero is
            # always empty on this path and the step is skip_size.
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        # Pick the block with the smallest reported area and the node index
        # it recorded.
        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        # Single-thread launch that swaps the two nodes in place.
        bvh_funcs.swap(np.uint32(test_index + 1), np.uint32(better_i),
                       nodes, block=(1, 1, 1), grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    # Final area report after all swaps.
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):
        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))

    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()
class GPUKernelPDF(object):
    """Per-channel moment accumulation and kernel-estimated PDF evaluation.

    The methods depend on each other through attributes: ``setup_moments``
    allocates what ``accumulate_moments`` and ``compute_bandwidth`` read;
    ``compute_bandwidth`` and ``setup_kernel`` create what
    ``accumulate_kernel`` and ``get_kernel_eval`` read.
    """

    def __init__(self):
        """Load the ``pdf.cu`` module and wrap its functions."""
        self.module = get_cu_module('pdf.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_moments(self, nchannels, trange, qrange, time_only=True):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            trange: (float, float)
              Range of time dimension in PDF
            qrange: (float, float)
              Range of charge dimension in PDF
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)

        self.trange = trange
        self.qrange = qrange
        self.time_only = time_only

    def clear_moments(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.hitcount_gpu.fill(0)
        self.tmom1_gpu.fill(0.0)
        self.tmom2_gpu.fill(0.0)
        self.qmom1_gpu.fill(0.0)
        self.qmom2_gpu.fill(0.0)

    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
        """Add the most recent results of run_daq() to the accumulate of
        moments for future bandwidth calculation."""
        nchannels = len(gpuchannels.t)
        self.gpu_funcs.accumulate_moments(
            np.int32(self.time_only),
            np.int32(nchannels),
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.hitcount_gpu,
            self.tmom1_gpu,
            self.tmom2_gpu,
            self.qmom1_gpu,
            self.qmom2_gpu,
            block=(nthreads_per_block, 1, 1),
            # One block more than strictly needed when nchannels is an exact
            # multiple; presumably the kernel bounds-checks -- TODO confirm.
            grid=(nchannels // nthreads_per_block + 1, 1))

    @staticmethod
    def _inverse_bandwidths(mom0, mom1, mom2, event_values,
                            dimensionality_factor, rho):
        """Return float32 inverse bandwidths for one observable.

        Channels whose bandwidth is not strictly positive (including NaN
        from a zero RMS) get an inverse bandwidth of 0.
        """
        mean = mom1 / mom0
        var = np.maximum(mom2 / mom0 - mean**2, 0.0)  # roundoff can go neg
        rms = var**0.5

        # NOTE(review): the exponent is linear in the z-score, not quadratic
        # as in a normal density; this looks like a missing ``**2``.  Left
        # unchanged because fixing it alters every bandwidth -- confirm.
        gaussian_density = np.minimum(
            1.0 / rms,
            (1.0 / np.sqrt(2.0 * np.pi))
            * np.exp(-0.5 * ((event_values - mean) / rms)) / rms)

        bandwidths = dimensionality_factor / gaussian_density * rho
        inverse = np.zeros_like(bandwidths)
        positive = bandwidths > 0
        inverse[positive] = bandwidths[positive] ** -1
        return inverse.astype(np.float32)

    def compute_bandwidth(self, event_hit, event_time, event_charge,
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating.

            event_hit: ndarray
              Not used by this method; kept for interface compatibility.
            event_time: ndarray
              Per-channel time of the event being evaluated.
            event_charge: ndarray
              Per-channel charge of the event being evaluated; only read
              when ``time_only`` is False.
            scale_factor: float
              Divides the per-channel hit count before it enters the
              dimensionality factor.

        Stores the results in ``self.inv_time_bandwidths_gpu`` and
        ``self.inv_charge_bandwidths_gpu``.
        """
        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        mom0 = np.maximum(hitcount, 1)  # avoid divide by zero
        tmom1 = self.tmom1_gpu.get()
        tmom2 = self.tmom2_gpu.get()

        if self.time_only:
            d = 1
        else:
            d = 2
        dimensionality_factor = ((4.0 / (d + 2))
                                 / (mom0 / scale_factor))**(-1.0 / (d + 4))

        # precompute inverse to speed up GPU evaluation
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            self._inverse_bandwidths(mom0, tmom1, tmom2, event_time,
                                     dimensionality_factor, rho))

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu)
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            qmom1 = self.qmom1_gpu.get()
            qmom2 = self.qmom2_gpu.get()
            # Same clamping and zero-guarding as the time observable, so a
            # slightly negative variance or a zero bandwidth no longer
            # produces NaN/inf on the GPU.
            self.inv_charge_bandwidths_gpu = ga.to_gpu(
                self._inverse_bandwidths(mom0, qmom1, qmom2, event_charge,
                                         dimensionality_factor, rho))

    def setup_kernel(self, event_hit, event_time, event_charge):
        """Upload the event to evaluate and allocate the per-channel
        PDF accumulators.  Also resets the hit counter.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
        self.charge_pdf_values_gpu = ga.zeros(len(event_hit),
                                              dtype=np.float32)

    def clear_kernel(self):
        "Reset the hit counter and the accumulated PDF values."
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu.fill(0.0)
        self.charge_pdf_values_gpu.fill(0.0)

    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the kernel PDF evaluation."
        self.gpu_funcs.accumulate_kernel_eval(
            np.int32(self.time_only),
            np.int32(len(self.event_hit_gpu)),
            self.event_hit_gpu,
            self.event_time_gpu,
            self.event_charge_gpu,
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.inv_time_bandwidths_gpu,
            self.inv_charge_bandwidths_gpu,
            self.hitcount_gpu,
            self.time_pdf_values_gpu,
            self.charge_pdf_values_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def get_kernel_eval(self):
        """Fetch the accumulated PDF values, normalised by hit count.

        Returns a tuple ``(hitcount, pdf_values, zeros)`` where ``zeros``
        has the same shape and dtype as ``pdf_values``.  With ``time_only``
        set, ``pdf_values`` is the time PDF; otherwise it is the product of
        the time and charge PDFs.
        """
        hitcount = self.hitcount_gpu.get()
        norm = np.maximum(1, hitcount)  # avoid divide by zero

        pdf_values = self.time_pdf_values_gpu.get()
        pdf_values /= norm

        if not self.time_only:
            charge_pdf_values = self.charge_pdf_values_gpu.get()
            charge_pdf_values /= norm
            pdf_values = pdf_values * charge_pdf_values

        return hitcount, pdf_values, np.zeros_like(pdf_values)
def __init__(self):
    """Load the ``pdf.cu`` module and wrap its functions for this instance."""
    loaded = get_cu_module('pdf.cu',
                           options=cuda_options,
                           include_source_directory=True)
    self.module = loaded
    self.gpu_funcs = GPUFuncs(loaded)
def __init__(self, photons, ncopies=1, copy_flags=True, copy_triangles=True,
             copy_weights=True):
    """Load ``photons`` onto the GPU, replicating as requested.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons
            on the GPU.  This is used if you want
            to propagate the same event many times,
            for example in a likelihood calculation.

            The amount of GPU storage will be proportionally
            larger if ncopies > 1, so be careful.
        - copy_flags: bool, *optional*
            If False, ``photons.flags`` is not uploaded and the GPU
            flags are zero-initialised instead.
        - copy_triangles: bool, *optional*
            If False, ``photons.last_hit_triangles`` is not uploaded and
            the GPU array is filled with -1 instead.
        - copy_weights: bool, *optional*
            If False, ``photons.weights`` is not uploaded and the GPU
            weights are initialised to 1 instead.
    """
    nphotons = len(photons)
    total = nphotons * ncopies

    self.pos = ga.empty(shape=total, dtype=ga.vec.float3)
    self.dir = ga.empty(shape=total, dtype=ga.vec.float3)
    self.pol = ga.empty(shape=total, dtype=ga.vec.float3)
    self.wavelengths = ga.empty(shape=total, dtype=np.float32)
    self.t = ga.empty(shape=total, dtype=np.float32)

    self.last_hit_triangles = ga.empty(shape=total, dtype=np.int32)
    if not copy_triangles:
        self.last_hit_triangles.fill(-1)

    if not copy_flags:
        self.flags = ga.zeros(shape=total, dtype=np.uint32)
    else:
        self.flags = ga.empty(shape=total, dtype=np.uint32)

    if not copy_weights:
        self.weights = ga.ones_like(self.last_hit_triangles,
                                    dtype=np.float32)
    else:
        self.weights = ga.empty(shape=total, dtype=np.float32)

    # Sized like every other per-photon array: evidx is handed to
    # photon_duplicate together with them, so it needs room for the copies.
    self.evidx = ga.empty(shape=total, dtype=np.uint32)

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1
    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    if copy_triangles:
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
    if copy_flags:
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    if copy_weights:
        self.weights[:nphotons].set(photons.weights.astype(np.float32))
    self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

    module = get_cu_module('propagate.cu', options=cuda_options)
    self.gpu_funcs = GPUFuncs(module)

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.photon_duplicate(
                np.int32(first_photon),
                np.int32(photons_this_round),
                self.pos,
                self.dir,
                self.wavelengths,
                self.pol,
                self.t,
                self.flags,
                self.last_hit_triangles,
                self.weights,
                self.evidx,
                np.int32(ncopies - 1),
                np.int32(nphotons),
                block=(nthreads_per_block, 1, 1),
                grid=(blocks, 1))

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
class GPUKernelPDF(object):
    """Per-channel moment accumulation and kernel-estimated PDF evaluation.

    The methods depend on each other through attributes: ``setup_moments``
    allocates what ``accumulate_moments`` and ``compute_bandwidth`` read;
    ``compute_bandwidth`` and ``setup_kernel`` create what
    ``accumulate_kernel`` and ``get_kernel_eval`` read.
    """

    def __init__(self):
        """Load the ``pdf.cu`` module and wrap its functions."""
        self.module = get_cu_module('pdf.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_moments(self, nchannels, trange, qrange, time_only=True):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            trange: (float, float)
              Range of time dimension in PDF
            qrange: (float, float)
              Range of charge dimension in PDF
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)

        self.trange = trange
        self.qrange = qrange
        self.time_only = time_only

    def clear_moments(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.hitcount_gpu.fill(0)
        self.tmom1_gpu.fill(0.0)
        self.tmom2_gpu.fill(0.0)
        self.qmom1_gpu.fill(0.0)
        self.qmom2_gpu.fill(0.0)

    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
        """Add the most recent results of run_daq() to the accumulate of
        moments for future bandwidth calculation."""
        nchannels = len(gpuchannels.t)
        self.gpu_funcs.accumulate_moments(
            np.int32(self.time_only),
            np.int32(nchannels),
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.hitcount_gpu,
            self.tmom1_gpu,
            self.tmom2_gpu,
            self.qmom1_gpu,
            self.qmom2_gpu,
            block=(nthreads_per_block, 1, 1),
            # One block more than strictly needed when nchannels is an exact
            # multiple; presumably the kernel bounds-checks -- TODO confirm.
            grid=(nchannels // nthreads_per_block + 1, 1))

    @staticmethod
    def _inverse_bandwidths(mom0, mom1, mom2, event_values,
                            dimensionality_factor, rho):
        """Return float32 inverse bandwidths for one observable.

        Channels whose bandwidth is not strictly positive (including NaN
        from a zero RMS) get an inverse bandwidth of 0.
        """
        mean = mom1 / mom0
        var = np.maximum(mom2 / mom0 - mean**2, 0.0)  # roundoff can go neg
        rms = var**0.5

        # NOTE(review): the exponent is linear in the z-score, not quadratic
        # as in a normal density; this looks like a missing ``**2``.  Left
        # unchanged because fixing it alters every bandwidth -- confirm.
        gaussian_density = np.minimum(
            1.0 / rms,
            (1.0 / np.sqrt(2.0 * np.pi))
            * np.exp(-0.5 * ((event_values - mean) / rms)) / rms)

        bandwidths = dimensionality_factor / gaussian_density * rho
        inverse = np.zeros_like(bandwidths)
        positive = bandwidths > 0
        inverse[positive] = bandwidths[positive] ** -1
        return inverse.astype(np.float32)

    def compute_bandwidth(self, event_hit, event_time, event_charge,
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating.

            event_hit: ndarray
              Not used by this method; kept for interface compatibility.
            event_time: ndarray
              Per-channel time of the event being evaluated.
            event_charge: ndarray
              Per-channel charge of the event being evaluated; only read
              when ``time_only`` is False.
            scale_factor: float
              Divides the per-channel hit count before it enters the
              dimensionality factor.

        Stores the results in ``self.inv_time_bandwidths_gpu`` and
        ``self.inv_charge_bandwidths_gpu``.
        """
        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        mom0 = np.maximum(hitcount, 1)  # avoid divide by zero
        tmom1 = self.tmom1_gpu.get()
        tmom2 = self.tmom2_gpu.get()

        if self.time_only:
            d = 1
        else:
            d = 2
        dimensionality_factor = ((4.0 / (d + 2))
                                 / (mom0 / scale_factor))**(-1.0 / (d + 4))

        # precompute inverse to speed up GPU evaluation
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            self._inverse_bandwidths(mom0, tmom1, tmom2, event_time,
                                     dimensionality_factor, rho))

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu)
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            qmom1 = self.qmom1_gpu.get()
            qmom2 = self.qmom2_gpu.get()
            # Same clamping and zero-guarding as the time observable, so a
            # slightly negative variance or a zero bandwidth no longer
            # produces NaN/inf on the GPU.
            self.inv_charge_bandwidths_gpu = ga.to_gpu(
                self._inverse_bandwidths(mom0, qmom1, qmom2, event_charge,
                                         dimensionality_factor, rho))

    def setup_kernel(self, event_hit, event_time, event_charge):
        """Upload the event to evaluate and allocate the per-channel
        PDF accumulators.  Also resets the hit counter.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
        self.charge_pdf_values_gpu = ga.zeros(len(event_hit),
                                              dtype=np.float32)

    def clear_kernel(self):
        "Reset the hit counter and the accumulated PDF values."
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu.fill(0.0)
        self.charge_pdf_values_gpu.fill(0.0)

    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the kernel PDF evaluation."
        self.gpu_funcs.accumulate_kernel_eval(
            np.int32(self.time_only),
            np.int32(len(self.event_hit_gpu)),
            self.event_hit_gpu,
            self.event_time_gpu,
            self.event_charge_gpu,
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.inv_time_bandwidths_gpu,
            self.inv_charge_bandwidths_gpu,
            self.hitcount_gpu,
            self.time_pdf_values_gpu,
            self.charge_pdf_values_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def get_kernel_eval(self):
        """Fetch the accumulated PDF values, normalised by hit count.

        Returns a tuple ``(hitcount, pdf_values, zeros)`` where ``zeros``
        has the same shape and dtype as ``pdf_values``.  With ``time_only``
        set, ``pdf_values`` is the time PDF; otherwise it is the product of
        the time and charge PDFs.
        """
        hitcount = self.hitcount_gpu.get()
        norm = np.maximum(1, hitcount)  # avoid divide by zero

        pdf_values = self.time_pdf_values_gpu.get()
        pdf_values /= norm

        if not self.time_only:
            charge_pdf_values = self.charge_pdf_values_gpu.get()
            charge_pdf_values /= norm
            pdf_values = pdf_values * charge_pdf_values

        return hitcount, pdf_values, np.zeros_like(pdf_values)
def merge_nodes(nodes, degree, max_ratio=None):
    """Build a layer of parent nodes for ``nodes`` with the ``make_parents`` kernel.

    One parent is created per ``degree`` nodes (rounded up).  When
    ``max_ratio`` is not None, a second pass looks for parents whose
    children cover less than 0.3 of the parent's area and replaces each
    such parent, in place in the parent layer, by copies of its children.
    The expanded layer is only returned if it is still shorter than
    ``nodes``.

    NOTE(review): ``max_ratio`` only switches the second pass on; its value
    is never compared against anything (the threshold is the literal 0.3).

    Args:
        nodes: host array of child nodes; indexed by field name (``'w'``,
            ``'x'``) below, so presumably a structured uint4 array -- TODO
            confirm against callers.
        degree: number of children requested per parent.
        max_ratio: if not None, enable the parent-replacement pass.

    Returns:
        Host array of parent nodes.
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Ceiling division of len(nodes) by degree (relies on integer `/`).
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        # Rounding to a multiple of 1 changes nothing; the original trailing
        # comment suggests this once padded to a multiple of `degree`.
        nparent_pad = round_up_to_multiple(nparent, 1)
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block, 1, 1),
                               grid=(nblocks_this_iter, 1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        # Flag every parent whose children's summed area is under 0.3 of its
        # own area.
        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            # 'w' packs the child count above CHILD_BITS and the first-child
            # index in the remaining bits.
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            if child_area / parent_area < 0.3:
                excessive_area[i] = True

        # Each replaced parent can grow into up to `degree` entries, i.e. up
        # to degree - 1 additional slots.
        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        # `offset` tracks how far later entries have been shifted right by
        # the insertions made so far.
        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            # Overwrite the parent with a copy of its first child, then
            # re-pack that copy's 'w' with its child index shifted by
            # len(nodes).  Presumably this accounts for where the copied
            # node's own children end up once layers are joined -- TODO
            # confirm.
            new_parent_nodes[index] = nodes[child_index]
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            # With a single child there is nothing to insert.  This guard
            # also keeps the slice end below (-nchild + 1) from becoming 0,
            # which would select an empty range.
            if nchild == 1:
                continue

            # slide everyone over
            new_parent_nodes[index + nchild:] = new_parent_nodes[index +
                                                                 1:-nchild + 1]
            offset += nchild - 1
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                # Only entries with a non-zero 'x' get the index shift;
                # presumably x == 0 marks an unused node -- TODO confirm.
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
class GPUPhotons(object):
    """Photon state arrays held on the GPU, optionally replicated
    ``ncopies`` times, with methods to propagate and filter them."""

    def __init__(self, photons, ncopies=1):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        total = nphotons * ncopies
        self.pos = ga.empty(shape=total, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=total, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=total, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=total, dtype=np.float32)
        self.t = ga.empty(shape=total, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=total, dtype=np.int32)
        self.flags = ga.empty(shape=total, dtype=np.uint32)
        self.weights = ga.empty(shape=total, dtype=np.float32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(
            photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def get(self):
        """Copy all photon arrays back to the host as an ``event.Photons``."""
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), 3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        return event.Photons(pos, dir, pol, wavelengths, t,
                             last_hit_triangles, flags, weights)

    def get_hits(self, gpu_detector, target_flag=(0x1 << 2),
                 nthreads_per_block=64, max_blocks=1024,
                 start_photon=None, nphotons=None):
        '''Return a dict mapping channel to an ``event.Photons`` holding the
        photons copied out for that channel.

        Photons are selected on the GPU by the ``count_photon_hits`` and
        ``copy_photon_hits`` kernels, which are given ``target_flag`` and
        the detector's ``solid_id_map`` / ``detector_gpu`` arrays.  Only
        the range ``[start_photon, start_photon + nphotons)`` is examined;
        by default that is every photon.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                self.flags,
                gpu_detector.solid_id_map,
                self.last_hit_triangles,
                gpu_detector.detector_gpu,
                index_counter_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finaly copy hits, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    gpu_detector.solid_id_map,
                    gpu_detector.detector_gpu,
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    pos, dir, wavelengths, pol, t,
                    flags, last_hit_triangles, weights, channels,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            # The copy pass must find exactly as many hits as the count pass.
            assert index_counter_gpu.get()[0] == reduced_nphotons

        pos = pos.get().view(np.float32).reshape((len(pos), 3))
        dir = dir.get().view(np.float32).reshape((len(dir), 3))
        pol = pol.get().view(np.float32).reshape((len(pol), 3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        channels = channels.get()

        hitmap = {}
        for chan in np.unique(channels):
            mask = channels == chan
            hitmap[chan] = event.Photons(pos[mask], dir[mask], pol[mask],
                                         wavelengths[mask], t[mask],
                                         last_hit_triangles[mask],
                                         flags[mask], weights[mask])
        return hitmap

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in xrange(self.ncopies):
            window = slice(self.true_nphotons * i,
                           self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window])

    @profile_if_possible
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10, use_weights=False,
                  scatter_first=0):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        ..warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0

        # Element 0 of each queue is a header slot; photon indices follow.
        input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to
        # each other
        for copy_index in xrange(self.ncopies):
            input_queue[1 + copy_index::self.ncopies] = \
                np.arange(self.true_nphotons, dtype=np.uint32) \
                + copy_index * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)

        output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            if nphotons < nthreads_per_block * 16 * 8 or use_weights:
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    input_queue_gpu[1:],
                    output_queue_gpu,
                    rng_states,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    np.int32(nsteps),
                    np.int32(use_weights),
                    np.int32(scatter_first),
                    gpu_geometry.gpudata,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass

            if step < max_steps:
                # This round's output becomes the next round's input.
                input_queue_gpu, output_queue_gpu = \
                    output_queue_gpu, input_queue_gpu
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different
                # strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                nphotons = input_queue_gpu[:1].get()[0] - 1

        if ga.max(self.flags).get() & (1 << 31):
            sys.stderr.write("WARNING: ABORTED PHOTONS\n")
        cuda.Context.get_current().synchronize()

    @profile_if_possible
    def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        '''Return a new GPUPhotonsSlice containing only the photons picked
        out for ``target_flag`` by the ``count_photons`` / ``copy_photons``
        kernels.

        Only the range ``[start_photon, start_photon + nphotons)`` is
        examined; by default that is every photon.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                index_counter_gpu,
                self.flags,
                block=(nthreads_per_block, 1, 1),
                grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finaly copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    pos, dir, wavelengths, pol, t,
                    flags, last_hit_triangles, weights,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            # The copy pass must find exactly as many photons as were counted.
            assert index_counter_gpu.get()[0] == reduced_nphotons

        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights)

    def __del__(self):
        """Drop the references to all GPU arrays and force a collection."""
        # pop() instead of ``del`` so that an instance whose __init__ failed
        # part-way does not raise AttributeError from its finalizer.
        attributes = vars(self)
        for name in ('pos', 'dir', 'pol', 'wavelengths', 't', 'flags',
                     'last_hit_triangles', 'weights'):
            attributes.pop(name, None)
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        """Number of photon slots on the GPU, copies included."""
        return self.pos.size
def merge_nodes(nodes, degree, max_ratio=None):
    """Build a layer of parent nodes for ``nodes`` with the ``make_parents`` kernel.

    One parent is created per ``degree`` nodes (rounded up).  When
    ``max_ratio`` is not None, a second pass looks for parents whose
    children cover less than 0.3 of the parent's area and replaces each
    such parent, in place in the parent layer, by copies of its children.
    The expanded layer is only returned if it is still shorter than
    ``nodes``.

    NOTE(review): ``max_ratio`` only switches the second pass on; its value
    is never compared against anything (the threshold is the literal 0.3).

    Args:
        nodes: host array of child nodes; indexed by field name (``'w'``,
            ``'x'``) below, so presumably a structured uint4 array -- TODO
            confirm against callers.
        degree: number of children requested per parent.
        max_ratio: if not None, enable the parent-replacement pass.

    Returns:
        Host array of parent nodes.
    """
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Ceiling division of len(nodes) by degree (relies on integer `/`).
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        # Rounding to a multiple of 1 changes nothing; the original trailing
        # comment suggests this once padded to a multiple of `degree`.
        nparent_pad = round_up_to_multiple(nparent, 1)
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block,1,1),
                               grid=(nblocks_this_iter,1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        # Flag every parent whose children's summed area is under 0.3 of its
        # own area.
        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            # 'w' packs the child count above CHILD_BITS and the first-child
            # index in the remaining bits.
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index+nchild].sum()
            if child_area/parent_area < 0.3:
                excessive_area[i] = True

        # Each replaced parent can grow into up to `degree` entries, i.e. up
        # to degree - 1 additional slots.
        extra_slots = round_up_to_multiple((degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        # `offset` tracks how far later entries have been shifted right by
        # the insertions made so far.
        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            # Overwrite the parent with a copy of its first child, then
            # re-pack that copy's 'w' with its child index shifted by
            # len(nodes).  Presumably this accounts for where the copied
            # node's own children end up once layers are joined -- TODO
            # confirm.
            new_parent_nodes[index] = nodes[child_index]
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

            # With a single child there is nothing to insert.  This guard
            # also keeps the slice end below (-nchild+1) from becoming 0,
            # which would select an empty range.
            if nchild == 1:
                continue

            # slide everyone over
            new_parent_nodes[index+nchild:] = new_parent_nodes[index+1:-nchild+1]
            offset += nchild - 1
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index + sibling + 1]
                # Only entries with a non-zero 'x' get the index shift;
                # presumably x == 0 marks an unused node -- TODO confirm.
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][new_parent_index] = tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
class GPUPhotons(object):
    """Photon state held in GPU arrays, optionally replicated ``ncopies``
    times, together with the ``propagate.cu`` kernels that operate on it."""

    def __init__(self, photons, ncopies=1, copy_flags=True,
                 copy_triangles=True, copy_weights=True):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
               - copy_flags: bool, *optional*
                   If False, flags are zero-initialised instead of being
                   copied from ``photons``.
               - copy_triangles: bool, *optional*
                   If False, last_hit_triangles is filled with -1 instead
                   of being copied from ``photons``.
               - copy_weights: bool, *optional*
                   If False, weights are set to one instead of being
                   copied from ``photons``.
        """
        nphotons = len(photons)
        total = nphotons * ncopies
        self.pos = ga.empty(shape=total, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=total, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=total, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=total, dtype=np.float32)
        self.t = ga.empty(shape=total, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=total, dtype=np.int32)
        if not copy_triangles:
            self.last_hit_triangles.fill(-1)
        if not copy_flags:
            self.flags = ga.zeros(shape=total, dtype=np.uint32)
        else:
            self.flags = ga.empty(shape=total, dtype=np.uint32)
        if not copy_weights:
            self.weights = ga.ones_like(self.last_hit_triangles,
                                        dtype=np.float32)
        else:
            self.weights = ga.empty(shape=total, dtype=np.float32)
        # evidx is handed to photon_duplicate/propagate and sliced per copy
        # in iterate_copies(), so it needs one entry per replicated photon
        # like every other per-photon array.
        self.evidx = ga.empty(shape=total, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(
            photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        if copy_triangles:
            self.last_hit_triangles[:nphotons].set(
                photons.last_hit_triangles.astype(np.int32))
        if copy_flags:
            self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        if copy_weights:
            self.weights[:nphotons].set(photons.weights.astype(np.float32))
        self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(
                    np.int32(first_photon), np.int32(photons_this_round),
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def get(self):
        '''Copy the photon state back to the host and return it as an
        ``event.Photons`` object.'''
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), 3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        evidx = self.evidx.get()
        return event.Photons(pos, dir, pol, wavelengths, t,
                             last_hit_triangles, flags, weights, evidx)

    def get_hits(self, *args, **kwargs):
        '''Return a dict mapping each channel id (int) to the subset of
        the result of ``get_flat_hits(*args, **kwargs)`` recorded on that
        channel.'''
        flat_hits = self.get_flat_hits(*args, **kwargs)
        hitmap = {}
        for chan in np.unique(flat_hits.channel):
            mask = (flat_hits.channel == chan).astype(bool)
            hitmap[int(chan)] = flat_hits[mask]
        return hitmap

    def get_flat_hits(self, gpu_detector, target_flag=(0x1 << 2),
                      nthreads_per_block=64, max_blocks=1024,
                      start_photon=None, nphotons=None, no_map=False):
        '''Return an ``event.Photons`` object (including channels)
        containing only photons that have a particular bit set in their
        history word and were detected by a channel.

        ``start_photon``/``nphotons`` restrict the scan to a window of the
        photon arrays; by default everything from ``start_photon`` to the
        end is scanned.  ``no_map`` is accepted but not used here.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                self.flags,
                gpu_detector.solid_id_map,
                self.last_hit_triangles,
                gpu_detector.detector_gpu,
                index_counter_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finally copy hits, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    gpu_detector.solid_id_map,
                    gpu_detector.detector_gpu,
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights, evidx, channels,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            # The copy pass must find exactly as many hits as the count pass.
            assert index_counter_gpu.get()[0] == reduced_nphotons

        pos = pos.get().view(np.float32).reshape((len(pos), 3))
        dir = dir.get().view(np.float32).reshape((len(dir), 3))
        pol = pol.get().view(np.float32).reshape((len(pol), 3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        evidx = evidx.get()
        channels = channels.get()
        return event.Photons(pos, dir, pol, wavelengths, t,
                             last_hit_triangles, flags, weights, evidx,
                             channels)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in range(self.ncopies):
            window = slice(self.true_nphotons * i,
                           self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window],
                evidx=self.evidx[window])

    @profile_if_possible
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10, use_weights=False,
                  scatter_first=0, track=False):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        If ``track`` is true, returns ``(step_photon_ids, step_photons)``:
        the queued photon ids and a host copy of the queued photons, saved
        before the first step and after every kernel pass.  Otherwise
        returns None.

        ..warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to
        # each other
        for copy in range(self.ncopies):
            input_queue[1 + copy::self.ncopies] = \
                np.arange(self.true_nphotons, dtype=np.uint32) \
                + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        if track:
            step_photon_ids = []
            step_photons = []
            # save the first step for all photons in the input queue
            step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
            step_photons.append(
                self.copy_queue(input_queue_gpu[1:], nphotons).get())

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            # and not tracking
            if not track and (nphotons < nthreads_per_block * 16 * 8
                              or use_weights):
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(
                    np.int32(first_photon), np.int32(photons_this_round),
                    input_queue_gpu[1:], output_queue_gpu, rng_states,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    np.int32(nsteps), np.int32(use_weights),
                    np.int32(scatter_first), gpu_geometry.gpudata,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

            if track:
                # save the next step for all photons in the input queue
                step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
                step_photons.append(
                    self.copy_queue(input_queue_gpu[1:], nphotons).get())

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass

            if step < max_steps:
                # The output queue of this pass is the input of the next.
                input_queue_gpu, output_queue_gpu = \
                    output_queue_gpu, input_queue_gpu
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different
                # strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                # Element 0 of the queue is one more than the number of
                # queued photons; keep the count a plain int.
                nphotons = int(input_queue_gpu[:1].get()[0]) - 1
                if nphotons == 0:
                    break

        if ga.max(self.flags).get() & (1 << 31):
            print("WARNING: ABORTED PHOTONS", file=sys.stderr)
        cuda.Context.get_current().synchronize()

        if track:
            return step_photon_ids, step_photons

    @profile_if_possible
    def copy_queue(self, queue_gpu, nphotons, nthreads_per_block=64,
                   max_blocks=1024, start_photon=0):
        '''Return a GPUPhotonsSlice of ``nphotons`` freshly allocated
        entries filled by the ``copy_photon_queue`` kernel from
        ``queue_gpu``.'''
        # Allocate new storage space
        pos = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=nphotons, dtype=np.float32)
        t = ga.empty(shape=nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=nphotons, dtype=np.int32)
        flags = ga.empty(shape=nphotons, dtype=np.uint32)
        weights = ga.empty(shape=nphotons, dtype=np.float32)
        evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if nphotons > 0:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_queue(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    queue_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights, evidx,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights, evidx)

    @profile_if_possible
    def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        '''Return a new GPUPhotonsSlice object containing only photons
        that have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                index_counter_gpu, self.flags,
                block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights, evidx,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
            # The copy pass must find exactly as many photons as the count
            # pass.
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights, evidx)

    def __del__(self):
        # Drop every GPU array reference, including weights and evidx.
        # pop() tolerates attributes that were never set, e.g. when
        # __init__ failed part-way through.
        for name in ('pos', 'dir', 'pol', 'wavelengths', 't', 'flags',
                     'last_hit_triangles', 'weights', 'evidx'):
            self.__dict__.pop(name, None)
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        '''Number of photon slots on the GPU (including copies).'''
        return self.pos.size
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
        Must be <= 16; larger values raise ValueError.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # The codes are reduced with a right shift by (16 - morton_bits); a
    # negative shift count is meaningless, so reject it before any GPU work.
    if morton_bits > 16:
        raise ValueError('morton_bits must be <= 16, got %s' % (morton_bits,))

    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin = mesh.vertices.min(axis=0)
    largest_extent = np.max(mesh.vertices.max(axis=0) - world_origin)
    world_scale = largest_extent / (2**16 - 2)

    world_coords = WorldCoords(world_origin=world_origin,
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Call GPU to compute nodes
    nodes = ga.zeros(
        shape=round_up_to_multiple(len(triangles), round_to_multiple),
        dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block,
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles), Mapped(vertices),
                              world_origin, world_scale,
                              nodes, morton_codes,
                              block=(nthreads_per_block, 1, 1),
                              grid=(nblocks_this_iter, 1))

    # NOTE(review): this drops (16 - morton_bits) low bits of the whole
    # code, not per axis -- confirm against the make_leaves kernel.
    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
class GPUPDF(object):
    """Accumulates per-channel PDF histograms and single-point PDF
    evaluations on the GPU using the kernels in ``pdf.cu``."""

    def __init__(self):
        self.module = get_cu_module('pdf.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
        """Setup GPU arrays to hold PDF information.

           nchannels: int, number of channels
           tbins: number of time bins
           trange: tuple of (min, max) time in PDF
           qbins: number of charge bins
           qrange: tuple of (min, max) charge in PDF
        """
        self.events_in_histogram = 0
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.pdf_gpu = ga.zeros(shape=(nchannels, tbins, qbins),
                                dtype=np.uint32)
        self.tbins = tbins
        self.trange = trange
        self.qbins = qbins
        self.qrange = qrange

    def clear_pdf(self):
        """Rezero the PDF counters."""
        self.hitcount_gpu.fill(0)
        self.pdf_gpu.fill(0)

    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
        """Bin the hits in ``gpuchannels`` (its ``q`` and ``t`` arrays) into
        the histogram and count one more event."""
        self.gpu_funcs.bin_hits(
            np.int32(len(self.hitcount_gpu)),
            gpuchannels.q,
            gpuchannels.t,
            self.hitcount_gpu,
            np.int32(self.tbins),
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.int32(self.qbins),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.pdf_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

        self.events_in_histogram += 1

    def get_pdfs(self):
        """Returns the 1D hitcount array and the 3D [channel, time, charge]
        histogram."""
        return self.hitcount_gpu.get(), self.pdf_gpu.get()

    def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
                       trange, min_qwidth, qrange, min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.

            min_twidth: float
              Minimum bin size in the time dimension
            trange: (float, float)
              Range of time dimension in PDF
            min_qwidth: float
              Minimum bin size in charge dimension
            qrange: (float, float)
              Range of charge dimension in PDF
            min_bin_content: int
              The bin will be expanded to include at least this many events
            time_only: bool
              If True, only the time observable will be used in the PDF.
              Only True is supported; anything else fails the assertion
              at the end of this method.
        """
        self.event_nhit = count_nonzero(event_hit)

        # Define a mapping from an array of len(event_hit) to an array of
        # length event_nhit
        self.map_hit_offset_to_channel_id = \
            np.where(event_hit)[0].astype(np.uint32)
        self.map_hit_offset_to_channel_id_gpu = \
            ga.to_gpu(self.map_hit_offset_to_channel_id)
        self.map_channel_id_to_hit_offset = \
            np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
        self.map_channel_id_to_hit_offset_gpu = \
            ga.to_gpu(self.map_channel_id_to_hit_offset)

        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.nearest_mc_gpu = ga.empty(
            shape=self.event_nhit * min_bin_content, dtype=np.float32)
        # 1e9 marks an empty slot; get_pdf_eval() tests for values < 1e9.
        self.nearest_mc_gpu.fill(1e9)

        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content

        assert time_only  # Only support time right now
        self.time_only = time_only

    def clear_pdf_eval(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.eval_hitcount_gpu.fill(0)
        self.eval_bincount_gpu.fill(0)
        self.nearest_mc_gpu.fill(1e9)

    @profile_if_possible
    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64,
                            max_blocks=10000):
        "Add the most recent results of run_daq() to the PDF evaluation."
        self.work_queues = ga.empty(
            shape=self.event_nhit * (gpuchannels.ndaq + 1), dtype=np.uint32)
        self.work_queues.fill(1)

        self.gpu_funcs.accumulate_bincount(
            np.int32(self.event_hit_gpu.size),
            np.int32(gpuchannels.ndaq),
            self.event_hit_gpu,
            self.event_time_gpu,
            gpuchannels.t,
            self.eval_hitcount_gpu,
            self.eval_bincount_gpu,
            np.float32(self.min_twidth),
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.int32(self.min_bin_content),
            self.map_channel_id_to_hit_offset_gpu,
            self.work_queues,
            block=(nthreads_per_block, 1, 1),
            grid=(self.event_hit_gpu.size // nthreads_per_block + 1, 1))
        cuda.Context.get_current().synchronize()

        self.gpu_funcs.accumulate_nearest_neighbor_block(
            np.int32(self.event_nhit),
            np.int32(gpuchannels.ndaq),
            self.map_hit_offset_to_channel_id_gpu,
            self.work_queues,
            self.event_time_gpu,
            gpuchannels.t,
            self.nearest_mc_gpu,
            np.int32(self.min_bin_content),
            block=(nthreads_per_block, 1, 1),
            grid=(self.event_nhit, 1))
        cuda.Context.get_current().synchronize()

    def get_pdf_eval(self):
        """Return ``(hitcount, pdf_value, pdf_uncertainty)`` per channel.

        Channels whose bin count reached ``min_bin_content`` use the bin
        count; other hit channels with a non-zero hit count use the
        nearest-neighbour distances.  Channels with no statistics stay 0.

        Raises NotImplementedError if ``time_only`` is false.
        """
        evhit = self.event_hit_gpu.get().astype(bool)
        hitcount = self.eval_hitcount_gpu.get()
        bincount = self.eval_bincount_gpu.get()

        pdf_value = np.zeros(len(hitcount), dtype=float)
        pdf_frac_uncert = np.zeros_like(pdf_value)

        # PDF value for high stats bins
        high_stats = (bincount >= self.min_bin_content)
        if high_stats.any():
            if self.time_only:
                pdf_value[high_stats] = \
                    bincount[high_stats].astype(float) \
                    / hitcount[high_stats] / self.min_twidth
            else:
                # This used to be ``assert Exception(...)``, which is
                # always true and never signalled anything.
                raise NotImplementedError(
                    'Unimplemented 2D (time,charge) mode!')
            pdf_frac_uncert[high_stats] = 1.0 / np.sqrt(bincount[high_stats])

        # PDF value for low stats bins
        low_stats = ~high_stats & (hitcount > 0) & evhit

        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape(
            (self.event_nhit, self.min_bin_content))
        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content),
                              dtype=np.float32)
        nearest_mc.fill(1e9)
        nearest_mc[self.map_hit_offset_to_channel_id, :] = nearest_mc_by_hit

        # Deal with the case where we did not even get min_bin_content events
        # in the PDF but also clamp the lower range to ensure we don't index
        # by a negative number in 2 lines
        last_valid_entry = np.maximum(
            0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
        distance = nearest_mc[np.arange(len(last_valid_entry)),
                              last_valid_entry]
        if low_stats.any():
            if self.time_only:
                pdf_value[low_stats] = \
                    (last_valid_entry[low_stats] + 1).astype(float) \
                    / hitcount[low_stats] / distance[low_stats] / 2.0
            else:
                raise NotImplementedError(
                    'Unimplemented 2D (time,charge) mode!')
            pdf_frac_uncert[low_stats] = \
                1.0 / np.sqrt(last_valid_entry[low_stats] + 1)

        # PDFs with no stats got zero by default during array creation

        print('high_stats: %s low_stats %s'
              % (high_stats.sum(), low_stats.sum()))

        return hitcount, pdf_value, pdf_value * pdf_frac_uncert
def optimize_layer(orig_nodes):
    '''Reorder a layer of nodes on the GPU and return the reordered layer.

    Nodes are treated as consecutive pairs (2*i, 2*i + 1).  For each pair
    the ``min_distance_to`` kernel examines up to 8192*50 following nodes;
    when it raises the mapped flag, the candidate from the block with the
    smallest reported area is swapped into the second slot of the pair.
    Progress (summed pair area, skips, swaps) is printed every 10000 pairs
    and once at the end.

    ``orig_nodes``: ndarray of nodes for one layer (uploaded with
    ``ga.to_gpu``).

    Returns the nodes copied back from the GPU.
    '''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    # Floor division keeps the pair count an int on Python 3 too; it is
    # used both as an array shape and as the loop bound.
    npairs = n // 2
    areas = ga.empty(shape=npairs, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = mapped_empty(shape=skip_size, dtype=np.uint32)

    def pair_areas_host():
        # Run the pair_area kernel over all pairs and fetch the result.
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(npairs, nthreads_per_block, max_blocks=10000):
            bvh_funcs.pair_area(np.uint32(first_index),
                                np.uint32(elements_this_iter),
                                nodes,
                                areas,
                                block=(nthreads_per_block, 1, 1),
                                grid=(nblocks_this_iter, 1))
        return areas.get()

    i = 0
    skips = 0
    swaps = 0
    while i < npairs - 1:
        # How are we doing?
        if i % update == 0:
            areas_host = pair_areas_host()
            print('Area of parent layer so far (%d): %1.12e'
                  % (i * 2, areas_host.astype(float).sum()))
            print('Skips: %d, Swaps: %d' % (skips, swaps))

        test_index = i * 2

        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.min_distance_to(
                np.uint32(first_index + test_index + 2),
                np.uint32(elements_this_iter),
                np.uint32(test_index),
                nodes,
                np.uint32(blocks),
                min_areas,
                min_index,
                Mapped(flag),
                block=(nthreads_per_block, 1, 1),
                grid=(nblocks_this_iter, skip_this_round))
            blocks += nblocks_this_iter
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            # The first pair needs no swap; jump to the first flagged
            # entry, or past the whole window when nothing was flagged.
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = int(flag_nonzero[0])
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1), np.uint32(better_i),
                       nodes, block=(1, 1, 1), grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    areas_host = pair_areas_host()

    print('Final area of parent layer: %1.12e' % areas_host.sum())
    print('Skips: %d, Swaps: %d' % (skips, swaps))

    return nodes.get()
class GPUPhotonsHit(object):
    """Photon state held in GPU arrays, driven by the ``propagate_hit`` kernel.

    The per-photon arrays (``pos``, ``dir``, ``pol``, ``wavelengths``, ``t``,
    ``last_hit_triangles``, ``flags``, ``weights``) each hold
    ``len(photons) * ncopies`` entries.
    """

    # GPU-side attributes dropped by __del__ before forcing a collection.
    _GPU_ATTRS = ('pos', 'dir', 'pol', 'wavelengths', 't', 'flags',
                  'last_hit_triangles', 'weights',
                  'input_queue_gpu', 'output_queue_gpu')

    def __init__(self, photons, ncopies=1, max_time=4.):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
               - max_time: float, *optional*
                   Threshold compared against each kernel launch time in
                   ``propagate_hit``; once exceeded, the remaining launches
                   of that pass are skipped.
        """
        module = get_cu_module('propagate_hit.cu', options=cuda_options)
        propagate_hit_kernel = module.get_function('propagate_hit')
        # Argument signature of the prepared call: 2 ints, 11 pointers,
        # 3 ints, 3 pointers (must match the tuple built in propagate_hit).
        propagate_hit_kernel.prepare('iiPPPPPPPPPPPiiiPPP')
        self.propagate_hit_kernel = propagate_hit_kernel
        self.gpu_funcs = GPUFuncs(module)

        self.max_time = max_time
        self.ncopies = ncopies
        self.true_nphotons = len(photons)
        self.marshall_photons(photons, ncopies)

    def marshall_photons_npl(self, npl):
        """Placeholder; currently does nothing."""
        pass

    def marshall_photons(self, photons, ncopies):
        """Allocate the GPU arrays and upload ``photons`` into them.

        The provided photons are assigned to the beginning of each array
        (possibly the entire array if ``ncopies`` is 1).  When
        ``ncopies > 1`` the ``photon_duplicate`` kernel is launched to fill
        the remaining slots.
        """
        nphotons = len(photons)
        ntotal = nphotons * ncopies

        self.pos = ga.empty(shape=ntotal, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=ntotal, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=ntotal, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=ntotal, dtype=np.float32)
        self.t = ga.empty(shape=ntotal, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=ntotal, dtype=np.int32)
        self.flags = ga.empty(shape=ntotal, dtype=np.uint32)
        self.weights = ga.empty(shape=ntotal, dtype=np.float32)

        # Upload the originals into the first nphotons slots.
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(
            photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            block = (nthreads_per_block, 1, 1)
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nphotons, nthreads_per_block, max_blocks):
                grid = (blocks, 1)
                args = (
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                )
                self.gpu_funcs.photon_duplicate(*args, block=block, grid=grid)

    def get(self, npl=0, hit=0):
        """Copy the photon state back from the GPU.

        Args:
            npl: when truthy, pack everything into an ``(N, 4, 4)`` float32
                array viewed as ``NPY``; otherwise return ``event.Photons``.
            hit: only used when ``npl`` is truthy; when truthy, keep only
                the rows whose pmtid is greater than zero.

        Layout of each 4x4 item when ``npl`` is set::

            [0] pos.x  pos.y  pos.z  t
            [1] dir.x  dir.y  dir.z  wavelength
            [2] pol.x  pol.y  pol.z  weight
            [3] photon_id  0  flags  pmtid

        Row 3 holds integer bit patterns reinterpreted as float32, so read
        it back with an integer view.
        """
        log.info("get npl:%d hit:%d " % (npl, hit))
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3))
        direction = self.dir.get().view(np.float32).reshape(
            (len(self.dir), 3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()

        if not npl:
            # the old way
            return event.Photons(pos, direction, pol, wavelengths, t,
                                 last_hit_triangles, flags, weights)

        nall = len(pos)
        a = np.zeros((nall, 4, 4), dtype=np.float32)

        a[:, 0, :3] = pos
        a[:, 0, 3] = t

        a[:, 1, :3] = direction
        a[:, 1, 3] = wavelengths

        a[:, 2, :3] = pol
        a[:, 2, 3] = weights

        assert len(last_hit_triangles) == len(flags)

        pmtid = np.zeros(nall, dtype=np.int32)

        # a kludge setting of pmtid into lht using the map argument of
        # propagate_hit.cu
        SURFACE_DETECT = 0x1 << 2
        detected = np.where(flags & SURFACE_DETECT)
        # sparsely populate, leaving zeros for undetected
        pmtid[detected] = last_hit_triangles[detected]

        a[:, 3, 0] = np.arange(nall, dtype=np.int32).view(a.dtype)  # photon_id
        a[:, 3, 1] = 0  # used in comparison against vbo prop
        a[:, 3, 2] = flags.view(a.dtype)  # history flags
        a[:, 3, 3] = pmtid.view(a.dtype)  # channel_id ie PmtId

        if hit:
            return a[pmtid > 0].view(NPY)
        return a.view(NPY)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in range(self.ncopies):
            window = slice(self.true_nphotons * i,
                           self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window])

    def upload_queues(self, nwork):
        """Build the initial input/output queues and upload them.

        Photons are ordered in the input queue so that the clones of a
        photon sit next to each other.

        #. input_queue has ``nwork + 1`` slots: slot 0 is set to 0 and
           slots ``1 + copy::ncopies`` receive the photon ids of each copy
        #. output_queue starts as [1,0,0,0,0,....]
        """
        input_queue = np.empty(shape=nwork + 1, dtype=np.uint32)
        input_queue[0] = 0

        for copy in range(self.ncopies):
            # Interleave: ids of copy k land at stride ncopies, offset k.
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons

        output_queue = np.zeros(shape=nwork + 1, dtype=np.uint32)
        output_queue[0] = 1

        self.input_queue_gpu = ga.to_gpu(input_queue)
        self.output_queue_gpu = ga.to_gpu(output_queue)

    def swap_queues(self):
        """
        Swaps queues and returns photons remaining to propagate

        #. output_queue[0] = 1 initially, this avoids enqueued photon_id
           stomping on output_queue[0] as atomicAdd returns the
           non-incremented value::

             // Not done, put photon in output queue
             if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB | NAN_ABORT)) == 0)
             {
                 // pulling queue ticket: atomic add 1 to slot zero,
                 // returns the original value
                 int out_idx = atomicAdd(output_queue, 1);
                 output_queue[out_idx] = photon_id;
             }

        #. At kernel tail non-completed photon threads enqueue their
           photon_id into a slot in the output_queue.  The slot to use is
           obtained by atomic incrementing output_queue[0], ensuring an
           orderly queue.

        #. After the kernel completes, output_queue[0] - 1 is the number of
           photon_id enqueued in output_queue[1:]; that is the value
           returned here.
        """
        self.input_queue_gpu, self.output_queue_gpu = (
            self.output_queue_gpu, self.input_queue_gpu)
        # Reset the ticket counter of the (new) output queue to 1.
        self.output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
        # input_queue_gpu was the output queue until a moment ago.
        slot0minus1 = self.input_queue_gpu[:1].get()[0] - 1
        log.debug("swap_queues slot0minus1 %s " % slot0minus1)
        return slot0minus1

    @profile_if_possible
    def propagate_hit(self, gpu_geometry, rng_states, parameters):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        Args:
            gpu_geometry: object providing ``gpudata``, ``solid_id_map`` and
                ``solid_id_to_channel_id_gpu``.
            rng_states: passed straight to the kernel.
            parameters: mapping with keys ``threads_per_block``,
                ``max_blocks`` and ``max_steps``.

        Returns:
            dict of bookkeeping values (photon counts, number of passes,
            aborted launches and launch timings) plus a ``COLUMNS`` string
            describing them.  The timing extrema are 0.0 when no kernel was
            launched.

        ..warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.

        got one abort::

            In [1]: a = ph("hhMOCK")

            In [9]: f = a[:,3,2].view(np.uint32)

            In [12]: np.where( f & 1<<31 )
            Out[12]: (array([279]),)

        failed to just mock that one::

            RANGE=279:280 MockNuWa MOCK

        """
        nphotons = self.pos.size
        nwork = nphotons

        nthreads_per_block = parameters['threads_per_block']
        max_blocks = parameters['max_blocks']
        max_steps = parameters['max_steps']
        use_weights = False
        scatter_first = 0

        self.upload_queues(nwork)

        solid_id_map_gpu = gpu_geometry.solid_id_map
        solid_id_to_channel_id_gpu = gpu_geometry.solid_id_to_channel_id_gpu

        small_remainder = nthreads_per_block * 16 * 8
        block = (nthreads_per_block, 1, 1)

        results = {}
        results['name'] = "propagate_hit"
        results['nphotons'] = nphotons
        results['nwork'] = nwork
        results['nsmall'] = small_remainder
        results['COLUMNS'] = "name:s,nphotons:i,nwork:i,nsmall:i"

        step = 0
        times = []
        npass = 0
        nabort = 0

        while step < max_steps:
            npass += 1
            if nwork < small_remainder or use_weights:
                # Just finish the rest of the steps if the # of photons
                # is low
                nsteps = max_steps - step
                log.debug(
                    "increase nsteps for stragglers: small_remainder %s nwork %s nsteps %s max_steps %s "
                    % (small_remainder, nwork, nsteps, max_steps))
            else:
                nsteps = 1

            log.info("nphotons %s nwork %s step %s max_steps %s nsteps %s " %
                     (nphotons, nwork, step, max_steps, nsteps))

            abort = False
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nwork, nthreads_per_block, max_blocks):
                if abort:
                    # A previous launch in this pass ran too long: skip the
                    # remaining chunks but count them.
                    nabort += 1
                    continue

                grid = (blocks, 1)
                args = (
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    # Slot 0 of the input queue is not a photon id.
                    self.input_queue_gpu[1:].gpudata,
                    self.output_queue_gpu.gpudata,
                    rng_states,
                    self.pos.gpudata,
                    self.dir.gpudata,
                    self.wavelengths.gpudata,
                    self.pol.gpudata,
                    self.t.gpudata,
                    self.flags.gpudata,
                    self.last_hit_triangles.gpudata,
                    self.weights.gpudata,
                    np.int32(nsteps),
                    np.int32(use_weights),
                    np.int32(scatter_first),
                    gpu_geometry.gpudata,
                    solid_id_map_gpu.gpudata,
                    solid_id_to_channel_id_gpu.gpudata,
                )

                log.info(
                    "propagate_hit_kernel.prepared_timed_call grid %s block %s first_photon %s photons_this_round %s "
                    % (repr(grid), repr(block), first_photon,
                       photons_this_round))
                get_time = self.propagate_hit_kernel.prepared_timed_call(
                    grid, block, *args)
                t = get_time()
                times.append(t)
                if t > self.max_time:
                    abort = True
                    log.warn(
                        "kernel launch time %s > max_time %s : ABORTING " %
                        (t, self.max_time))

            log.info("step %s propagate_hit_kernel times %s " %
                     (step, repr(times)))

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            if step < max_steps:
                nwork = self.swap_queues()

        log.info("calling max ")
        if ga.max(self.flags).get() & (1 << 31):
            log.warn("ABORTED PHOTONS")
        log.info("done calling max ")

        cuda.Context.get_current().synchronize()

        results['npass'] = npass
        results['nabort'] = nabort
        results['nlaunch'] = len(times)
        results['tottime'] = sum(times)
        # max()/min() raise ValueError on an empty list, which happens when
        # no kernel was launched at all (e.g. max_steps of 0).
        results['maxtime'] = max(times) if times else 0.0
        results['mintime'] = min(times) if times else 0.0
        results[
            'COLUMNS'] += ",npass:i,nabort:i,nlaunch:i,tottime:f,maxtime:f,mintime:f"
        return results

    @profile_if_possible
    def select(self,
               target_flag,
               nthreads_per_block=64,
               max_blocks=1024,
               start_photon=None,
               nphotons=None):
        '''Return a new GPUPhotonsHitSlice object containing only photons
        that have a particular bit set in their history word.

        Args:
            target_flag: flag value handed to the counting/copy kernels.
            nthreads_per_block, max_blocks: launch configuration.
            start_photon: first photon to consider (defaults to 0).
            nphotons: number of photons to consider (defaults to
                everything from ``start_photon`` to the end).
        '''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        block = (nthreads_per_block, 1, 1)

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                index_counter_gpu,
                self.flags,
                block=block,
                grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        direction = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            # The counter is reused as the write cursor of the copy kernel.
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    pos, direction, wavelengths, pol, t,
                    flags, last_hit_triangles, weights,
                    block=block,
                    grid=(blocks, 1))
            # Both passes must agree on how many photons matched.
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsHitSlice(pos, direction, pol, wavelengths, t,
                                  last_hit_triangles, flags, weights)

    def __del__(self):
        """Drop the references to the GPU arrays and force a collection.

        Attributes that were never set (for instance when ``__init__``
        failed part-way) are skipped instead of raising.
        """
        for name in self._GPU_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        """Total number of photon slots on the GPU (all copies)."""
        return self.pos.size