Example #1
    def __init__(self,
                 gpu_detector,
                 ntdcs=None,
                 ns_per_tdc=None,
                 adc_bits=None,
                 ndaq=1,
                 cl_context=None,
                 cl_queue=None):
        """constructor.
        
        Args:
          gpu_detector: GPUDetector
        Keywords:
          ntdcs: int
            number of time bins per channel
            if not supplied, using class variable value
          ns_per_tdc: float
            nanoseconds per time bin
            if not supplied, using class variable value
          adc_bits:  int
            number of ADC bits (not used yet)
          ndaq: int
            number of daqs
          cl_context: pyopencl.Context
          cl_queue: pyopencl.CommandQueue
        Raises:
          ValueError when ntdcs and ns_per_tdc are found to be NoneType
        """
        if ntdcs is None:
            self.ntdcs = GPUDaqLAr1ND.NTDC
        else:
            self.ntdcs = ntdcs
        if ns_per_tdc is None:
            self.ns_per_tdc = GPUDaqLAr1ND.NS_PER_TDC
        else:
            self.ns_per_tdc = ns_per_tdc
        super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                           ntdcs=self.ntdcs,
                                           ns_per_tdc=self.ns_per_tdc,
                                           adc_bits=adc_bits,
                                           ndaq=ndaq,
                                           cl_context=cl_context,
                                           cl_queue=cl_queue)
        if self.ntdcs is None:
            raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
        if self.ns_per_tdc is None:
            raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

        kernel_filepath = os.path.dirname(
            os.path.realpath(__file__)) + "/daq_lar1nd"
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.gpu_funcs = GPUFuncs(self.module)
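
A hypothetical instantiation of the class above, assuming `gpu_det` is an already-constructed GPUDetector; the binning values are illustrative only:

# Override the time binning explicitly instead of relying on the
# NTDC / NS_PER_TDC class defaults (values here are placeholders).
daq = GPUDaqLAr1ND(gpu_det, ntdcs=1000, ns_per_tdc=1.0)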
Example #2
 def __init__(self, cl_context=None):
     if api.is_gpu_api_cuda():
         self.module = cutools.get_cu_module('pdf.cu',
                                             options=api_options,
                                             include_source_directory=True)
     elif api.is_gpu_api_opencl():
         self.module = cltools.get_cl_module('pdf.cl',
                                             cl_context,
                                             options=api_options,
                                             include_source_directory=True)
     else:
         raise RuntimeError("GPU API is neither CUDA nor OpenCL")
     self.gpu_funcs = GPUFuncs(self.module)
Example #3
 def setUp(self):
     self.context = cltools.get_last_context()
     self.nthreads_per_block = 256
     self.myoptions = ('-I.', ) + api_options
     self.mod = get_module("test_sample_cdf.cl",
                           self.context,
                           options=self.myoptions,
                           include_source_directory=True)
     self.funcs = GPUFuncs(self.mod)
     self.rng_states = clrand.get_rng_states(self.context,
                                             self.nthreads_per_block)
     self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")
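
A possible companion tearDown for the fixture above (a sketch only; the cleanup the original test performs is not shown in this excerpt):

 def tearDown(self):
     # write any histograms and release the ROOT file opened in setUp
     self.outf.Write()
     self.outf.Close()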
Example #4
def area_sort_nodes(gpu_geometry, layer_bounds):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        bvh_funcs.area_sort_child(np.uint32(start),
                                  np.uint32(end),
                                  gpu_geometry,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(120, 1))
    return gpu_geometry.nodes.get()
Example #5
def fill_array(context, rng_states, size):
    queue = cl.CommandQueue(context)
    out_gpu = cl.array.empty(queue, size, dtype=np.float32)
    randmod = get_cl_module("random.cl",
                            context,
                            options=cl_options,
                            include_source_directory=True)
    randfuncs = GPUFuncs(randmod)
    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in chunk_iterator(
            size, nthreads_per_block, max_blocks=1):
        randfuncs.fillArray(queue, (elements_this_iter, 1, 1), None,
                            np.uint32(first_index), rng_states.data,
                            out_gpu.data)
    out = out_gpu.get()
    return out
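
A minimal driver sketch for fill_array, reusing the OpenCL helpers that appear in Example #3 (cltools.get_last_context and clrand.get_rng_states); the sample count is arbitrary:

# Assumes an OpenCL context has already been set up elsewhere.
context = cltools.get_last_context()
rng_states = clrand.get_rng_states(context, 256)
samples = fill_array(context, rng_states, 100000)
# samples is a host-side numpy array of float32 random values
assert samples.dtype == np.float32 and len(samples) == 100000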
Example #6
    def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.empty(gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros_like(
                self.earliest_time_int_gpu)
            self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
            self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu),
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector.detector_gpu
            self.module = cutools.get_cu_module('daq.cu',
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.earliest_time_gpu = ga.empty(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(cl_queue,
                                                  gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros(cl_queue,
                                                gpu_detector.nchannels * ndaq,
                                                dtype=np.uint32)
            self.channel_q_int_gpu = ga.zeros(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.uint32)
            self.channel_q_gpu = ga.zeros(cl_queue,
                                          gpu_detector.nchannels * ndaq,
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector  # struct not made in opencl mode, so we keep a copy of the class
            self.module = cltools.get_cl_module('daq.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels
Example #7
    def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
                 flags, weights):
        '''Create new object using slices of GPUArrays from an instance
        of GPUPhotons.  NOTE THESE ARE NOT CPU ARRAYS!'''
        self.pos = pos
        self.dir = dir
        self.pol = pol
        self.wavelengths = wavelengths
        self.t = t
        self.last_hit_triangles = last_hit_triangles
        self.flags = flags
        self.weights = weights

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        self.true_nphotons = len(pos)
        self.ncopies = 1
Example #8
def collapse_chains(nodes, layer_bounds):
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL')

    bvh_funcs = GPUFuncs(bvh_module)

    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.collapse_child(np.uint32(start),
                                     np.uint32(end),
                                     gpu_nodes,
                                     block=(nthreads_per_block, 1, 1),
                                     grid=(120, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.collapse_child(queue, (end - start, 1, 1), None,
                                     np.uint32(start), np.uint32(end),
                                     gpu_nodes.data).wait()

    return gpu_nodes.get()
Example #9
class GPUPhotons(object):
    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def define_texture_references(self, module=None):
        # unbound texture references declared for use with propagate
        if module is None:
            module = self.module
        if api.is_gpu_api_cuda():
            self.node_texture_ref = module.get_texref("nodevec_tex_ref")
            self.node_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32,
                                             4)

            self.extra_node_texture_ref = module.get_texref(
                "extra_node_tex_ref")
            self.extra_node_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            self.vertices_texture_ref = module.get_texref(
                "verticesvec_tex_ref")
            self.vertices_texture_ref.set_format(cuda.array_format.FLOAT, 4)

            self.triangles_texture_ref = module.get_texref(
                "trianglesvec_tex_ref")
            self.triangles_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            self.node_texture_ref_bound = False
        elif api.is_gpu_api_opencl():
            # texture usage not used at the moment
            pass

    def get(self):
        ncols = 3
        if api.is_gpu_api_opencl():
            ncols = 4  # must include padding
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), ncols))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), ncols))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), ncols))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                             flags, weights)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in xrange(self.ncopies):
            window = slice(self.true_nphotons * i,
                           self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window])

    @profile_if_possible
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  cl_context=None):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must contain at least
            `nthreads_per_block` * `max_blocks` curandStates.
        """
        nphotons = self.pos.size
        # bind node texture reference
        if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
            # we have to unroll, as pycuda doesn't seem to support vector types right now for binding
            self.unrolled_nodes = ga.to_gpu(
                gpu_geometry.nodes.get().ravel().view(np.uint32))
            self.unrolled_extra_nodes = ga.to_gpu(
                gpu_geometry.extra_nodes.ravel().view(np.uint32))
            self.unrolled_triangles = ga.to_gpu(
                gpu_geometry.triangles.get().ravel().view(np.uint32))
            self.unrolled_triangles4 = ga.to_gpu(
                gpu_geometry.triangles4.ravel().view(np.uint32))
            self.unrolled_vertices = ga.to_gpu(
                gpu_geometry.vertices.get().ravel().view(np.float32))
            self.unrolled_vertices4 = ga.to_gpu(
                gpu_geometry.vertices4.ravel().view(np.float32))
            self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                              self.unrolled_nodes.nbytes)
            self.extra_node_texture_ref.set_address(
                self.unrolled_extra_nodes.gpudata,
                self.unrolled_extra_nodes.nbytes)
            #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
            #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
            #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
            self.triangles_texture_ref.set_address(
                self.unrolled_triangles4.gpudata,
                self.unrolled_triangles4.nbytes)
            #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
            self.vertices_texture_ref.set_address(
                self.unrolled_vertices4.gpudata,
                self.unrolled_vertices4.nbytes)
            print "[BOUND TO TEXTURE MEMORY]"
            print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
            print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
            print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
            print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
            print "Total: ", (self.unrolled_nodes.nbytes +
                              self.unrolled_extra_nodes.nbytes +
                              self.unrolled_triangles4.nbytes +
                              self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
            self.node_texture_ref_bound = True

        # setup queue
        maxqueue = nphotons
        step = 0
        input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in xrange(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        if api.is_gpu_api_cuda():
            input_queue_gpu = ga.to_gpu(input_queue)
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            input_queue_gpu = ga.to_device(comqueue,
                                           input_queue[1:])  # why the offset?

        output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
        output_queue[0] = 1
        if api.is_gpu_api_cuda():
            output_queue_gpu = ga.to_gpu(output_queue)
        elif api.is_gpu_api_opencl():
            output_queue_gpu = ga.to_device(comqueue, output_queue)

        if use_weights:
            iuse_weights = 1
        else:
            iuse_weights = 0

        adapt_factor = 1.0
        start_prop = time.time()
        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
            #    nsteps = max_steps - step
            #else:
            #    nsteps = 1
            nsteps = 1

            start_step = time.time()
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max( int(adapt_factor*max_blocks), 1 )):
                #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
                start_chunk = time.time()
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.propagate(np.int32(first_photon),
                                             np.int32(photons_this_round),
                                             input_queue_gpu[1:],
                                             output_queue_gpu,
                                             rng_states,
                                             self.pos,
                                             self.dir,
                                             self.wavelengths,
                                             self.pol,
                                             self.t,
                                             self.flags,
                                             self.last_hit_triangles,
                                             self.weights,
                                             np.int32(nsteps),
                                             np.int32(iuse_weights),
                                             np.int32(scatter_first),
                                             gpu_geometry.gpudata,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
                    #cuda.Context.get_current().synchronize()
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.propagate(
                        comqueue, (photons_this_round, 1, 1),
                        None,
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        input_queue_gpu.data,
                        output_queue_gpu.data,
                        rng_states.data,
                        self.pos.data,
                        self.dir.data,
                        self.wavelengths.data,
                        self.pol.data,
                        self.t.data,
                        self.flags.data,
                        self.last_hit_triangles.data,
                        self.weights.data,
                        np.int32(nsteps),
                        np.int32(iuse_weights),
                        np.int32(scatter_first),
                        gpu_geometry.world_scale,
                        gpu_geometry.world_origin.data,
                        np.int32(len(gpu_geometry.nodes)),
                        gpu_geometry.material_data['n'],
                        gpu_geometry.material_data['step'],
                        gpu_geometry.material_data["wavelength0"],
                        gpu_geometry.vertices.data,
                        gpu_geometry.triangles.data,
                        gpu_geometry.material_codes.data,
                        gpu_geometry.colors.data,
                        gpu_geometry.nodes.data,
                        gpu_geometry.extra_nodes.data,
                        gpu_geometry.material_data["nmaterials"],
                        gpu_geometry.material_data['refractive_index'].data,
                        gpu_geometry.material_data['absorption_length'].data,
                        gpu_geometry.material_data['scattering_length'].data,
                        gpu_geometry.material_data['reemission_prob'].data,
                        gpu_geometry.material_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['nsurfaces'],
                        gpu_geometry.surface_data['detect'].data,
                        gpu_geometry.surface_data['absorb'].data,
                        gpu_geometry.surface_data['reemit'].data,
                        gpu_geometry.surface_data['reflect_diffuse'].data,
                        gpu_geometry.surface_data['reflect_specular'].data,
                        gpu_geometry.surface_data['eta'].data,
                        gpu_geometry.surface_data['k'].data,
                        gpu_geometry.surface_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['model'].data,
                        gpu_geometry.surface_data['transmissive'].data,
                        gpu_geometry.surface_data['thickness'].data,
                        gpu_geometry.surface_data['nplanes'].data,
                        gpu_geometry.surface_data['wire_diameter'].data,
                        gpu_geometry.surface_data['wire_pitch'].data,
                        g_times_l=True).wait()
                end_chunk = time.time()
                chunk_time = end_chunk - start_chunk
                #print "chunk time: ",chunk_time
                #if chunk_time>2.5:
                #    adapt_factor *= 0.5
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            end_step = time.time()
            #print "step time: ",end_step-start_step

            if step < max_steps:
                start_requeue = time.time()
                #print "reset photon queues"
                if api.is_gpu_api_cuda():
                    cuda.Context.get_current().synchronize(
                    )  # ensure all threads done
                    #temp = input_queue_gpu
                    #input_queue_gpu = output_queue_gpu
                    #output_queue_gpu = temp
                    # Assign with a numpy array of length 1 to silence
                    # warning from PyCUDA about setting array with different strides/storage orders.
                    #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                    #nphotons = input_queue_gpu[:1].get()[0] - 1
                    # new style
                    output_queue_gpu.get(output_queue)
                    nphotons = output_queue[0] - 1
                    input_queue_gpu.set(output_queue)
                    output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))

                elif api.is_gpu_api_opencl():
                    temp_out = output_queue_gpu.get()
                    nphotons = temp_out[0]
                    input_queue_gpu.set(
                        temp_out[1:], queue=comqueue
                    )  # set the input queue to have index of photons still need to be run
                    output_queue_gpu[:1].set(
                        np.ones(shape=1, dtype=np.uint32),
                        queue=comqueue)  # reset first instance to be one
                end_requeue = time.time()
                #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
                if nphotons == 0:
                    break

        end_prop = time.time()
        print "propagation time: ", end_prop - start_prop, " secs"
        end_flags = self.flags.get()
        end_flag = np.max(end_flags)
        if end_flag & (1 << 31):
            print >> sys.stderr, "WARNING: ABORTED PHOTONS"
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    @profile_if_possible
    def select(self,
               target_flag,
               nthreads_per_block=64,
               max_blocks=1024,
               start_photon=None,
               nphotons=None):
        '''Return a new GPUPhotonsSlice object containing only photons that
        have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon + first_photon),
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu,
                                         self.flags,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon +
                                                     first_photon),
                                            np.int32(photons_this_round),
                                            np.uint32(target_flag),
                                            index_counter_gpu,
                                            self.pos,
                                            self.dir,
                                            self.wavelengths,
                                            self.pol,
                                            self.t,
                                            self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            pos,
                                            dir,
                                            wavelengths,
                                            pol,
                                            t,
                                            flags,
                                            last_hit_triangles,
                                            weights,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights)

    def __del__(self):
        del self.pos
        del self.dir
        del self.pol
        del self.wavelengths
        del self.t
        del self.flags
        del self.last_hit_triangles
        # Free up GPU memory quickly so it is available for reuse
        gc.collect()

    def __len__(self):
        return self.pos.size
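
A minimal lifecycle sketch for the CUDA path of the class above; `photons`, `gpu_geometry`, and `rng_states` are placeholders for a chroma-style Photons instance, a GPUGeometry, and a curandState array sized for nthreads_per_block * max_blocks (none of them are constructed in this excerpt):

gpu_photons = GPUPhotons(photons, ncopies=2)   # load the event twice
gpu_photons.propagate(gpu_geometry, rng_states,
                      nthreads_per_block=64, max_blocks=1024,
                      max_steps=10)
host_photons = gpu_photons.get()               # pos, dir, pol, ... back on the host
for copy in gpu_photons.iterate_copies():
    pass  # each `copy` is a GPUPhotonsSlice viewing one replica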
Example #10
def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n / 2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = cutools.mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n / 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))

            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      cutools.Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1),
                       np.uint32(better_i),
                       nodes,
                       block=(1, 1, 1),
                       grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))

    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()
Example #11
def concatenate_layers(layers):
    nthreads_per_block = 1024
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        totsize = 0
        layer_pos = []
        print layer_bounds[-1]
        for n, layer in enumerate(layers):
            layer_pos.append(totsize)
            print "LAYER ", n, " size=", len(layer), "start=", totsize
            totsize += len(layer)
        print "totsize: ", totsize
        nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4)
        nodes_iter_gpu = ga.to_device(queue, nodes_iter_np)
        nodeset_np = []
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')

    ilayer = 0
    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset
        nmax_blocks = 10000
        if gpuapi.is_gpu_api_opencl():
            nthreads_per_block = 256
            nmax_blocks = 1
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks):
            #print "   ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start
            if gpuapi.is_gpu_api_cuda():
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            elif gpuapi.is_gpu_api_opencl():
                layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes_iter_gpu.data,
                                          g_times_l=True).wait()
            else:
                raise RuntimeError('API neither CUDA nor OpenCL?!')
        ilayer += 1

    if gpuapi.is_gpu_api_cuda():
        return nodes.get(), layer_bounds
    elif gpuapi.is_gpu_api_opencl():
        return nodes_iter_gpu.get(), layer_bounds
Example #12
def merge_nodes(nodes, degree, max_ratio=None):
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # determine number of parents
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes,
                                   cuda.In(nodes),
                                   np.uint32(0),
                                   np.uint32(len(nodes)),
                                   block=(nthreads_per_block, 1, 1),
                                   grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None,
                                   np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree), gpu_parent_nodes.data,
                                   gpu_nodes.data, np.uint32(0),
                                   np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index +
                                                                 1:-nchild + 1]
            offset += nchild - 1
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
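
A rough sketch of how merge_nodes could be combined with concatenate_layers from Example #11 to build BVH layers bottom-up; `leaf_nodes` and `DEGREE` are placeholders, and the ordering assumed here (root layer first, leaf layer last) matches the leaf-node check inside concatenate_layers:

layers = [leaf_nodes]
while len(layers[-1]) > 1:
    # each pass packs up to DEGREE children under one parent node
    layers.append(merge_nodes(layers[-1], degree=DEGREE))
layers.reverse()  # root first, leaf layer last
nodes, layer_bounds = concatenate_layers(layers)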
Example #13
    def __init__(self,
                 geometry,
                 wavelengths=None,
                 print_usage=False,
                 min_free_gpu_mem=300e6,
                 cl_context=None,
                 cl_queue=None):
        log.info("GPUGeometry.__init__ min_free_gpu_mem %s ", min_free_gpu_mem)

        self.geometry = geometry
        self.instance_count += 1
        assert self.instance_count == 1, traceback.print_stack()

        self.metadata = Metadata()
        self.metadata(None, 'preinfo')
        self.metadata('a', "start")
        self.metadata['a_min_free_gpu_mem'] = min_free_gpu_mem

        if wavelengths is None:
            self.wavelengths = standard_wavelengths
        else:
            self.wavelengths = wavelengths

        try:
            self.wavelength_step = np.unique(np.diff(self.wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        # This is where things get difficult:
        # pycuda and pyopencl give us very different methods for working with structs.
        #geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        # Note that, unfortunately, the data types returned differ between the two APIs.
        if api.is_gpu_api_cuda():
            self.material_data, self.material_ptrs, self.material_pointer_array = self._package_material_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
            self.surface_data, self.surface_ptrs, self.surface_pointer_array = self._package_surface_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
        elif api.is_gpu_api_opencl():
            self.material_data, materials_bytes_cl = self._package_material_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)
            self.surface_data, surfaces_bytes_cl = self._package_surface_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)

        self.metadata('b', "after materials,surfaces")
        if api.is_gpu_api_opencl():
            self.metadata[
                'b_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl  # opencl, we have to track this ourselves

        # Load Vertices and Triangles
        if api.is_gpu_api_cuda():
            self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                         dtype=ga.vec.float3,
                                         write_combined=True)
            self.vertices4 = np.zeros(shape=(len(self.vertices), 4),
                                      dtype=np.float32)
            self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                          dtype=ga.vec.uint3,
                                          write_combined=True)
            self.triangles4 = np.zeros(shape=(len(self.triangles), 4),
                                       dtype=np.uint32)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.vertices4[:, :-1] = self.vertices.ravel().view(
                np.float32).reshape(len(self.vertices), 3)  # for textures
            self.triangles[:] = to_uint3(geometry.mesh.triangles)
            self.triangles4[:, :-1] = self.triangles.ravel().view(
                np.uint32).reshape(len(self.triangles), 3)  # for textures
        elif api.is_gpu_api_opencl():
            self.vertices = ga.empty(cl_queue,
                                     len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3)
            self.triangles = ga.empty(cl_queue,
                                      len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.triangles[:] = to_uint3(geometry.mesh.triangles)

        if api.is_gpu_api_cuda():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
        elif api.is_gpu_api_opencl():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
            #self.world_origin = geometry.bvh.world_coords.world_origin
            self.world_origin = ga.to_device(cl_queue, self.world_origin)
            print type(self.world_origin), self.world_origin
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        # Load material and surface indices into 8-bit codes
        # check if we've reached a complexity threshold
        if len(geometry.unique_materials) >= int(0xff):
            raise ValueError(
                'Number of materials to index has hit maximum of %d' %
                (int(0xff)))
        if len(geometry.unique_surfaces) >= int(0xff):
            raise ValueError(
                'Number of surfaces to index has hit maximum of %d' %
                (int(0xff)))
        # make bit code
        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(
                              np.uint32)
        if api.is_gpu_api_cuda():
            self.material_codes = ga.to_gpu(material_codes)
        elif api.is_gpu_api_opencl():
            self.material_codes = ga.to_device(cl_queue, material_codes)

        # assign color codes
        colors = geometry.colors.astype(np.uint32)
        if api.is_gpu_api_cuda():
            self.colors = ga.to_gpu(colors)
            self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
        elif api.is_gpu_api_opencl():
            self.colors = ga.to_device(cl_queue, colors)
            self.solid_id_map = ga.to_device(
                cl_queue, geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        self.metadata('c', "after colors, idmap")
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            meshdef_nbytes_cl = self.vertices.nbytes + self.triangles.nbytes + self.world_origin.nbytes + self.world_scale.nbytes + self.material_codes.nbytes + self.colors.nbytes + self.solid_id_map.nbytes
            self.metadata[
                'c_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl
            gpu_free = gpu_total - (materials_bytes_cl + surfaces_bytes_cl +
                                    meshdef_nbytes_cl)

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(
                int((gpu_free - min_free_gpu_mem) /
                    geometry.bvh.nodes.itemsize), 100), n_nodes)
        print "split index=", split_index, " vs. total nodes=", n_nodes

        # push nodes to GPU
        if api.is_gpu_api_cuda():
            self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        elif api.is_gpu_api_opencl():
            self.nodes = ga.to_device(cl_queue,
                                      geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index))  # forbid zero size

        # left over nodes
        if api.is_gpu_api_cuda():
            self.extra_nodes = mapped_empty(shape=n_extra,
                                            dtype=geometry.bvh.nodes.dtype,
                                            write_combined=True)
        elif api.is_gpu_api_opencl():
            self.extra_nodes = ga.empty(cl_queue,
                                        shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype)

        if split_index < n_nodes:
            log.info('Splitting BVH between GPU and CPU memory at node %d' %
                     split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]
            splitting = 1
        else:
            splitting = 0

        self.metadata('d', "after nodes")
        if api.is_gpu_api_opencl():
            nodes_nbytes_cl = self.nodes.nbytes
            self.metadata[
                'd_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl + nodes_nbytes_cl
        self.metadata.array("d_nodes", geometry.bvh.nodes)
        self.metadata['d_split_index'] = split_index
        self.metadata['d_extra_nodes_count'] = n_extra
        self.metadata['d_splitting'] = splitting
        self.print_device_usage(cl_context=cl_context)

        # CUDA See if there is enough memory to put the vertices and/or triangles back on the GPU
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']
        self.metadata.array('e_triangles', self.triangles)
        if api.is_gpu_api_cuda():
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                self.triangles = ga.to_gpu(self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0
        elif api.is_gpu_api_opencl():
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                #self.triangles = ga.to_device(cl_queue,self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0

        self.metadata('e', "after triangles")
        self.metadata['e_triangles_gpu'] = ftriangles_gpu

        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']

        self.metadata.array('f_vertices', self.vertices)

        if api.is_gpu_api_cuda():
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0
        elif api.is_gpu_api_opencl():
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                #self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0

        self.metadata('f', "after vertices")
        self.metadata['f_vertices_gpu'] = vertices_gpu

        if api.is_gpu_api_cuda():
            geometry_source = cutools.get_cu_source('geometry_types.h')
            geometry_struct_size = characterize.sizeof('Geometry',
                                                       geometry_source)
            self.gpudata = make_gpu_struct(geometry_struct_size, [
                Mapped(self.vertices),
                Mapped(self.triangles), self.material_codes, self.colors,
                self.nodes,
                Mapped(self.extra_nodes), self.material_pointer_array,
                self.surface_pointer_array, self.world_origin,
                self.world_scale,
                np.int32(len(self.nodes))
            ])
        elif api.is_gpu_api_opencl():
            # No relevant way to pass struct into OpenCL kernel. We have to pass everything by arrays
            # We then build a geometry struct later in the kernel
            # provided below is example/test of passing the data
            #if True: # for debugging
            if False:
                print "loading geometry_structs.cl"
                geostructsmod = cltools.get_cl_module(
                    "geometry_structs.cl",
                    cl_context,
                    options=cltools.cl_options,
                    include_source_directory=True)
                geostructsfunc = GPUFuncs(geostructsmod)
                geostructsfunc.make_geostruct(
                    cl_queue, (3, ), None, self.vertices.data,
                    self.triangles.data, self.material_codes.data,
                    self.colors.data, self.nodes.data, self.extra_nodes.data,
                    np.int32(len(geometry.unique_materials)),
                    self.material_data['refractive_index'].data,
                    self.material_data['absorption_length'].data,
                    self.material_data['scattering_length'].data,
                    self.material_data['reemission_prob'].data,
                    self.material_data['reemission_cdf'].data,
                    np.int32(len(geometry.unique_surfaces)),
                    self.surface_data['detect'].data,
                    self.surface_data['absorb'].data,
                    self.surface_data['reemit'].data,
                    self.surface_data['reflect_diffuse'].data,
                    self.surface_data['reflect_specular'].data,
                    self.surface_data['eta'].data, self.surface_data['k'].data,
                    self.surface_data['reemission_cdf'].data,
                    self.surface_data['model'].data,
                    self.surface_data['transmissive'].data,
                    self.surface_data['thickness'].data,
                    self.surface_data['nplanes'].data,
                    self.surface_data['wire_diameter'].data,
                    self.surface_data['wire_pitch'].data,
                    self.world_origin.data, self.world_scale,
                    np.int32(len(self.nodes)), self.material_data['n'],
                    self.material_data['step'],
                    self.material_data["wavelength0"])
                cl_queue.finish()
                self.material_codes.get()
                raise RuntimeError('bail')
        if print_usage:
            self.print_device_usage(cl_context=cl_context)
        log.info(self.device_usage_str(cl_context=cl_context))
        self.metadata('g', "after geometry struct")
Example #14
def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,
                      max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory
    # unfortunately, opencl and cuda has different methods for managing memory here
    # we have to write divergent code
    if gpuapi.is_gpu_api_cuda():
        # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another.
        # no explicit requests for transfers here
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)
        #print triangles[0:10]
        #print vertices[0:10]

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes,
                                  morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        #world_origin = np.array(world_origin_np,dtype=np.float32)
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)
        #print world_origin, world_scale

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            print first_index, elements_this_iter, nblocks_this_iter
            bvh_funcs.make_leaves(
                queue,
                (nblocks_this_iter, 1, 1),
                (nthreads_per_block, 1, 1),
                #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None,
                np.uint32(first_index),
                np.uint32(elements_this_iter),
                triangles_dev.data,
                vertices_dev.data,
                world_origin,
                world_scale,
                nodes.data,
                morton_codes.data,
                g_times_l=True).wait()

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
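# A minimal usage sketch for create_leaf_nodes (hedged: assumes a GPU API has
# already been selected and, for OpenCL, that a context exists; the Mesh
# construction shown is illustrative):
#
#   from chroma.geometry import Mesh
#   verts = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
#   tris = np.array([[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3]], dtype=np.int32)
#   mesh = Mesh(verts, tris)
#   world_coords, leaf_nodes, morton_codes = create_leaf_nodes(mesh, morton_bits=16)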
Example #15
origin = geo.bvh.world_coords.world_origin

nodes = sim.gpu_geometry.nodes
extra_node = sim.gpu_geometry.extra_nodes
triangles = sim.gpu_geometry.triangles
vertices = sim.gpu_geometry.vertices
print vertices.shape
vertices4 = np.zeros((len(vertices), 4), dtype=np.float32)
print vertices.get().ravel().view(np.float32).shape
vertices4[:, :-1] = vertices.get().ravel().view(np.float32).reshape(
    len(vertices), 3)

module = get_module('test_texture.cu',
                    options=api_options,
                    include_source_directory=True)
gpu_funcs = GPUFuncs(module)
node_texture_ref = module.get_texref("node_tex_ref")
extra_node_texture_ref = module.get_texref("extra_node_tex_ref")
triangles_texture_ref = module.get_texref("triangles_tex_ref")
vertices_texture_ref = module.get_texref("vertices_tex_ref")

node_vec_texture_ref = module.get_texref("nodevec_tex_ref")
node_vec_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32, 4)

ur_nodes = nodes.get().ravel().view(np.uint32)
ur_nodes_gpu = ga.to_gpu(ur_nodes)
ur_nodes_gpu.bind_to_texref_ext(node_texture_ref)
nodes_nbytes = ur_nodes.nbytes

ur_nodes = nodes.get().ravel().view(np.uint32)
ur_nodes_vec_gpu = ga.to_gpu(ur_nodes)
Example #16
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Load Memory
    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
        gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
        gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

        nparent = len(first_child)
        gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)
        gpu_first_child = ga.to_device(queue, first_child.astype(np.int32))
        gpu_nchild = ga.to_device(queue, nchild.astype(np.int32))
        nparent = len(first_child)
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Run Kernel
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes,
                                            gpu_parent_nodes,
                                            gpu_first_child,
                                            gpu_nchild,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(nblocks_this_iter, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.make_parents_detailed(queue, (elements_this_iter, 1, 1),
                                            None, np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes.data,
                                            gpu_parent_nodes.data,
                                            gpu_first_child.data,
                                            gpu_nchild.data).wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return gpu_parent_nodes.get()
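# A minimal sketch of calling merge_nodes_detailed (hedged, illustrative values;
# `leaf_nodes` would typically be the node array returned by create_leaf_nodes):
#
#   first_child = np.array([0, 3], dtype=np.int32)                # parent 0 -> leaves 0..2
#   nchild = np.array([3, len(leaf_nodes) - 3], dtype=np.int32)   # parent 1 -> the rest
#   parent_nodes = merge_nodes_detailed(leaf_nodes, first_child, nchild)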
Example #17
class GPUPDF(object):
    def __init__(self, cl_context=None):
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module('pdf.cu',
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module('pdf.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
        """Setup GPU arrays to hold PDF information.

           nchannels: int, number of channels
           tbins: number of time bins
           trange: tuple of (min, max) time in PDF
           qbins: number of charge bins
           qrange: tuple of (min, max) charge in PDF
        """
        self.events_in_histogram = 0
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.pdf_gpu = ga.zeros(shape=(nchannels, tbins, qbins),
                                dtype=np.uint32)
        self.tbins = tbins
        self.trange = trange
        self.qbins = qbins
        self.qrange = qrange

    def clear_pdf(self):
        """Rezero the PDF counters."""
        self.hitcount_gpu.fill(0)
        self.pdf_gpu.fill(0)

    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
        self.gpu_funcs.bin_hits(
            np.int32(len(self.hitcount_gpu)),
            gpuchannels.q,
            gpuchannels.t,
            self.hitcount_gpu,
            np.int32(self.tbins),
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.int32(self.qbins),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.pdf_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

        self.events_in_histogram += 1

    def get_pdfs(self):
        """Returns the 1D hitcount array and the 3D [channel, time, charge]
        histogram."""
        return self.hitcount_gpu.get(), self.pdf_gpu.get()

    def setup_pdf_eval(self,
                       event_hit,
                       event_time,
                       event_charge,
                       min_twidth,
                       trange,
                       min_qwidth,
                       qrange,
                       min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.

            min_twidth: float
              Minimum bin size in the time dimension
            trange: (float, float)
              Range of time dimension in PDF
            min_qwidth: float
              Minimum bin size in charge dimension
            qrange: (float, float)
              Range of charge dimension in PDF
            min_bin_content: int
              The bin will be expanded to include at least this many events
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.event_nhit = count_nonzero(event_hit)

        # Define a mapping from an array of len(event_hit) to an array of length event_nhit
        self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(
            np.uint32)
        self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(
            self.map_hit_offset_to_channel_id)
        self.map_channel_id_to_hit_offset = np.maximum(0,
                                                       event_hit.cumsum() -
                                                       1).astype(np.uint32)
        self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(
            self.map_channel_id_to_hit_offset)
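        # Worked example of these maps (illustrative): for event_hit = [0, 1, 0, 1],
        #   map_hit_offset_to_channel_id = [1, 3]        (hit offset -> channel id)
        #   map_channel_id_to_hit_offset = [0, 0, 0, 1]  (entries for channels that
        #                                                 were not hit are never used)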

        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content,
                                       dtype=np.float32)
        self.nearest_mc_gpu.fill(1e9)

        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content

        assert time_only  # Only support time right now
        self.time_only = time_only

    def clear_pdf_eval(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.eval_hitcount_gpu.fill(0)
        self.eval_bincount_gpu.fill(0)
        self.nearest_mc_gpu.fill(1e9)

    @profile_if_possible
    def accumulate_pdf_eval(self,
                            gpuchannels,
                            nthreads_per_block=64,
                            max_blocks=10000,
                            cl_queue=None):
        "Add the most recent results of run_daq() to the PDF evaluation."
        if api.is_gpu_api_cuda():
            self.work_queues = ga.empty(shape=self.event_nhit *
                                        (gpuchannels.ndaq + 1),
                                        dtype=np.uint32)
        elif api.is_gpu_api_opencl():
            self.work_queues = ga.empty(cl_queue,
                                        shape=self.event_nhit *
                                        (gpuchannels.ndaq + 1),
                                        dtype=np.uint32)
        self.work_queues.fill(1)

        if api.is_gpu_api_cuda():
            self.gpu_funcs.accumulate_bincount(
                np.int32(self.event_hit_gpu.size),
                np.int32(gpuchannels.ndaq),
                self.event_hit_gpu,
                self.event_time_gpu,
                gpuchannels.t,
                self.eval_hitcount_gpu,
                self.eval_bincount_gpu,
                np.float32(self.min_twidth),
                np.float32(self.trange[0]),
                np.float32(self.trange[1]),
                np.int32(self.min_bin_content),
                self.map_channel_id_to_hit_offset_gpu,
                self.work_queues,
                block=(nthreads_per_block, 1, 1),
                grid=(self.event_hit_gpu.size // nthreads_per_block + 1, 1))
            cuda.Context.get_current().synchronize()

            self.gpu_funcs.accumulate_nearest_neighbor_block(
                np.int32(self.event_nhit),
                np.int32(gpuchannels.ndaq),
                self.map_hit_offset_to_channel_id_gpu,
                self.work_queues,
                self.event_time_gpu,
                gpuchannels.t,
                self.nearest_mc_gpu,
                np.int32(self.min_bin_content),
                block=(nthreads_per_block, 1, 1),
                grid=(self.event_nhit, 1))
            cuda.Context.get_current().synchronize()

        elif api.is_gpu_api_opencl():
            self.gpu_funcs.accumulate_bincount(
                cl_queue, (nthreads_per_block, 1, 1),
                (self.event_hit_gpu.size // nthreads_per_block + 1, 1),
                np.int32(gpuchannels.ndaq),
                self.event_hit_gpu.data,
                self.event_time_gpu.data,
                gpuchannels.t.data,
                self.eval_hitcount_gpu.data,
                self.eval_bincount_gpu.data,
                np.float32(self.min_twidth),
                np.float32(self.trange[0]),
                np.float32(self.trange[1]),
                np.int32(self.min_bin_content),
                self.map_channel_id_to_hit_offset_gpu.data,
                self.work_queues.data,
                g_times_l=True)
            #cl.enqueue_barrier( cl_queue )
            self.gpu_funcs.accumulate_nearest_neighbor_block(
                cl_queue, (nthreads_per_block, 1, 1), (self.event_nhit, 1),
                np.int32(self.event_nhit),
                np.int32(gpuchannels.ndaq),
                self.map_hit_offset_to_channel_id_gpu.data,
                self.work_queues.data,
                self.event_time_gpu.data,
                gpuchannels.t.data,
                self.nearest_mc_gpu.data,
                np.int32(self.min_bin_content),
                g_times_l=True)
            #cl.enqueue_barrier( cl_queue )

    def get_pdf_eval(self):
        evhit = self.event_hit_gpu.get().astype(bool)
        hitcount = self.eval_hitcount_gpu.get()
        bincount = self.eval_bincount_gpu.get()

        pdf_value = np.zeros(len(hitcount), dtype=float)
        pdf_frac_uncert = np.zeros_like(pdf_value)

        # PDF value for high stats bins
        high_stats = (bincount >= self.min_bin_content)
        if high_stats.any():
            if self.time_only:
                pdf_value[high_stats] = bincount[high_stats].astype(
                    float) / hitcount[high_stats] / self.min_twidth
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[high_stats] = 1.0 / np.sqrt(bincount[high_stats])

        # PDF value for low stats bins
        low_stats = ~high_stats & (hitcount > 0) & evhit

        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape(
            (self.event_nhit, self.min_bin_content))
        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content),
                              dtype=np.float32)
        nearest_mc.fill(1e9)
        nearest_mc[self.map_hit_offset_to_channel_id, :] = nearest_mc_by_hit

        # Deal with the case where we did not even get min_bin_content events
        # in the PDF but also clamp the lower range to ensure we don't index
        # by a negative number in 2 lines
        last_valid_entry = np.maximum(
            0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
        distance = nearest_mc[np.arange(len(last_valid_entry)),
                              last_valid_entry]
        if low_stats.any():
            if self.time_only:
                pdf_value[low_stats] = (
                    last_valid_entry[low_stats] + 1).astype(float) / hitcount[
                        low_stats] / distance[low_stats] / 2.0
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[low_stats] = 1.0 / np.sqrt(
                last_valid_entry[low_stats] + 1)

        # PDFs with no stats got zero by default during array creation

        print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum()
        return hitcount, pdf_value, pdf_value * pdf_frac_uncert
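# A minimal CUDA-path usage sketch for GPUPDF (hedged: `gpuchannels` stands in for
# the channel data produced by a DAQ acquisition and is hypothetical here):
#
#   gpupdf = GPUPDF()
#   gpupdf.setup_pdf(nchannels=100, tbins=50, trange=(0.0, 500.0),
#                    qbins=20, qrange=(0.0, 10.0))
#   gpupdf.clear_pdf()
#   gpupdf.add_hits_to_pdf(gpuchannels)      # accumulate one simulated event
#   hitcount, pdf = gpupdf.get_pdfs()        # shapes (nchannels,), (nchannels, tbins, qbins)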
Example #18
    def _call_opencl_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                            nodes, workgroupsize, comqueue):
        module = get_module('wq_checknode.cl',
                            self.context,
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_device(
            comqueue, 1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_device(
            comqueue, -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_device(comqueue,
                                   sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_device(
            comqueue, sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_device(comqueue,
                                    sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_device(comqueue, sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin_gpu
        world_scale = gpugeo.world_scale
        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(comqueue, queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(comqueue, queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)] = np.arange(0,
                                                           len(photons.pos),
                                                           dtype=np.int32)[:]
        queue_photon_index[len(photons.pos):] = (
            np.ones(len(photons.pos), dtype=np.int32) * -1)[:]
        queue_slot_flag[0:len(photons.pos)] = np.ones(len(photons.pos),
                                                      dtype=np.int32)[:]
        a = ga.zeros(comqueue, 1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)
        workgroup_photons = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_current_node = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_tested_node = cl.LocalMemory(b.nbytes * workgroupsize)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)
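        # Illustrative sizing (hypothetical numbers): with max_shared_nodes = 1024
        # and workgroupsize = 32, this gives 1024 - 20 - 96 = 908, rounded down to
        # a multiple of 32, i.e. 896 nodes kept resident in local memory.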
        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(comqueue, 1, dtype=np.int32)
        node_front_end = ga.empty(comqueue, 1, dtype=np.int32)
        workgroup_nodes = cl.LocalMemory(a.nbytes * (max_nodes_can_store + 1))
        workgroup_daughter = cl.LocalMemory(c.nbytes *
                                            (max_nodes_can_store + 1))
        workgroup_sibling = cl.LocalMemory(c.nbytes *
                                           (max_nodes_can_store + 1))
        workgroup_aunt = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1))
        max_loops = 32

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node

        start_queue = time.time()
        gpu_funcs.checknode(
            comqueue, (workgroupsize, 1, 1), (workgroupsize, 1, 1),
            np.int32(max_loops), photon_pos.data, photon_dir.data,
            photon_current_node.data,
            photon_tested_node.data, photon_last_result.data,
            np.int32(len(nodes)), nodes.data, node_parent.data,
            node_first_daughter.data, node_sibling.data, node_aunt.data,
            world_origin.data, world_scale, queue_size,
            queue_photon_index.data, queue_slot_flag.data,
            np.int32(len(photon_pos)), np.int32(workgroupsize),
            workgroup_photons, workgroup_current_node, workgroup_tested_node,
            max_nodes_can_store, workgroup_nodes, workgroup_daughter,
            workgroup_sibling, workgroup_aunt, loaded_node_start_index,
            loaded_node_end_index, node_front_start.data,
            node_front_end.data).wait()
        end_queue = time.time()

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test, result)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        print photon_last_result.get()

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 32):
            y = x + 32
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 32):
            y = x + 32
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()
        return
Example #19
    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
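# A minimal usage sketch (hedged: the chroma.event.Photons construction below is
# illustrative, and for OpenCL a cl_context must also be passed):
#
#   from chroma.event import Photons
#   nphot = 1000
#   pos = np.zeros((nphot, 3), dtype=np.float32)
#   dir = np.tile(np.array([0.0, 0.0, 1.0], dtype=np.float32), (nphot, 1))
#   pol = np.tile(np.array([1.0, 0.0, 0.0], dtype=np.float32), (nphot, 1))
#   wavelengths = 430.0 * np.ones(nphot, dtype=np.float32)
#   photons = Photons(pos, dir, pol, wavelengths)
#   gpu_photons = GPUPhotons(photons, ncopies=10)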
Example #20
    def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1,
                 seed=None, cl_context=None):
        """
        Generates photons from information in the steps_arr
        
        Parameters
        ----------
        steps_arr : numpy.array with shape=(N,11) dtype=np.float
           contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constant, slow_time_constant ]
           in the future could generalize this to many different time components.
           developed for liquid argon TPCs.
        multiple : float
           scale up the number of photons generated (not implemented yet)
        """
        self.steps_array = steps_arr
        self.nsteps = self.steps_array.shape[0]
        if multiple!=1.0:
            raise RuntimeError('Have not implemented scaling of the number of photons generated.')

        # ===========================
        # GEN PHOTONS
        tstart_genphotons =  time.time()
        # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here)
        # on the CPU, we scan the steps to determine the total number of photons using poisson statistics
        # we assume the user has seeded the random number generator to her liking
        tstart_nphotons = time.time()
        self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 )
        #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int )
        self.nphotons_per_step = self.steps_array[ :, self._nphotons ]
        self.nphotons = reduce( lambda x, y : x + y, self.nphotons_per_step.ravel() )
        print "NSTEPS: ",self.nsteps
        print "NPHOTONS: ",self.nphotons," (time to determine per step=",time.time()-tstart_nphotons
        # now we make an index array for which step we need to get info from
        self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 )
        current_index=0
        for n, n_per_step in enumerate( self.nphotons_per_step ):
            self.source_step_index[current_index:current_index+n_per_step] = n
            current_index += n_per_step
        # push everything to the GPU
        tstart_transfer = time.time()
        if api.is_gpu_api_cuda():
            # step info
            self.step_pos1_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio )
            self.source_step_index_gpu = ga.to_gpu( self.source_step_index )
            # photon info
            self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) )
            self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
        elif api.is_gpu_api_opencl():
            cl_queue = cl.CommandQueue( cl_context )
            # step info
            self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu  = ga.to_device( cl_queue, self.step_fsratio )
            self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index )
            # photon info
            self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
        
        self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) )
        self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) )
        self.t.set( self.steps_array[:,3] )
        self.ncopies = ncopies
        self.true_nphotons = self.nphotons

        if self.ncopies!=1:
            raise ValueError('support for multiple copies not supported')

        if api.is_gpu_api_cuda():
            self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True )
        elif api.is_gpu_api_opencl():
            self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True )
        self.gpufuncs = GPUFuncs( self.gpumod )
        print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer

        # need random numbers
        tgpu = time.time()
        if seed==None:
            seed = 5
        rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context)
        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu,
                                                    self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states,
                                                    self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights,
                                                    block=(nthreads_per_block,1,1), grid=(blocks, 1) )
            elif api.is_gpu_api_opencl():
                self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None,
                                                    np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data,
                                                    self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states.data,
                                                    self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, 
                                                    self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait()
                                                    
            else:
                raise RuntimeError("GPU API is neither CUDA nor OpenCL!")
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        tend_genphotons =  time.time()
        print "GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")"

        # Now load modules
        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu', options=api_options, include_source_directory=True)
        elif  api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)
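# A minimal sketch of a steps array for this generator (hedged: one row per step
# with the columns described in the docstring; the values are illustrative):
#
#   steps = np.array([
#       # x1,  y1,  z1,  t1,  x2,  y2,   z2,  nphot, f/s ratio, fast const, slow const
#       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.0, 5000.0,       0.3,        6.0,     1500.0],
#   ], dtype=float)
#   gen = GPUPhotonFromSteps(steps, cl_context=ctx)   # ctx only needed in OpenCL mode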
Example #21
class TestSampling(unittest.TestCase):
    def setUp(self):
        self.context = cltools.get_last_context()
        self.nthreads_per_block = 256
        self.myoptions = ('-I.', ) + api_options
        self.mod = get_module("test_sample_cdf.cl",
                              self.context,
                              options=self.myoptions,
                              include_source_directory=True)
        self.funcs = GPUFuncs(self.mod)
        self.rng_states = clrand.get_rng_states(self.context,
                                                self.nthreads_per_block)
        self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")

    def compare_sampling(self, hist, reps=10):
        queue = cl.CommandQueue(self.context)

        # make cdf histogram
        nbins = hist.GetNbinsX()
        xaxis = hist.GetXaxis()
        intg = hist.GetIntegral()
        cdf_y = np.empty(nbins + 1, dtype=float)
        cdf_x = np.empty_like(cdf_y)

        cdf_x[0] = xaxis.GetBinLowEdge(1)
        cdf_y[0] = 0.0
        for i in xrange(1, len(cdf_x)):
            cdf_y[i] = intg[i]
            cdf_x[i] = xaxis.GetBinUpEdge(i)

        cdf_x_gpu = cl.array.to_device(queue, cdf_x.astype(np.float32))
        cdf_y_gpu = cl.array.to_device(queue, cdf_y.astype(np.float32))
        block = (self.nthreads_per_block, 1, 1)
        grid = (1, 1)
        out_gpu = cl.array.empty(queue,
                                 shape=self.nthreads_per_block,
                                 dtype=np.float32)

        out_h = rt.TH1D('out_h', '', hist.GetNbinsX(), xaxis.GetXmin(),
                        xaxis.GetXmax())
        out_h.SetLineColor(rt.kGreen)

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(reps, self.nthreads_per_block, max_blocks=1):
            self.funcs.test_sample_cdf(queue, (elements_this_iter, 1, 1), None,
                                       self.rng_states.data,
                                       np.int32(len(cdf_x_gpu)),
                                       cdf_x_gpu.data, cdf_y_gpu.data,
                                       out_gpu.data)
            out = out_gpu.get()
            for v in out[:elements_this_iter]:
                out_h.Fill(v)

        prob = out_h.KolmogorovTest(hist)
        out_h.Write()
        return prob, out_h

    def test_sampling(self):
        '''Verify that the CDF-based sampler on the GPU reproduces a binned
        Gaussian distribution'''
        f = rt.TF1('f_gaussian', 'gaus(0)', -5, 5)
        f.SetParameters(1.0 / np.sqrt(np.pi * 2), 0.0, 1.0)
        gaussian = rt.TH1D('gaussian', '', 100, -5, 5)
        gaussian.Add(f)

        prob, out_h = self.compare_sampling(gaussian, reps=20000)

        self.outf.cd()
        gaussian.Write("gaussian")
        out_h.Write("out_h")
        assert prob > 0.01

    def tearDown(self):
        self.outf.Close()
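# For reference, a self-contained NumPy sketch of the inverse-CDF sampling that
# test_sample_cdf exercises on the GPU (CPU-only, illustrative):
import numpy as np
data = np.random.normal(size=100000)
counts, edges = np.histogram(data, bins=100, range=(-5.0, 5.0))
cdf_x = edges
cdf_y = np.concatenate(([0.0], np.cumsum(counts).astype(float) / counts.sum()))
u = np.random.uniform(size=1000)
samples = np.interp(u, cdf_y, cdf_x)  # invert the CDF by linear interpolation
print "inverse-CDF sample mean:", samples.mean(), " std:", samples.std()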
Example #22
class GPUDaq(object):
    def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.empty(gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros_like(
                self.earliest_time_int_gpu)
            self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
            self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu),
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector.detector_gpu
            self.module = cutools.get_cu_module('daq.cu',
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.earliest_time_gpu = ga.empty(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(cl_queue,
                                                  gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros(cl_queue,
                                                gpu_detector.nchannels * ndaq,
                                                dtype=np.uint32)
            self.channel_q_int_gpu = ga.zeros(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.uint32)
            self.channel_q_gpu = ga.zeros(cl_queue,
                                          gpu_detector.nchannels * ndaq,
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector  # struct not made in opencl mode, so we keep a copy of the class
            self.module = cltools.get_cl_module('daq.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels

    def begin_acquire(self, nthreads_per_block=64, cl_context=None):
        if api.is_gpu_api_cuda():
            self.gpu_funcs.reset_earliest_time_int(
                np.float32(1e9),
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.earliest_time_int_gpu) // nthreads_per_block +
                      1, 1))
            self.channel_q_int_gpu.fill(0)
            self.channel_q_gpu.fill(0)
            self.channel_history_gpu.fill(0)
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            self.gpu_funcs.reset_earliest_time_int(
                comqueue, (nthreads_per_block, 1, 1),
                (len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1),
                np.float32(1e9),
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu.data,
                g_times_l=True).wait()
            self.channel_q_int_gpu.fill(0, queue=comqueue)
            self.channel_q_gpu.fill(0, queue=comqueue)
            self.channel_history_gpu.fill(0, queue=comqueue)
            cl.enqueue_barrier(comqueue)

    def acquire(self,
                gpuphotons,
                rng_states,
                nthreads_per_block=64,
                max_blocks=1024,
                start_photon=None,
                nphotons=None,
                weight=1.0,
                cl_context=None):
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            clmaxblocks = max_blocks

        if self.ndaq == 1:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq(rng_states,
                                           np.uint32(0x1 << 2),
                                           np.int32(start_photon +
                                                    first_photon),
                                           np.int32(photons_this_round),
                                           gpuphotons.t,
                                           gpuphotons.flags,
                                           gpuphotons.last_hit_triangles,
                                           gpuphotons.weights,
                                           self.solid_id_map_gpu,
                                           self.detector_gpu,
                                           self.earliest_time_int_gpu,
                                           self.channel_q_int_gpu,
                                           self.channel_history_gpu,
                                           np.float32(weight),
                                           block=(nthreads_per_block, 1, 1),
                                           grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    #print "daq: ",start_photon,first_photon,start_photon+first_photon,(photons_this_round/nthreads_per_block,1,1), (nthreads_per_block,1,1)
                    self.gpu_funcs.run_daq(
                        comqueue,
                        (photons_this_round / nthreads_per_block, 1, 1),
                        (nthreads_per_block, 1, 1),
                        rng_states.data,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu.data,
                        # -- Detector struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.float32(weight),
                        g_times_l=True).wait()

        else:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq_many(
                        rng_states,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t,
                        gpuphotons.flags,
                        gpuphotons.last_hit_triangles,
                        gpuphotons.weights,
                        self.solid_id_map_gpu,
                        self.detector_gpu,
                        self.earliest_time_int_gpu,
                        self.channel_q_int_gpu,
                        self.channel_history_gpu,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        block=(nthreads_per_block, 1, 1),
                        grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.run_daq_many(
                        comqueue,
                        (nthreads_per_block, 1, 1),
                        (blocks, 1),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu,
                        # -- Detector Struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        g_times_l=True).wait()
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    def end_acquire(self, nthreads_per_block=64, cl_context=None):
        if api.is_gpu_api_cuda():
            self.gpu_funcs.convert_sortable_int_to_float(
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu,
                self.earliest_time_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.earliest_time_int_gpu) // nthreads_per_block +
                      1, 1))
            self.gpu_funcs.convert_charge_int_to_float(
                self.detector_gpu,
                self.channel_q_int_gpu,
                self.channel_q_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.channel_q_int_gpu) // nthreads_per_block + 1,
                      1))
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            print cl_context, nthreads_per_block
            comqueue = cl.CommandQueue(cl_context)
            self.gpu_funcs.convert_sortable_int_to_float(
                comqueue, (len(self.earliest_time_int_gpu), 1, 1),
                (nthreads_per_block, 1, 1),
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu.data,
                self.earliest_time_gpu.data,
                g_times_l=True).wait()
            self.gpu_funcs.convert_charge_int_to_float(
                comqueue, (len(self.channel_q_int_gpu), 1, 1),
                (nthreads_per_block, 1, 1),
                self.detector_gpu.nchannels,
                self.detector_gpu.charge_unit,
                self.channel_q_int_gpu.data,
                self.channel_q_gpu.data,
                g_times_l=True).wait()

        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)
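# --- Hypothetical usage sketch (not part of the original source) -----------
# The methods above form a three-step acquisition cycle: begin_acquire()
# resets the per-channel buffers, acquire() runs the run_daq kernel over the
# propagated photons, and end_acquire() converts the packed integer results
# back to floats and returns a GPUChannels object.  The names `simulation`,
# `gpu_photons` and `rng_states` are assumptions standing in for an
# initialised chroma-style simulation, photons already on the GPU, and
# per-thread RNG states.
def run_one_daq_cycle(simulation, gpu_photons, rng_states,
                      cl_context=None, cl_queue=None):
    daq = GPUDaq(simulation.gpu_geometry,
                 cl_context=cl_context, cl_queue=cl_queue)
    daq.begin_acquire(nthreads_per_block=64, cl_context=cl_context)
    daq.acquire(gpu_photons, rng_states,
                nthreads_per_block=64, max_blocks=1024,
                weight=1.0, cl_context=cl_context)
    # returns GPUChannels(earliest_time, charge, history, ndaq, stride)
    return daq.end_acquire(nthreads_per_block=64, cl_context=cl_context)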
Example #23
class GPUDaqLAr1ND(GPUDAQHist):
    """ DAQ that stores histogram of photon hits."""
    NTDC = None
    NS_PER_TDC = None

    def __init__(self,
                 gpu_detector,
                 ntdcs=None,
                 ns_per_tdc=None,
                 adc_bits=None,
                 ndaq=1,
                 cl_context=None,
                 cl_queue=None):
        """constructor.
        
        Args:
          gpu_detector: GPUDetector
        Keywords:
          ntdcs: int
            number of time bins per channel
            if not supplied, using class variable value
          ns_per_tdc: float
            nanoseconds per time bin
            if not supplied, using class variable value
          adc_bits:  int
            number of ADC bits (not used yet)
          ndaq: int
            number of daqs
          cl_context: pyopencl.Context
          cl_queue: pyopencl.CommandQueue
        Raises:
          ValueError when ntdcs and ns_per_tdc are found to be NoneType
        """
        if ntdcs is None:
            self.ntdcs = GPUDaqLAr1ND.NTDC
        else:
            self.ntdcs = ntdcs
        if ns_per_tdc is None:
            self.ns_per_tdc = GPUDaqLAr1ND.NS_PER_TDC
        else:
            self.ns_per_tdc = ns_per_tdc
        super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                           ntdcs=self.ntdcs,
                                           ns_per_tdc=self.ns_per_tdc,
                                           adc_bits=adc_bits,
                                           ndaq=ndaq,
                                           cl_context=cl_context,
                                           cl_queue=cl_queue)
        if self.ntdcs is None:
            raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
        if self.ns_per_tdc is None:
            raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

        kernel_filepath = os.path.dirname(
            os.path.realpath(__file__)) + "/daq_lar1nd"
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.gpu_funcs = GPUFuncs(self.module)

    def acquire(self,
                gpuphotons,
                rng_states,
                nthreads_per_block=64,
                max_blocks=1024,
                start_photon=None,
                nphotons=None,
                weight=1.0,
                cl_context=None):
        """run UBooNE DAQ acquire kernels"""
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            clmaxblocks = max_blocks

        # Loop over all photons and bin their hits into the per-channel time histogram
        if self.ndaq == 1:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq(rng_states,
                                           np.uint32(event.SURFACE_DETECT),
                                           np.int32(start_photon +
                                                    first_photon),
                                           np.int32(photons_this_round),
                                           gpuphotons.t,
                                           gpuphotons.flags,
                                           gpuphotons.last_hit_triangles,
                                           gpuphotons.weights,
                                           self.solid_id_map_gpu,
                                           self.detector_gpu,
                                           self.adc_gpu,
                                           np.int32(self.nchannels),
                                           np.int32(self.ntdcs),
                                           np.float32(self.ns_per_tdc),
                                           np.float32(100.0),
                                           self.channel_history_gpu,
                                           np.float32(weight),
                                           block=(nthreads_per_block, 1, 1),
                                           grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.run_daq(
                        comqueue,
                        (photons_this_round, 1, 1),
                        None,
                        rng_states.data,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(nphotons),
                        gpuphotons.t.data,
                        gpuphotons.pos.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu.data,
                        # -- Detector struct --
                        self.solid_id_to_channel_index_gpu.data,
                        # ---------------------
                        self.uint_adc_gpu.data,
                        np.int32(self.nchannels),
                        np.int32(self.ntdcs),
                        np.float32(self.ns_per_tdc),
                        np.float32(100.0),
                        self.channel_history_gpu.data,
                        # -- Channel transforms --
                        self.channel_inverse_rot_gpu.data,
                        self.channel_inverse_trans_gpu.data,
                        # ------------------------
                        np.float32(weight),
                        g_times_l=False).wait()
            # if opencl, need to convert ADC from uint to float
            if api.is_gpu_api_opencl():
                self.gpu_funcs.convert_adc(comqueue,
                                           (int(self.nchannels), 1, 1),
                                           None,
                                           self.uint_adc_gpu.data,
                                           self.adc_gpu.data,
                                           np.int32(self.nchannels),
                                           np.int32(self.ntdcs),
                                           g_times_l=False).wait()

        else:
            raise RuntimeError("Multi-DAQ not built")
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq_many(
                        rng_states,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t,
                        gpuphotons.flags,
                        gpuphotons.last_hit_triangles,
                        gpuphotons.weights,
                        self.solid_id_map_gpu,
                        self.detector_gpu,
                        self.earliest_time_int_gpu,
                        self.channel_q_int_gpu,
                        self.channel_history_gpu,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        block=(nthreads_per_block, 1, 1),
                        grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.run_daq_many(
                        comqueue,
                        (nthreads_per_block, 1, 1),
                        (blocks, 1),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu,
                        # -- Detector Struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        g_times_l=True).wait()
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    def end_acquire(self, nthreads_per_block=64, cl_context=None):
        """collect daq info and make GPUChannels instance.
        
        Args:
          nthreads_per_block: int
          cl_context: pyopenc.Context
        Returns:
          GPUChannels
        """
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.zeros(self.nchannels, dtype=np.float32)
            nblocks = int(self.nchannels / nthreads_per_block) + 1
            self.gpu_funcs.get_earliest_hit_time(np.int32(self.nchannels),
                                                 np.int32(self.ntdcs),
                                                 np.float32(self.ns_per_tdc),
                                                 self.adc_gpu,
                                                 self.channel_history_gpu,
                                                 self.earliest_time_gpu,
                                                 block=(1000, 1, 1),
                                                 grid=(1, 1))
            self.adc_gpu.get()
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            self.earliest_time_gpu = ga.zeros(comqueue,
                                              self.nchannels,
                                              dtype=np.float32)
            self.gpu_funcs.get_earliest_hit_time(
                comqueue, (int(self.nchannels), 1, 1), None,
                np.int32(self.nchannels), np.int32(self.ntdcs),
                np.float32(self.ns_per_tdc), self.adc_gpu.data,
                self.channel_history_gpu.data,
                self.earliest_time_gpu.data).wait()
            self.adc_gpu.get()

        return GPUChannels(self.earliest_time_gpu, self.adc_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)

    @classmethod
    def build_daq(cls, gpu_geometry, cl_context=None, cl_queue=None):
        """factory method.

        will be called by chroma.Simulation to build DAQ instance.
        Returns:
          GPUDaqLAr1ND instance
        """
        return GPUDaqLAr1ND(gpu_geometry,
                            cl_context=cl_context,
                            cl_queue=cl_queue)
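# --- Hypothetical configuration sketch (not part of the original source) ---
# build_daq() does not forward ntdcs/ns_per_tdc, so the class variables NTDC
# and NS_PER_TDC must be set before the DAQ is constructed, otherwise the
# constructor raises ValueError.  The values below (1000 bins of 1 ns) are
# illustrative assumptions only.
GPUDaqLAr1ND.NTDC = 1000       # number of time bins per channel (assumed)
GPUDaqLAr1ND.NS_PER_TDC = 1.0  # nanoseconds per time bin (assumed)
# afterwards the factory method can be used as chroma.Simulation would:
#   daq = GPUDaqLAr1ND.build_daq(gpu_geometry, cl_context=ctx, cl_queue=queue)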
Example #24
    def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                          nodes, workgroupsize):
        module = get_module('wq_checknode.cu',
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_gpu(
            1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_gpu(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_gpu(
            sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin
        world_scale = gpugeo.world_scale

        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)].set(
            np.arange(0, len(photons.pos), dtype=np.int32)[:])
        queue_photon_index[len(photons.pos):].set(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        queue_slot_flag[0:len(photons.pos)].set(
            np.ones(len(photons.pos), dtype=np.int32)[:])
        a = ga.zeros(1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)

        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(1, dtype=np.int32)
        node_front_end = ga.empty(1, dtype=np.int32)

        max_loops = 1000

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node
        print "STARTING QUEUE"
        print queue_photon_index

        start_queue = time.time()
        gpu_funcs.checknode(np.int32(max_loops),
                            photon_pos,
                            photon_dir,
                            photon_current_node,
                            photon_tested_node,
                            photon_last_result,
                            np.int32(len(nodes)),
                            nodes,
                            node_parent,
                            node_first_daughter,
                            node_sibling,
                            node_aunt,
                            world_origin,
                            world_scale,
                            queue_size,
                            queue_photon_index,
                            queue_slot_flag,
                            np.int32(len(photon_pos)),
                            max_nodes_can_store,
                            loaded_node_start_index,
                            loaded_node_end_index,
                            node_front_start,
                            node_front_end,
                            block=(workgroupsize, 1, 1),
                            grid=(1, 1),
                            shared=4 *
                            (7 * max_nodes_can_store + 3 * workgroupsize + 1))
        cuda.Context.get_current().synchronize()
        end_queue = time.time()

        nactive = len(np.argwhere(queue_slot_flag.get() == 1))

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        np_photon_results = photon_last_result.get()
        for x in xrange(0, len(np_photon_results), 10):
            y = x + 10
            if y > len(np_photon_results):
                y = len(np_photon_results)
            print x, ": ", np_photon_results[x:y]

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 10):
            y = x + 10
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS: ", nactive, " threads"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 10):
            y = x + 10
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()
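# --- Hypothetical helper (not part of the original source) -----------------
# The checknode launch above requests
#     shared = 4 * (7 * max_nodes_can_store + 3 * workgroupsize + 1)
# bytes of dynamic shared memory, with max_nodes_can_store derived from the
# shared-node budget and rounded down to a multiple of 32.  A helper like the
# one below could be used to check that request against the device's shared
# memory before launching; `shared_mem_size` mirrors the attribute printed
# above and is an assumption about the surrounding class.
def checknode_shared_request(max_shared_nodes, workgroupsize):
    max_nodes_can_store = max_shared_nodes - 20 - 3 * workgroupsize
    max_nodes_can_store -= max_nodes_can_store % 32
    nbytes = 4 * (7 * max_nodes_can_store + 3 * workgroupsize + 1)
    return max_nodes_can_store, nbytes

def checknode_fits(shared_mem_size, max_shared_nodes, workgroupsize):
    _, nbytes = checknode_shared_request(max_shared_nodes, workgroupsize)
    return nbytes <= shared_mem_size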
Example #25
class GPUKernelPDF(object):
    def __init__(self, cl_context=None, cl_queue=None):
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module('pdf.cu',
                                                options=cutools.cuda_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module('pdf.cl',
                                                cl_context,
                                                options=cltools.cl_options,
                                                include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_moments(self, nchannels, trange, qrange, time_only=True):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            trange: (float, float)
              Range of time dimension in PDF
            qrange: (float, float)
              Range of charge dimension in PDF
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)

        self.trange = trange
        self.qrange = qrange
        self.time_only = time_only

    def clear_moments(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.hitcount_gpu.fill(0)
        self.tmom1_gpu.fill(0.0)
        self.tmom2_gpu.fill(0.0)
        self.qmom1_gpu.fill(0.0)
        self.qmom2_gpu.fill(0.0)

    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
        """Add the most recent results of run_daq() to the accumulate of 
        moments for future bandwidth calculation."""
        self.gpu_funcs.accumulate_moments(
            np.int32(self.time_only),
            np.int32(len(gpuchannels.t)),
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.hitcount_gpu,
            self.tmom1_gpu,
            self.tmom2_gpu,
            self.qmom1_gpu,
            self.qmom2_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def compute_bandwidth(self,
                          event_hit,
                          event_time,
                          event_charge,
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating."""

        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        mom0 = np.maximum(hitcount, 1)
        tmom1 = self.tmom1_gpu.get()
        tmom2 = self.tmom2_gpu.get()

        tmean = tmom1 / mom0
        tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0)  # roundoff can go neg
        trms = tvar**0.5

        if self.time_only:
            d = 1
        else:
            d = 2
        dimensionality_factor = ((4.0 / (d + 2)) /
                                 (mom0 / scale_factor))**(-1.0 / (d + 4))
        # Gaussian density at the event time; the exponent argument is squared
        # as required for a normal density.
        gaussian_density = np.minimum(
            1.0 / trms, (1.0 / np.sqrt(2.0 * np.pi)) *
            np.exp(-0.5 * ((event_time - tmean) / trms)**2) / trms)
        time_bandwidths = dimensionality_factor / gaussian_density * rho
        inv_time_bandwidths = np.zeros_like(time_bandwidths)
        inv_time_bandwidths[time_bandwidths > 0] = time_bandwidths[
            time_bandwidths > 0]**-1

        # precompute inverse to speed up GPU evaluation
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            inv_time_bandwidths.astype(np.float32))

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu)
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            qmom1 = self.qmom1_gpu.get()
            qmom2 = self.qmom2_gpu.get()

            qmean = qmom1 / mom0
            qrms = (qmom2 / mom0 - qmean**2)**0.5

            gaussian_density = np.minimum(
                1.0 / qrms, (1.0 / np.sqrt(2.0 * np.pi)) *
                np.exp(-0.5 * ((event_charge - qmean) / qrms)**2) / qrms)

            charge_bandwidths = dimensionality_factor / gaussian_density * rho

            # precompute inverse to speed up GPU evaluation
            self.inv_charge_bandwidths_gpu = ga.to_gpu(
                (charge_bandwidths**-1).astype(np.float32))

    def setup_kernel(self, event_hit, event_time, event_charge):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
        self.charge_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)

    def clear_kernel(self):
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu.fill(0.0)
        self.charge_pdf_values_gpu.fill(0.0)

    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the kernel PDF evaluation."
        self.gpu_funcs.accumulate_kernel_eval(
            np.int32(self.time_only),
            np.int32(len(self.event_hit_gpu)),
            self.event_hit_gpu,
            self.event_time_gpu,
            self.event_charge_gpu,
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.inv_time_bandwidths_gpu,
            self.inv_charge_bandwidths_gpu,
            self.hitcount_gpu,
            self.time_pdf_values_gpu,
            self.charge_pdf_values_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def get_kernel_eval(self):
        hitcount = self.hitcount_gpu.get()
        hit = self.event_hit_gpu.get().astype(bool)
        time_pdf_values = self.time_pdf_values_gpu.get()
        time_pdf_values /= np.maximum(1, hitcount)  # avoid divide by zero

        charge_pdf_values = self.charge_pdf_values_gpu.get()
        charge_pdf_values /= np.maximum(1, hitcount)  # avoid divide by zero

        if self.time_only:
            pdf_values = time_pdf_values
        else:
            pdf_values = time_pdf_values * charge_pdf_values

        return hitcount, pdf_values, np.zeros_like(pdf_values)
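
# --- Hypothetical usage sketch (not part of the original source) -----------
# The intended calling sequence, pieced together from the methods above:
# accumulate moments over many simulated events, derive per-channel kernel
# bandwidths for one observed event, then accumulate kernel evaluations and
# read back the PDF values.  `simulated_channel_sets` stands for an iterable
# of GPUChannels objects from repeated DAQ runs, and the event_* arrays for
# observed data; the trange/qrange values are illustrative assumptions.
def kernel_pdf_for_event(simulated_channel_sets, event_hit, event_time,
                         event_charge, nchannels, cl_context=None):
    kpdf = GPUKernelPDF(cl_context=cl_context)
    kpdf.setup_moments(nchannels, trange=(-50.0, 100.0), qrange=(0.0, 10.0),
                       time_only=True)
    for gpuchannels in simulated_channel_sets:
        kpdf.accumulate_moments(gpuchannels)
    kpdf.compute_bandwidth(event_hit, event_time, event_charge)
    kpdf.setup_kernel(event_hit, event_time, event_charge)
    for gpuchannels in simulated_channel_sets:
        kpdf.accumulate_kernel(gpuchannels)
    return kpdf.get_kernel_eval()  # (hitcount, pdf_values, zeros_like(pdf))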