Example #1
    def __init__(self, photons, ncopies=1):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
        self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons*ncopies, dtype=np.float32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
                                                self.pos, self.dir, self.wavelengths, self.pol, self.t, 
                                                self.flags, self.last_hit_triangles, self.weights,
                                                np.int32(ncopies-1), 
                                                np.int32(nphotons),
                                                block=(nthreads_per_block,1,1), grid=(blocks, 1))


        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
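
A minimal usage sketch for the constructor above, assuming it belongs to a GPUPhotons-style class and that chroma.event.Photons accepts position, direction, polarization and wavelength arrays (both names are assumptions, not shown in the example):

import numpy as np
from chroma.event import Photons  # assumed import path

n = 1000
pos = np.zeros((n, 3), dtype=np.float32)
dir = np.tile([0.0, 0.0, 1.0], (n, 1)).astype(np.float32)
pol = np.tile([1.0, 0.0, 0.0], (n, 1)).astype(np.float32)
wavelengths = np.full(n, 400.0, dtype=np.float32)

photons = Photons(pos, dir, pol, wavelengths)

# Replicate the same event 10 times on the GPU, e.g. for a likelihood scan.
gpu_photons = GPUPhotons(photons, ncopies=10)  # class name assumed
assert gpu_photons.true_nphotons == n
assert len(gpu_photons.pos) == n * 10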
Example #2
    def marshall_photons(self, photons, ncopies):
        """
        Assign the provided photons to the beginning (possibly
        the entire array if ncopies is 1
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)

        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            block = (nthreads_per_block, 1, 1)
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nphotons, nthreads_per_block, max_blocks):
                grid = (blocks, 1)
                args = (
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                )
                self.gpu_funcs.photon_duplicate(*args, block=block, grid=grid)
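
The chunk_iterator helper used in the examples above splits a long photon array into kernel launches of at most nthreads_per_block * max_blocks elements. A sketch of a helper satisfying that contract (an illustration, not necessarily chroma's implementation):

def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
    """Yield (first_index, elements_this_iter, nblocks) tuples that cover
    ``nelements`` in launches of at most nthreads_per_block*max_blocks."""
    first = 0
    while first < nelements:
        elements_left = nelements - first
        nblocks = min(max_blocks,
                      (elements_left + nthreads_per_block - 1) // nthreads_per_block)
        elements_this_iter = min(elements_left, nblocks * nthreads_per_block)
        yield first, elements_this_iter, nblocks
        first += elements_this_iter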
Example #3
def test_rotate():
    n = nthreads_per_block * blocks

    a = np.random.rand(n, 3).astype(np.float32)
    t = np.random.rand(n).astype(np.float32) * 2 * np.pi
    w = normalize(np.random.rand(3))

    a_gpu = ga.to_gpu(to_float3(a))
    t_gpu = ga.to_gpu(t)

    dest_gpu = ga.empty(n, dtype=ga.vec.float3)

    t0 = time.time()
    rotate_gpu(a_gpu,
               t_gpu,
               ga.vec.make_float3(*w),
               dest_gpu,
               block=(nthreads_per_block, 1, 1),
               grid=(blocks, 1))
    autoinit.context.synchronize()
    elapsed = time.time() - t0

    print('elapsed %f sec' % elapsed)

    r = rotate(a, t, w)

    assert np.allclose(r,
                       dest_gpu.get().view(np.float32).reshape((-1, 3)),
                       atol=1e-5)
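
The host-side rotate() reference that the test compares against is not shown; a NumPy sketch of a Rodrigues rotation with the same calling convention (an assumption about the reference, not chroma's code) could be:

import numpy as np

def rotate(a, t, w):
    """Rotate each row of ``a`` by angle ``t[i]`` about the unit axis ``w``
    (Rodrigues' rotation formula)."""
    a = np.asarray(a, dtype=np.float64)
    t = np.asarray(t, dtype=np.float64)[:, np.newaxis]
    w = np.asarray(w, dtype=np.float64)
    cos_t, sin_t = np.cos(t), np.sin(t)
    return (a * cos_t
            + np.cross(w, a) * sin_t
            + w * (a @ w)[:, np.newaxis] * (1.0 - cos_t))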
Example #4
    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth

        self.nblocks = nblocks

        transform_module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(transform_module)

        render_module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(render_module)

        self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
Example #5
    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth

        self.nblocks = nblocks

        transform_module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(transform_module)

        render_module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(render_module)

        self.dx = ga.empty(max_alpha_depth * self.pos.size, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
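
The pos/dir arrays handed to this renderer hold one ray origin and direction per pixel. A hedged sketch of building them for a simple pinhole camera looking down +z (names and conventions are illustrative, not taken from chroma):

import numpy as np

def pinhole_rays(width, height, fov_deg=60.0, origin=(0.0, 0.0, -1.0)):
    """Return (pos, dir) arrays of shape (width*height, 3) for a pinhole
    camera at ``origin`` looking down +z. Illustrative only."""
    aspect = width / float(height)
    half_w = np.tan(np.radians(fov_deg) / 2.0)
    xs = np.linspace(-half_w, half_w, width)
    ys = np.linspace(-half_w / aspect, half_w / aspect, height)
    xx, yy = np.meshgrid(xs, ys)
    dirs = np.stack([xx.ravel(), yy.ravel(), np.ones(xx.size)], axis=1)
    dirs /= np.linalg.norm(dirs, axis=1, keepdims=True)
    pos = np.tile(np.asarray(origin, dtype=np.float32), (dirs.shape[0], 1))
    return pos.astype(np.float32), dirs.astype(np.float32)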
Example #6
    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
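
Example #6 branches on api.is_gpu_api_cuda() / api.is_gpu_api_opencl() to support both backends. A minimal sketch of such a dispatch module, assuming the backend is chosen via an environment variable (the variable name and default are assumptions):

import os

_GPU_API = os.environ.get('CHROMA_GPU_API', 'cuda').lower()

def is_gpu_api_cuda():
    """True when the CUDA backend is selected."""
    return _GPU_API == 'cuda'

def is_gpu_api_opencl():
    """True when the OpenCL backend is selected."""
    return _GPU_API == 'opencl'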
Example #7
def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,
                      max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to a multiple of this number.
        Extra nodes will be all zero.
      ``nthreads_per_block``, ``max_blocks``: int
        Kernel launch configuration used for the leaf-node kernels.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory.
    # Unfortunately, OpenCL and CUDA have different methods for managing
    # memory here, so we have to write divergent code.
    if gpuapi.is_gpu_api_cuda():
        # CUDA supports mapped (zero-copy) memory: host and device allocations
        # that are mapped onto one another, so no explicit transfer requests
        # are needed here.
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)
        #print triangles[0:10]
        #print vertices[0:10]

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes,
                                  morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        #world_origin = np.array(world_origin_np,dtype=np.float32)
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)
        #print world_origin, world_scale

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            print(first_index, elements_this_iter, nblocks_this_iter)
            bvh_funcs.make_leaves(
                queue,
                (nblocks_this_iter, 1, 1),
                (nthreads_per_block, 1, 1),
                #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None,
                np.uint32(first_index),
                np.uint32(elements_this_iter),
                triangles_dev.data,
                vertices_dev.data,
                world_origin,
                world_scale,
                nodes.data,
                morton_codes.data,
                g_times_l=True).wait()

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
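
The make_leaves kernel assigns one Morton (Z-order) code per triangle by quantizing its position into the fixed-point world coordinate system and interleaving the bits of the three integer coordinates. A pure-Python illustration of that bit interleaving (for understanding only; the bit ordering inside the kernel may differ):

def morton_code_3d(ix, iy, iz, bits=16):
    """Interleave the low ``bits`` bits of integer coordinates ix, iy, iz
    into a single Morton (Z-order) code."""
    code = 0
    for b in range(bits):
        code |= ((ix >> b) & 1) << (3 * b)
        code |= ((iy >> b) & 1) << (3 * b + 1)
        code |= ((iz >> b) & 1) << (3 * b + 2)
    return code

print(morton_code_3d(1, 2, 3))  # 53 -- nearby points get nearby codes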
Example #8
    def __init__(self, geometry, wavelengths=None, times=None, print_usage=False, min_free_gpu_mem=300e6):
        if wavelengths is None:
            wavelengths = standard_wavelengths

        try:
            wavelength_step = np.unique(np.diff(wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')
            
        if times is None:
            time_step = 0.05
            times = np.arange(0,1000,time_step)
        else:
            try:
                time_step = np.unique(np.diff(times)).item()
            except ValueError:
                raise ValueError('times must be equally spaced apart.')

        geometry_source = get_cu_source('geometry_types.h')
        material_struct_size = characterize.sizeof('Material', geometry_source)
        surface_struct_size = characterize.sizeof('Surface', geometry_source)
        dichroicprops_struct_size = characterize.sizeof('DichroicProps', geometry_source)
        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        self.material_data = []
        self.material_ptrs = []

        def interp_material_property(wavelengths, property):
            # note that it is essential that the material properties be
            # interpolated linearly. this fact is used in the propagation
            # code to guarantee that probabilities still sum to one.
            return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)

        for i in range(len(geometry.unique_materials)):
            material = geometry.unique_materials[i]

            if material is None:
                raise Exception('one or more triangles is missing a material.')

            refractive_index = interp_material_property(wavelengths, material.refractive_index)
            refractive_index_gpu = ga.to_gpu(refractive_index)
            absorption_length = interp_material_property(wavelengths, material.absorption_length)
            absorption_length_gpu = ga.to_gpu(absorption_length)
            scattering_length = interp_material_property(wavelengths, material.scattering_length)
            scattering_length_gpu = ga.to_gpu(scattering_length)
            num_comp = len(material.comp_reemission_prob)
            comp_reemission_prob_gpu = [ga.to_gpu(interp_material_property(wavelengths, component)) for component in material.comp_reemission_prob]
            self.material_data.append(comp_reemission_prob_gpu)
            comp_reemission_prob_gpu = np.uint64(0) if len(comp_reemission_prob_gpu) == 0 else make_gpu_struct(8*len(comp_reemission_prob_gpu), comp_reemission_prob_gpu)
            assert num_comp == len(material.comp_reemission_wvl_cdf), 'component arrays must be same length'
            comp_reemission_wvl_cdf_gpu = [ga.to_gpu(interp_material_property(wavelengths, component)) for component in material.comp_reemission_wvl_cdf]
            self.material_data.append(comp_reemission_wvl_cdf_gpu)
            comp_reemission_wvl_cdf_gpu = np.uint64(0) if len(comp_reemission_wvl_cdf_gpu) == 0 else make_gpu_struct(8*len(comp_reemission_wvl_cdf_gpu), comp_reemission_wvl_cdf_gpu)
            assert num_comp == len(material.comp_reemission_time_cdf), 'component arrays must be same length'
            comp_reemission_time_cdf_gpu = [ga.to_gpu(interp_material_property(times, component)) for component in material.comp_reemission_time_cdf]
            self.material_data.append(comp_reemission_time_cdf_gpu)
            comp_reemission_time_cdf_gpu = np.uint64(0) if len(comp_reemission_time_cdf_gpu) == 0 else make_gpu_struct(8*len(comp_reemission_time_cdf_gpu), comp_reemission_time_cdf_gpu)
            assert num_comp == len(material.comp_absorption_length), 'component arrays must be same length'
            comp_absorption_length_gpu = [ga.to_gpu(interp_material_property(wavelengths, component)) for component in material.comp_absorption_length]
            self.material_data.append(comp_absorption_length_gpu)
            comp_absorption_length_gpu = np.uint64(0) if len(comp_absorption_length_gpu) == 0 else make_gpu_struct(8*len(comp_absorption_length_gpu), comp_absorption_length_gpu)

            self.material_data.append(refractive_index_gpu)
            self.material_data.append(absorption_length_gpu)
            self.material_data.append(scattering_length_gpu)
            self.material_data.append(comp_reemission_prob_gpu)
            self.material_data.append(comp_reemission_wvl_cdf_gpu)
            self.material_data.append(comp_reemission_time_cdf_gpu)
            self.material_data.append(comp_absorption_length_gpu)

            material_gpu = \
                make_gpu_struct(material_struct_size,
                                [refractive_index_gpu, absorption_length_gpu,
                                 scattering_length_gpu,
                                 comp_reemission_prob_gpu,
                                 comp_reemission_wvl_cdf_gpu,
                                 comp_reemission_time_cdf_gpu,
                                 comp_absorption_length_gpu,
                                 np.uint32(num_comp),
                                 np.uint32(len(wavelengths)),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0]),
                                 np.uint32(len(times)),
                                 np.float32(time_step),
                                 np.float32(times[0])])

            self.material_ptrs.append(material_gpu)

        self.material_pointer_array = \
            make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

        self.surface_data = []
        self.surface_ptrs = []

        for i in range(len(geometry.unique_surfaces)):
            surface = geometry.unique_surfaces[i]

            if surface is None:
                # need something to copy to the surface array struct
                # that is the same size as a 64-bit pointer.
                # this pointer will never be used by the simulation.
                self.surface_ptrs.append(np.uint64(0))
                continue

            detect = interp_material_property(wavelengths, surface.detect)
            detect_gpu = ga.to_gpu(detect)
            absorb = interp_material_property(wavelengths, surface.absorb)
            absorb_gpu = ga.to_gpu(absorb)
            reemit = interp_material_property(wavelengths, surface.reemit)
            reemit_gpu = ga.to_gpu(reemit)
            reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
            reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
            reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
            reflect_specular_gpu = ga.to_gpu(reflect_specular)
            eta = interp_material_property(wavelengths, surface.eta)
            eta_gpu = ga.to_gpu(eta)
            k = interp_material_property(wavelengths, surface.k)
            k_gpu = ga.to_gpu(k)
            reemission_cdf = interp_material_property(wavelengths, surface.reemission_cdf)
            reemission_cdf_gpu = ga.to_gpu(reemission_cdf)
            
            if surface.dichroic_props:
                props = surface.dichroic_props
                transmit_pointers = []
                reflect_pointers = []
                angles_gpu = ga.to_gpu(np.asarray(props.angles,dtype=np.float32))
                self.surface_data.append(angles_gpu)
                
                for j, angle in enumerate(props.angles):
                    dichroic_reflect = interp_material_property(wavelengths, props.dichroic_reflect[j])
                    dichroic_reflect_gpu = ga.to_gpu(dichroic_reflect)
                    self.surface_data.append(dichroic_reflect_gpu)
                    reflect_pointers.append(dichroic_reflect_gpu)

                    dichroic_transmit = interp_material_property(wavelengths, props.dichroic_transmit[j])
                    dichroic_transmit_gpu = ga.to_gpu(dichroic_transmit)
                    self.surface_data.append(dichroic_transmit_gpu)
                    transmit_pointers.append(dichroic_transmit_gpu)
                
                reflect_arr_gpu = make_gpu_struct(8*len(reflect_pointers),reflect_pointers)
                self.surface_data.append(reflect_arr_gpu)
                transmit_arr_gpu = make_gpu_struct(8*len(transmit_pointers), transmit_pointers)
                self.surface_data.append(transmit_arr_gpu)
                dichroic_props = make_gpu_struct(dichroicprops_struct_size,[angles_gpu,reflect_arr_gpu,transmit_arr_gpu,np.uint32(len(props.angles))])
            else:
                dichroic_props = np.uint64(0) #NULL
            
            

            self.surface_data.append(detect_gpu)
            self.surface_data.append(absorb_gpu)
            self.surface_data.append(reemit_gpu)
            self.surface_data.append(reflect_diffuse_gpu)
            self.surface_data.append(reflect_specular_gpu)
            self.surface_data.append(eta_gpu)
            self.surface_data.append(k_gpu)
            self.surface_data.append(reemission_cdf_gpu)
            self.surface_data.append(dichroic_props)
            
            surface_gpu = \
                make_gpu_struct(surface_struct_size,
                                [detect_gpu, absorb_gpu, reemit_gpu,
                                 reflect_diffuse_gpu,reflect_specular_gpu,
                                 eta_gpu, k_gpu, reemission_cdf_gpu,
                                 dichroic_props,
                                 np.uint32(surface.model),
                                 np.uint32(len(wavelengths)),
                                 np.uint32(surface.transmissive),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0]),
                                 np.float32(surface.thickness)])

            self.surface_ptrs.append(surface_gpu)

        self.surface_pointer_array = \
            make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3,
                                     write_combined=True)
        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3,
                                      write_combined=True)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.triangles[:] = to_uint3(geometry.mesh.triangles)
        
        self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
        self.material_codes = ga.to_gpu(material_codes)
        colors = geometry.colors.astype(np.uint32)
        self.colors = ga.to_gpu(colors)
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        gpu_free, gpu_total = cuda.mem_get_info()
        node_array_usage = geometry.bvh.nodes.nbytes

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize),100),
            n_nodes
            )
        
        self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index)) # forbid zero size
        self.extra_nodes = mapped_empty(shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype,
                                        write_combined=True)
        if split_index < n_nodes:
            logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]

        # See if there is enough memory to put the triangles and/or vertices back on the GPU
        gpu_free, gpu_total = cuda.mem_get_info()
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            self.triangles = ga.to_gpu(self.triangles)
            logger.info('Optimization: Sufficient memory to move triangles onto GPU')

        gpu_free, gpu_total = cuda.mem_get_info()
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            self.vertices = ga.to_gpu(self.vertices)
            logger.info('Optimization: Sufficient memory to move vertices onto GPU')

        self.gpudata = make_gpu_struct(geometry_struct_size,
                                       [Mapped(self.vertices), 
                                        Mapped(self.triangles),
                                        self.material_codes,
                                        self.colors, self.nodes,
                                        Mapped(self.extra_nodes),
                                        self.material_pointer_array,
                                        self.surface_pointer_array,
                                        self.world_origin,
                                        self.world_scale,
                                        np.int32(len(self.nodes))])

        self.geometry = geometry

        if print_usage:
            self.print_device_usage()
        logger.info(self.device_usage_str())
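
interp_material_property expects each property as an (N, 2) array of (wavelength, value) pairs and resamples it linearly onto the common wavelength grid. A quick standalone illustration of that behaviour (values invented for the example):

import numpy as np

# (wavelength [nm], value) pairs, e.g. a refractive-index table
refractive_index = np.array([[300.0, 1.38],
                             [400.0, 1.35],
                             [600.0, 1.33]])

wavelengths = np.arange(300.0, 601.0, 100.0, dtype=np.float32)
resampled = np.interp(wavelengths, refractive_index[:, 0],
                      refractive_index[:, 1]).astype(np.float32)
print(resampled)  # approximately [1.38, 1.35, 1.34, 1.33]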
Example #9
    def __init__(self,
                 photons,
                 ncopies=1,
                 copy_flags=True,
                 copy_triangles=True,
                 copy_weights=True):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        if not copy_triangles:
            self.last_hit_triangles.fill(-1)
        if not copy_flags:
            self.flags = ga.zeros(shape=nphotons * ncopies, dtype=np.uint32)
        else:
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        if not copy_weights:
            self.weights = ga.ones_like(self.last_hit_triangles,
                                        dtype=np.float32)
        else:
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        if copy_triangles:
            self.last_hit_triangles[:nphotons].set(
                photons.last_hit_triangles.astype(np.int32))
        if copy_flags:
            self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        if copy_weights:
            self.weights[:nphotons].set(photons.weights.astype(np.float32))
        self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                self.evidx,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
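
The docstring refers to an iterate_copies() method that is not shown here. A hedged sketch of what such a method could look like, yielding per-copy slices of the flat GPU arrays (an assumption, not code from the example):

    def iterate_copies(self):
        """Yield (pos, dir, pol, wavelengths, t) slices, one per replica."""
        for i in range(self.ncopies):
            start = i * self.true_nphotons
            stop = (i + 1) * self.true_nphotons
            yield (self.pos[start:stop], self.dir[start:stop],
                   self.pol[start:stop], self.wavelengths[start:stop],
                   self.t[start:stop])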
Example #10
    def __init__(self, geometry, wavelengths=None, print_usage=False, min_free_gpu_mem=300e6):
        if wavelengths is None:
            wavelengths = standard_wavelengths

        try:
            wavelength_step = np.unique(np.diff(wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        geometry_source = get_cu_source('geometry_types.h')
        material_struct_size = characterize.sizeof('Material', geometry_source)
        surface_struct_size = characterize.sizeof('Surface', geometry_source)
        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        self.material_data = []
        self.material_ptrs = []

        def interp_material_property(wavelengths, property):
            # note that it is essential that the material properties be
            # interpolated linearly. this fact is used in the propagation
            # code to guarantee that probabilities still sum to one.
            return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)

        for i in range(len(geometry.unique_materials)):
            material = geometry.unique_materials[i]

            if material is None:
                raise Exception('one or more triangles is missing a material.')

            refractive_index = interp_material_property(wavelengths, material.refractive_index)
            refractive_index_gpu = ga.to_gpu(refractive_index)
            absorption_length = interp_material_property(wavelengths, material.absorption_length)
            absorption_length_gpu = ga.to_gpu(absorption_length)
            scattering_length = interp_material_property(wavelengths, material.scattering_length)
            scattering_length_gpu = ga.to_gpu(scattering_length)
            reemission_prob = interp_material_property(wavelengths, material.reemission_prob)
            reemission_prob_gpu = ga.to_gpu(reemission_prob)
            reemission_cdf = interp_material_property(wavelengths, material.reemission_cdf)
            reemission_cdf_gpu = ga.to_gpu(reemission_cdf)

            self.material_data.append(refractive_index_gpu)
            self.material_data.append(absorption_length_gpu)
            self.material_data.append(scattering_length_gpu)
            self.material_data.append(reemission_prob_gpu)
            self.material_data.append(reemission_cdf_gpu)

            material_gpu = \
                make_gpu_struct(material_struct_size,
                                [refractive_index_gpu, absorption_length_gpu,
                                 scattering_length_gpu,
                                 reemission_prob_gpu,
                                 reemission_cdf_gpu,
                                 np.uint32(len(wavelengths)),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0])])

            self.material_ptrs.append(material_gpu)

        self.material_pointer_array = \
            make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

        self.surface_data = []
        self.surface_ptrs = []

        for i in range(len(geometry.unique_surfaces)):
            surface = geometry.unique_surfaces[i]

            if surface is None:
                # need something to copy to the surface array struct
                # that is the same size as a 64-bit pointer.
                # this pointer will never be used by the simulation.
                self.surface_ptrs.append(np.uint64(0))
                continue

            detect = interp_material_property(wavelengths, surface.detect)
            detect_gpu = ga.to_gpu(detect)
            absorb = interp_material_property(wavelengths, surface.absorb)
            absorb_gpu = ga.to_gpu(absorb)
            reemit = interp_material_property(wavelengths, surface.reemit)
            reemit_gpu = ga.to_gpu(reemit)
            reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
            reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
            reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
            reflect_specular_gpu = ga.to_gpu(reflect_specular)
            eta = interp_material_property(wavelengths, surface.eta)
            eta_gpu = ga.to_gpu(eta)
            k = interp_material_property(wavelengths, surface.k)
            k_gpu = ga.to_gpu(k)
            reemission_cdf = interp_material_property(wavelengths, surface.reemission_cdf)
            reemission_cdf_gpu = ga.to_gpu(reemission_cdf)

            self.surface_data.append(detect_gpu)
            self.surface_data.append(absorb_gpu)
            self.surface_data.append(reemit_gpu)
            self.surface_data.append(reflect_diffuse_gpu)
            self.surface_data.append(reflect_specular_gpu)
            self.surface_data.append(eta_gpu)
            self.surface_data.append(k_gpu)
            self.surface_data.append(reemission_cdf_gpu)

            surface_gpu = \
                make_gpu_struct(surface_struct_size,
                                [detect_gpu, absorb_gpu, reemit_gpu,
                                 reflect_diffuse_gpu,reflect_specular_gpu,
                                 eta_gpu, k_gpu, reemission_cdf_gpu,
                                 np.uint32(surface.model),
                                 np.uint32(len(wavelengths)),
                                 np.uint32(surface.transmissive),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0]),
                                 np.float32(surface.thickness)])

            self.surface_ptrs.append(surface_gpu)

        self.surface_pointer_array = \
            make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3,
                                     write_combined=True)
        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3,
                                      write_combined=True)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.triangles[:] = to_uint3(geometry.mesh.triangles)
        
        self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
        self.material_codes = ga.to_gpu(material_codes)
        colors = geometry.colors.astype(np.uint32)
        self.colors = ga.to_gpu(colors)
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        gpu_free, gpu_total = cuda.mem_get_info()
        node_array_usage = geometry.bvh.nodes.nbytes

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize),100),
            n_nodes
            )
        
        self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index)) # forbid zero size
        self.extra_nodes = mapped_empty(shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype,
                                        write_combined=True)
        if split_index < n_nodes:
            logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]

        # See if there is enough memory to put the triangles and/or vertices back on the GPU
        gpu_free, gpu_total = cuda.mem_get_info()
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            self.triangles = ga.to_gpu(self.triangles)
            logger.info('Optimization: Sufficient memory to move triangles onto GPU')

        gpu_free, gpu_total = cuda.mem_get_info()
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            self.vertices = ga.to_gpu(self.vertices)
            logger.info('Optimization: Sufficient memory to move vertices onto GPU')

        self.gpudata = make_gpu_struct(geometry_struct_size,
                                       [Mapped(self.vertices), 
                                        Mapped(self.triangles),
                                        self.material_codes,
                                        self.colors, self.nodes,
                                        Mapped(self.extra_nodes),
                                        self.material_pointer_array,
                                        self.surface_pointer_array,
                                        self.world_origin,
                                        self.world_scale,
                                        np.int32(len(self.nodes))])

        self.geometry = geometry

        if print_usage:
            self.print_device_usage()
        logger.info(self.device_usage_str())
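
The split_index arithmetic above decides how many BVH nodes stay in device memory: it keeps at least min_free_gpu_mem bytes free, but never fewer than 100 nodes and never more than the BVH actually has. A quick numeric illustration (all numbers invented):

gpu_free = 2_000_000_000      # 2 GB reported free by cuda.mem_get_info()
min_free_gpu_mem = 300e6      # required headroom
node_itemsize = 16            # bytes per uint4 BVH node
n_nodes = 50_000_000

split_index = min(max(int((gpu_free - min_free_gpu_mem) / node_itemsize), 100),
                  n_nodes)
print(split_index)  # 50000000: the whole BVH fits; extra_nodes stays a 1-element stub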
Example #11
    def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1,
                 seed=None, cl_context=None):
        """
        Generates photons from information in the steps_arr
        
        Parameters
        ----------
        steps_arr : numpy.array with shape=(N,10) dtype=np.float
           contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constatn, slow_time_constatn ]
           in the future could generalize this to many different time components.
           developed for liquid argon TPCs.
        multiple : float
           scale up the number of photons generated (not implemented yet)
        """
        self.steps_array = steps_arr
        self.nsteps = self.steps_array.shape[0]
        if multiple!=1.0:
            raise RuntimeError('Have not implemented scaling of the number of photons generated.')

        # ===========================
        # GEN PHOTONS
        tstart_genphotons =  time.time()
        # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here)
        # on the CPU, we scan the steps to determine the total number of photons using poisson statistics
        # we assume the user has seeded the random number generator to her liking
        tstart_nphotons = time.time()
        self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 )
        #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int )
        self.nphotons_per_step = self.steps_array[:, self._nphotons].astype(np.int64)
        self.nphotons = int(self.nphotons_per_step.sum())
        print("NSTEPS: ", self.nsteps)
        print("NPHOTONS: ", self.nphotons, " (time to determine per step=", time.time() - tstart_nphotons, ")")
        # now we make an index array for which step we need to get info from
        self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 )
        current_index=0
        for n, n_per_step in enumerate( self.nphotons_per_step ):
            self.source_step_index[current_index:current_index+n_per_step] = n
            current_index += n_per_step
        # push everything to the GPU
        tstart_transfer = time.time()
        if api.is_gpu_api_cuda():
            # step info
            self.step_pos1_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio )
            self.source_step_index_gpu = ga.to_gpu( self.source_step_index )
            # photon info
            self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) )
            self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
        elif api.is_gpu_api_opencl():
            cl_queue = cl.CommandQueue( cl_context )
            # step info
            self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu  = ga.to_device( cl_queue, self.step_fsratio )
            self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index )
            # photon info
            self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
        
        self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) )
        self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) )
        self.t.set( self.steps_array[:,3] )
        self.ncopies = ncopies
        self.true_nphotons = self.nphotons

        if self.ncopies != 1:
            raise ValueError('support for multiple copies is not implemented')

        if api.is_gpu_api_cuda():
            self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True )
        elif api.is_gpu_api_opencl():
            self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True )
        self.gpufuncs = GPUFuncs( self.gpumod )
        print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer

        # need random numbers
        tgpu = time.time()
        if seed is None:
            seed = 5
        rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context)
        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu,
                                                    self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states,
                                                    self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights,
                                                    block=(nthreads_per_block,1,1), grid=(blocks, 1) )
            elif api.is_gpu_api_opencl():
                self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None,
                                                    np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data,
                                                    self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states.data,
                                                    self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, 
                                                    self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait()
                                                    
            else:
                raise RuntimeError("GPU API is neither CUDA nor OpenCL!")
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        tend_genphotons =  time.time()
        print "GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")"

        # Now load modules
        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu', options=api_options, include_source_directory=True)
        elif  api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)
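
A hedged sketch of building a steps_arr in the layout the docstring describes and handing it to this class (the class name GPUPhotonFromSteps is taken from the timing message above; the numeric values are purely illustrative):

import numpy as np

# One row per step:
# [x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio,
#  fast_time_constant, slow_time_constant]
steps_arr = np.array([
    [0.0, 0.0, 0.0, 0.00, 1.0, 0.0, 0.0, 5000, 0.3, 6.0, 1500.0],
    [1.0, 0.0, 0.0, 0.05, 2.0, 0.0, 0.0, 4000, 0.3, 6.0, 1500.0],
], dtype=np.float64)

gen = GPUPhotonFromSteps(steps_arr, nthreads_per_block=64, max_blocks=1024)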
Example #12
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to a multiple of this number.
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin, 
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)
    
    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), 
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block, 
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles), Mapped(vertices),
                              world_origin, world_scale,
                              nodes, morton_codes,
                              block=(nthreads_per_block,1,1),
                              grid=(nblocks_this_iter,1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
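
The morton_codes returned above are Z-order keys: each triangle's fixed-point coordinates are interleaved bit by bit so that spatially nearby triangles receive numerically nearby codes. The sketch below only illustrates that encoding under stated assumptions (quantization against world_origin/world_scale, morton_bits per axis); the actual layout is defined by the make_leaves kernel in bvh.cu.

def morton_code_sketch(point, world_origin, world_scale, morton_bits=16):
    """Illustrative 3-D Morton (Z-order) code for a single point."""
    code = 0
    for axis in range(3):
        # quantize into the WorldCoords fixed-point system
        q = int((point[axis] - world_origin[axis]) / world_scale)
        for b in range(morton_bits):
            code |= ((q >> b) & 1) << (3 * b + axis)
    return code
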
Exemplo n.º 13
0
    def __init__(self,
                 geometry,
                 wavelengths=None,
                 print_usage=False,
                 min_free_gpu_mem=300e6,
                 cl_context=None,
                 cl_queue=None):
        log.info("GPUGeometry.__init__ min_free_gpu_mem %s ", min_free_gpu_mem)

        self.geometry = geometry
        self.instance_count += 1
        assert self.instance_count == 1, traceback.print_stack()

        self.metadata = Metadata()
        self.metadata(None, 'preinfo')
        self.metadata('a', "start")
        self.metadata['a_min_free_gpu_mem'] = min_free_gpu_mem

        if wavelengths is None:
            self.wavelengths = standard_wavelengths
        else:
            self.wavelengths = wavelengths
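        # The wavelength grid must be uniform: np.diff() then has exactly one
        # unique spacing, and .item() below raises ValueError otherwise.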

        try:
            self.wavelength_step = np.unique(np.diff(self.wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        # This is where things get difficult:
        # pycuda and pyopencl give us very different methods for working with structs
        #geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        # Note that, unfortunately, the data types returned by the two APIs are very different.
        if api.is_gpu_api_cuda():
            self.material_data, self.material_ptrs, self.material_pointer_array = self._package_material_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
            self.surface_data, self.surface_ptrs, self.surface_pointer_array = self._package_surface_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
        elif api.is_gpu_api_opencl():
            self.material_data, materials_bytes_cl = self._package_material_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)
            self.surface_data, surfaces_bytes_cl = self._package_surface_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)

        self.metadata('b', "after materials,surfaces")
        if api.is_gpu_api_opencl():
            self.metadata[
                'b_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl  # opencl, we have to track this ourselves

        # Load Vertices and Triangles
        if api.is_gpu_api_cuda():
            self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                         dtype=ga.vec.float3,
                                         write_combined=True)
            self.vertices4 = np.zeros(shape=(len(self.vertices), 4),
                                      dtype=np.float32)
            self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                          dtype=ga.vec.uint3,
                                          write_combined=True)
            self.triangles4 = np.zeros(shape=(len(self.triangles), 4),
                                       dtype=np.uint32)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.vertices4[:, :-1] = self.vertices.ravel().view(
                np.float32).reshape(len(self.vertices), 3)  # for textures
            self.triangles[:] = to_uint3(geometry.mesh.triangles)
            self.triangles4[:, :-1] = self.triangles.ravel().view(
                np.uint32).reshape(len(self.triangles), 3)  # for textures
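            # The 4-component copies exist because CUDA textures fetch 1-, 2-
            # or 4-component elements only, so the float3/uint3 data is padded
            # with a dummy fourth component before being bound to textures.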
        elif api.is_gpu_api_opencl():
            self.vertices = ga.empty(cl_queue,
                                     len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3)
            self.triangles = ga.empty(cl_queue,
                                      len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.triangles[:] = to_uint3(geometry.mesh.triangles)

        if api.is_gpu_api_cuda():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
        elif api.is_gpu_api_opencl():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
            #self.world_origin = geometry.bvh.world_coords.world_origin
            self.world_origin = ga.to_device(cl_queue, self.world_origin)
            print(type(self.world_origin), self.world_origin)
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        # Load material and surface indices into 8-bit codes
        # check if we've reached a complexity threshold
        if len(geometry.unique_materials) >= int(0xff):
            raise ValueError(
                'Number of materials to index has hit maximum of %d' %
                (int(0xff)))
        if len(geometry.unique_surfaces) >= int(0xff):
            raise ValueError(
                'Number of surfaces to index has hit maximum of %d' %
                (int(0xff)))
        # make bit code
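        # Packed layout of each uint32 code:
        #   bits 31-24: material1_index
        #   bits 23-16: material2_index
        #   bits 15-8 : surface_index
        #   bits  7-0 : unused by this packing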
        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(
                              np.uint32)
        if api.is_gpu_api_cuda():
            self.material_codes = ga.to_gpu(material_codes)
        elif api.is_gpu_api_opencl():
            self.material_codes = ga.to_device(cl_queue, material_codes)

        # assign color codes
        colors = geometry.colors.astype(np.uint32)
        if api.is_gpu_api_cuda():
            self.colors = ga.to_gpu(colors)
            self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
        elif api.is_gpu_api_opencl():
            self.colors = ga.to_device(cl_queue, colors)
            self.solid_id_map = ga.to_device(
                cl_queue, geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        self.metadata('c', "after colors, idmap")
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            meshdef_nbytes_cl = (self.vertices.nbytes + self.triangles.nbytes +
                                 self.world_origin.nbytes + self.world_scale.nbytes +
                                 self.material_codes.nbytes + self.colors.nbytes +
                                 self.solid_id_map.nbytes)
            self.metadata[
                'c_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl
            gpu_free = gpu_total - (materials_bytes_cl + surfaces_bytes_cl +
                                    meshdef_nbytes_cl)

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
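        # i.e. clamp (gpu_free - min_free_gpu_mem) // bvh.nodes.itemsize
        # into the range [100, n_nodes]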
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(
                int((gpu_free - min_free_gpu_mem) /
                    geometry.bvh.nodes.itemsize), 100), n_nodes)
        print "split index=", split_index, " vs. total nodes=", n_nodes

        # push nodes to GPU
        if api.is_gpu_api_cuda():
            self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        elif api.is_gpu_api_opencl():
            self.nodes = ga.to_device(cl_queue,
                                      geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index))  # forbid zero size

        # left over nodes
        if api.is_gpu_api_cuda():
            self.extra_nodes = mapped_empty(shape=n_extra,
                                            dtype=geometry.bvh.nodes.dtype,
                                            write_combined=True)
        elif api.is_gpu_api_opencl():
            self.extra_nodes = ga.empty(cl_queue,
                                        shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype)

        if split_index < n_nodes:
            log.info('Splitting BVH between GPU and CPU memory at node %d' %
                     split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]
            splitting = 1
        else:
            splitting = 0

        self.metadata('d', "after nodes")
        if api.is_gpu_api_opencl():
            nodes_nbytes_cl = self.nodes.nbytes
            self.metadata[
                'd_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl + nodes_nbytes_cl
        self.metadata.array("d_nodes", geometry.bvh.nodes)
        self.metadata['d_split_index'] = split_index
        self.metadata['d_extra_nodes_count'] = n_extra
        self.metadata['d_splitting'] = splitting
        self.print_device_usage(cl_context=cl_context)

        # See if there is enough memory to put the vertices and/or triangles directly on the GPU
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']
        self.metadata.array('e_triangles', self.triangles)
        if api.is_gpu_api_cuda():
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                self.triangles = ga.to_gpu(self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0
        elif api.is_gpu_api_opencl():
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                #self.triangles = ga.to_device(cl_queue,self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0

        self.metadata('e', "after triangles")
        self.metadata['e_triangles_gpu'] = ftriangles_gpu

        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']

        self.metadata.array('f_vertices', self.vertices)

        if api.is_gpu_api_cuda():
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0
        elif api.is_gpu_api_opencl():
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                #self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0

        self.metadata('f', "after vertices")
        self.metadata['f_vertices_gpu'] = vertices_gpu

        if api.is_gpu_api_cuda():
            geometry_source = cutools.get_cu_source('geometry_types.h')
            geometry_struct_size = characterize.sizeof('Geometry',
                                                       geometry_source)
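            # NOTE: the argument order below is expected to match the field
            # order of the Geometry struct defined in geometry_types.h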
            self.gpudata = make_gpu_struct(geometry_struct_size, [
                Mapped(self.vertices),
                Mapped(self.triangles), self.material_codes, self.colors,
                self.nodes,
                Mapped(self.extra_nodes), self.material_pointer_array,
                self.surface_pointer_array, self.world_origin,
                self.world_scale,
                np.int32(len(self.nodes))
            ])
        elif api.is_gpu_api_opencl():
            # There is no direct way to pass a struct into an OpenCL kernel,
            # so we pass everything as separate arrays and rebuild the
            # geometry struct later, inside the kernel.
            # Provided below is an example/test of passing the data this way.
            #if True:  # for debugging
            if False:
                print("loading geometry_structs.cl")
                geostructsmod = cltools.get_cl_module(
                    "geometry_structs.cl",
                    cl_context,
                    options=cltools.cl_options,
                    include_source_directory=True)
                geostructsfunc = GPUFuncs(geostructsmod)
                geostructsfunc.make_geostruct(
                    cl_queue, (3, ), None, self.vertices.data,
                    self.triangles.data, self.material_codes.data,
                    self.colors.data, self.nodes.data, self.extra_nodes.data,
                    np.int32(len(geometry.unique_materials)),
                    self.material_data['refractive_index'].data,
                    self.material_data['absorption_length'].data,
                    self.material_data['scattering_length'].data,
                    self.material_data['reemission_prob'].data,
                    self.material_data['reemission_cdf'].data,
                    np.int32(len(geometry.unique_surfaces)),
                    self.surface_data['detect'].data,
                    self.surface_data['absorb'].data,
                    self.surface_data['reemit'].data,
                    self.surface_data['reflect_diffuse'].data,
                    self.surface_data['reflect_specular'].data,
                    self.surface_data['eta'].data, self.surface_data['k'].data,
                    self.surface_data['reemission_cdf'].data,
                    self.surface_data['model'].data,
                    self.surface_data['transmissive'].data,
                    self.surface_data['thickness'].data,
                    self.surface_data['nplanes'].data,
                    self.surface_data['wire_diameter'].data,
                    self.surface_data['wire_pitch'].data,
                    self.world_origin.data, self.world_scale,
                    np.int32(len(self.nodes)), self.material_data['n'],
                    self.material_data['step'],
                    self.material_data["wavelength0"])
                cl_queue.finish()
                self.material_codes.get()
                raise RuntimeError('bail')
        if print_usage:
            self.print_device_usage(cl_context=cl_context)
        log.info(self.device_usage_str(cl_context=cl_context))
        self.metadata('g', "after geometry struct")
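
As a closing aside, the 8-bit material/surface codes packed earlier in __init__ can be unpacked on the host with a few shifts. This is a minimal numpy sketch of that inverse mapping; the helper name is ours, not part of chroma.

import numpy as np

def unpack_material_codes(material_codes):
    """Invert the (material1 << 24) | (material2 << 16) | (surface << 8) packing."""
    codes = np.asarray(material_codes, dtype=np.uint32)
    material1_index = (codes >> 24) & 0xff
    material2_index = (codes >> 16) & 0xff
    surface_index = (codes >> 8) & 0xff
    return material1_index, material2_index, surface_index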