示例#1
0
    def sort(self):
        """Run the radix sort and refresh ``keys`` and ``values`` in place.

        ``radixSortKeysOnly`` is invoked with ``self.keyBits``. Afterwards
        ``dkeys`` and ``dvalues`` are copied into ``sortedkeys`` and
        ``sortedvalues``, and the first ``self.n`` entries of each are
        written into the existing ``keys`` and ``values`` arrays.

        """
        self.radixSortKeysOnly(self.keyBits)

        # fetch the results into the staging arrays
        queue = self.queue
        clu.enqueue_copy(queue, dst=self.sortedkeys, src=self.dkeys)
        clu.enqueue_copy(queue, dst=self.sortedvalues, src=self.dvalues)

        # slice assignment keeps the identity of keys/values intact; only
        # the leading n entries of the staging arrays are used
        count = self.n
        self.keys[:] = self.sortedkeys[:count]
        self.values[:] = self.sortedvalues[:count]
示例#2
0
    def test_move_particles(self):
        """Move the particles, set the dirty flag to true and recompute.

        New x and y coordinates are written directly into the OpenCL
        buffers of the first particle array. The array is then marked
        dirty and ``particles.update()`` is called. After
        ``domain_manager.enqueue_copy()`` the ``ix``, ``iy`` and ``cellids``
        entries for the array named ``'test'`` are checked for each of the
        four particles.

        """
        particles = self.particles
        particles.setup_cl(self.ctx)

        pa = particles.arrays[0]
        domain_manager = particles.domain_manager

        q = cl.CommandQueue(self.ctx)

        device_x = pa.get_cl_buffer('x')
        device_y = pa.get_cl_buffer('y')

        xnew = numpy.array([1, 0, 0, 1]).astype(numpy.float32)
        ynew = numpy.array([0, 0, 1, 1]).astype(numpy.float32)

        # overwrite the particle positions on the device
        enqueue_copy(q, src=xnew, dst=device_x)
        enqueue_copy(q, src=ynew, dst=device_y)

        pa.set_dirty(True)

        particles.update()

        # NOTE(review): these references are taken before enqueue_copy();
        # the assertions below presumably rely on the copy filling the same
        # arrays in place -- confirm against the domain manager.
        cellids = domain_manager.cellids['test']
        ix = domain_manager.ix['test']
        iy = domain_manager.iy['test']

        domain_manager.enqueue_copy()

        for i, expected in enumerate((1, 0, 0, 1)):
            self.assertEqual(ix[i], expected)

        for i, expected in enumerate((0, 0, 1, 1)):
            self.assertEqual(iy[i], expected)

        for i, expected in enumerate((1, 0, 2, 3)):
            self.assertEqual(cellids[i], expected)
示例#3
0
    def test_create_cl_buffers(self):
        """Test the creation of the OpenCL arrays.

        After ``setup_cl`` every property of the particle array must have a
        ``'cl_' + name`` entry in ``cl_properties``, and the contents read
        back from its OpenCL buffer must match the array returned by
        ``pa.get(name)`` element by element (to 10 places).

        """
        pa = self.pa

        # create the OpenCL buffers
        pa.setup_cl(self.ctx, self.queue)

        for prop in pa.properties:

            # membership test instead of dict.has_key(), which was removed
            # in Python 3; `in` behaves the same on Python 2
            self.assertTrue(('cl_' + prop) in pa.cl_properties)

            # the OpenCL buffer and the PySPH numpy array for the property
            cl_buffer = pa.get_cl_buffer(prop)
            pysph_arr = pa.get(prop)

            # scratch array that receives the contents of the buffer
            host_copy = numpy.ones_like(pysph_arr)

            dtype = pa.properties[prop].get_c_type()
            if pa.cl_precision == "single":
                # in single precision both sides of the comparison are
                # narrowed: double -> float32, long -> int32
                if dtype == "double":
                    host_copy = host_copy.astype(numpy.float32)
                    pysph_arr = pysph_arr.astype(numpy.float32)
                elif dtype == "long":
                    host_copy = host_copy.astype(numpy.int32)
                    pysph_arr = pysph_arr.astype(numpy.int32)

            cl_utils.enqueue_copy(self.queue, dst=host_copy, src=cl_buffer)

            self.assertEqual(len(host_copy), len(pysph_arr))

            for actual, expected in zip(host_copy, pysph_arr):
                self.assertAlmostEqual(actual, expected, 10)
示例#4
0
    def _permute(self, bits):
        """Launch the permute kernel.

        Using the host-scanned thread histograms, this kernel shuffles
        the array values in the keys and values to perform the actual
        sort.

        We first copy the scanned histograms to the device, compute
        local mem size and then launch the kernel. After the kernel
        launch, the sorted keys and values are read back to the host
        and are also copied into ``dkeys``/``dvalues`` for the next pass.

        Parameters
        ----------
        bits
            Forwarded unchanged to the ``permute`` kernel.

        """
        q = self.queue

        # copy the scanned histograms to the device
        clu.enqueue_copy(q, src=self.histograms,
                         dst=self.dscanedhistograms)

        # global and local sizes; floor division keeps the work size an
        # integer under Python 3 as well as Python 2
        global_sizes = (self.nelements // self.radices,)
        local_sizes = (self.group_size,)

        # allocate local memory for the permute kernel launch
        local_mem_size = self.group_size * self.radices * 2
        local_mem = cl.LocalMemory(size=local_mem_size)

        # enqueue the kernel and block until it has finished
        self.program.permute(q, global_sizes, local_sizes,
                             self.dkeys, self.dvalues,
                             self.dscanedhistograms,
                             bits, local_mem,
                             self.dsortedkeys, self.dsortedvalues).wait()

        # read sorted results back to the host
        clu.enqueue_copy(q, src=self.dsortedkeys, dst=self.sortedkeys)
        clu.enqueue_copy(q, src=self.dsortedvalues, dst=self.sortedvalues)

        # make the sorted output the input buffers (dkeys/dvalues) again
        clu.enqueue_copy(q, src=self.dsortedkeys, dst=self.dkeys)
        clu.enqueue_copy(q, src=self.dsortedvalues, dst=self.dvalues)
示例#5
0
    def _histogram(self, bits):
        """Launch the histogram kernel.

        Each thread will load its work region (256 values) into
        shared memory and will compute the histogram/frequency of
        occurrence of each element. Remember that the implementation
        assumes that we sort the 32 bit keys and values 8 bits at a
        time and as such the histogram bins/buckets for each thread
        are also 256.

        The kernel reads the keys that are already in ``self.dkeys``; no
        host-to-device copy is issued here. We calculate the local memory
        size and then launch the kernel.

        After the kernel launch, we read the computed thread
        histograms to the host, where these will be scanned.

        Parameters
        ----------
        bits
            Forwarded unchanged to the ``histogram`` kernel.

        """
        q = self.queue

        # global/local sizes; floor division keeps the work size an
        # integer under Python 3 as well as Python 2
        global_sizes = (self.nelements // self.radices,)
        local_sizes = (self.group_size,)

        # allocate the local memory for the histogram kernel
        local_mem_size = self.group_size * self.radices * 2
        local_mem = cl.LocalMemory(size=local_mem_size)

        # enqueue the kernel and block until it has finished
        self.program.histogram(q, global_sizes, local_sizes,
                               self.dkeys, self.dhistograms,
                               bits, local_mem).wait()

        # read the result to the host buffer
        clu.enqueue_copy(q, src=self.dhistograms, dst=self.histograms)
示例#6
0
    def enqueue_copy(self):
        """ Copy the Buffer contents to the host

        For every particle array, the ``dcellids``, ``dindices`` and
        ``dcell_counts`` entries are copied into the matching ``cellids``,
        ``indices`` and ``cell_counts`` entries. Nothing happens unless
        ``with_cl`` is set.

        """
        if not self.with_cl:
            return

        queue = self.queue
        for pa in self.arrays:
            name = pa.name

            enqueue_copy(queue, dst=self.cellids[name],
                         src=self.dcellids[name])

            enqueue_copy(queue, dst=self.indices[name],
                         src=self.dindices[name])

            enqueue_copy(queue, dst=self.cell_counts[name],
                         src=self.dcell_counts[name])
示例#7
0
    def _cl_update(self):
        """Update the data structures.

        The following three steps are performed in order for every
        particle array:

        (a) The particles are binned using a standard algorithm like the one
            for linked lists.

        (b) Sort the resulting cellids (keys) and indices (values) using
            the RadixSort objects

        (c) Compute the cell counts by examining the sorted cellids

        Each kernel launch is waited on before the next step starts.

        """
        # context and queue
        ctx = self.context
        q = self.queue

        # get the cell limits
        ncx, ncy, ncz = self.ncx, self.ncy, self.ncz
        mcx, mcy, mcz = self.mcx, self.mcy, self.mcz

        # the local size is the same for every array and both kernels
        local_sizes = (1, 1, 1)

        for i in range(self.narrays):
            pa = self.arrays[i]
            name = pa.name
            nparticles = pa.get_number_of_particles()

            # get launch parameters for this array
            global_sizes = (nparticles, 1, 1)

            x = pa.get_cl_buffer("x")
            y = pa.get_cl_buffer("y")
            z = pa.get_cl_buffer("z")

            # host arrays and device buffers for this particle array
            cellids = self.cellids[name]
            indices = self.indices[name]
            cell_counts = self.cell_counts[name]

            dcellids = self.dcellids[name]
            dcell_counts = self.dcell_counts[name]

            # bin the particles to get device cellids
            self.prog.bin(q, global_sizes, local_sizes,
                          x, y, z, dcellids, self.cell_size,
                          ncx, ncy, ncz, mcx, mcy, mcz).wait()

            # read the cellids into host array
            clu.enqueue_copy(q, src=dcellids, dst=cellids)

            # sort with cellids as the keys and indices as the values
            rsort = self.rsort[name]
            rsort.initialize(cellids, indices, ctx)
            rsort.sort()

            # the cell counts are computed from the sorter's dkeys
            self.prog.compute_cell_counts(q, global_sizes, local_sizes,
                                          rsort.dkeys, dcell_counts,
                                          numpy.uint32(self.ncells),
                                          numpy.uint32(nparticles)).wait()

            # read the result back to host
            # THIS MAY NEED TO BE DONE OR WE COULD SIMPLY LET IT RESIDE
            # ON THE DEVICE.
            clu.enqueue_copy(q, src=dcell_counts, dst=cell_counts)
示例#8
0
    def enqueue_copy(self):
        """ Copy the Buffer contents to the host

        For every particle array, each of the following attribute pairs is
        copied, source first:

        dcellids -> cellids, dhead -> head, dnext -> Next,
        dix -> ix, diy -> iy, diz -> iz

        Nothing happens unless ``with_cl`` is set.

        """
        if not self.with_cl:
            return

        # (destination attribute, source attribute), in copy order
        buffer_pairs = (
            ("cellids", "dcellids"),
            ("head", "dhead"),
            ("Next", "dnext"),
            ("ix", "dix"),
            ("iy", "diy"),
            ("iz", "diz"),
        )

        for pa in self.arrays:
            for dst_attr, src_attr in buffer_pairs:
                enqueue_copy(self.queue,
                             dst=getattr(self, dst_attr)[pa.name],
                             src=getattr(self, src_attr)[pa.name])