예제 #1
0
    def set_node_bounds(self):
        """Allocate the per-node bound properties and run the kernel that
        fills them.

        ``node_xmin`` and ``node_xmax`` are allocated with the vector dtype
        for ``(self.c_type, self.dim)`` and ``node_hmax`` with the scalar
        dtype for ``self.c_type``. A bottom-up tree kernel is then built
        from the pieces returned by ``_get_node_bound_kernel_parameters``
        and launched with the particle ids, the coordinate arrays named in
        ``self.xvars``, the ``h`` array, ``radius_scale`` and the three
        freshly allocated node properties.
        """
        vector_data_t = get_vector_dtype(self.c_type, self.dim)
        # Scalar dtype: used for the node_hmax allocation and, further down,
        # to cast radius_scale for the kernel call. Computed once.
        dtype = ctype_to_dtype(self.c_type)

        self.node_xmin = self.allocate_node_prop(vector_data_t)
        self.node_xmax = self.allocate_node_prop(vector_data_t)
        self.node_hmax = self.allocate_node_prop(dtype)

        params = _get_node_bound_kernel_parameters(self.dim, self.c_type,
                                                   self.xvars)
        preamble = _get_macros_preamble(self.c_type, self.sorted, self.dim)
        # Named `kernel` (not `set_node_bounds`) so the local does not shadow
        # this method's own name.
        kernel = self.tree_bottom_up(params['args'],
                                     params['setup'],
                                     params['leaf_operation'],
                                     params['node_operation'],
                                     params['output_expr'],
                                     preamble=preamble)
        kernel = profile_kernel(kernel, 'set_node_bounds', backend='opencl')

        pa_gpu = self.pa.gpu

        # Argument order must match the kernel signature produced above:
        # tree, pids, one array per coordinate variable, h, radius_scale,
        # then the output node properties.
        args = [self, self.pids.dev]
        args.extend(getattr(pa_gpu, v).dev for v in self.xvars)
        args.extend([
            pa_gpu.h.dev,
            dtype(self.radius_scale),
            self.node_xmin.dev,
            self.node_xmax.dev,
            self.node_hmax.dev,
        ])

        kernel(*args)
예제 #2
0
    def find_neighbor_lengths_elementwise(self, neighbor_cid_count,
                                          neighbor_cids, tree_src,
                                          neighbor_count):
        """Launch the ``find_neighbor_counts_elementwise`` kernel.

        ``self`` acts as the destination tree and ``tree_src`` as the source
        tree; compatibility between the two is checked first. The arguments
        ``neighbor_cid_count``, ``neighbor_cids`` and ``neighbor_count`` are
        handed to the kernel through their ``.dev`` attribute.
        """
        self.check_nnps_compatibility(tree_src)

        dst_gpu = self.pa.gpu
        src_gpu = tree_src.pa.gpu
        scalar_t = ctype_to_dtype(self.c_type)

        kernel = self.helper.get_kernel(
            'find_neighbor_counts_elementwise', sorted=self.sorted
        )

        # Position and smoothing-length arrays, source first, then
        # destination, in the order the kernel expects them.
        particle_fields = ('x', 'y', 'z', 'h')
        src_arrays = [getattr(src_gpu, name).dev for name in particle_fields]
        dst_arrays = [getattr(dst_gpu, name).dev for name in particle_fields]

        kernel_args = [
            self.unique_cids_map.dev,
            tree_src.pids.dev,
            self.pids.dev,
            self.cids.dev,
            tree_src.pbounds.dev,
            self.pbounds.dev,
        ]
        kernel_args += src_arrays
        kernel_args += dst_arrays
        kernel_args += [
            scalar_t(self.radius_scale),
            neighbor_cid_count.dev,
            neighbor_cids.dev,
            neighbor_count.dev,
        ]

        kernel(*kernel_args)
예제 #3
0
    def find_neighbor_lengths(self,
                              neighbor_cid_count,
                              neighbor_cids,
                              tree_src,
                              neighbor_count,
                              use_partitions=False):
        """Launch the ``find_neighbor_counts`` kernel, optionally in two
        partitioned passes.

        ``self`` is the destination tree and ``tree_src`` the source tree;
        their compatibility is checked first. With ``use_partitions`` set
        and a leaf size above 32, the kernel is launched once per partition
        returned by ``get_leaf_size_partitions``, the first with a smaller
        work-group size. Otherwise a single launch covers
        ``self.unique_cids``.
        """
        self.check_nnps_compatibility(tree_src)

        wgs = self.leaf_size
        dst_gpu = self.pa.gpu
        src_gpu = tree_src.pa.gpu
        scalar_t = ctype_to_dtype(self.c_type)

        def launch(cids, n_cids, local_size, q=None):
            # One work-group of `local_size` items per entry in `cids`.
            kernel = self.helper.get_kernel(
                'find_neighbor_counts', sorted=self.sorted, wgs=wgs)
            kernel_args = [
                cids.dev,
                tree_src.pids.dev,
                self.pids.dev,
                self.cids.dev,
                tree_src.pbounds.dev,
                self.pbounds.dev,
                src_gpu.x.dev,
                src_gpu.y.dev,
                src_gpu.z.dev,
                src_gpu.h.dev,
                dst_gpu.x.dev,
                dst_gpu.y.dev,
                dst_gpu.z.dev,
                dst_gpu.h.dev,
                scalar_t(self.radius_scale),
                neighbor_cid_count.dev,
                neighbor_cids.dev,
                neighbor_count.dev,
            ]
            kernel(*kernel_args,
                   gs=(local_size * n_cids, ),
                   ls=(local_size, ),
                   queue=(get_queue() if q is None else q))

        if not (use_partitions and wgs > 32):
            # Single launch over every unique cell id.
            launch(self.unique_cids, self.unique_cid_count, wgs)
            return

        # Work-group size used for the first partition.
        wgs1 = 32 if wgs < 128 else 64

        m1, n1 = self.get_leaf_size_partitions(0, wgs1)
        launch(m1, n1, min(wgs, wgs1))

        m2, n2 = self.get_leaf_size_partitions(wgs1, wgs)
        launch(m2, n2, wgs)
예제 #4
0
 def _bin(self):
     """Launch the ``fill_particle_data`` kernel.

     The kernel receives one device array per coordinate variable in
     ``self.xvars``, the cell size cast to the scalar dtype, a vector built
     from the first ``self.dim`` components of ``self.xmin``, and the
     ``sfc`` and ``pids`` device arrays (presumably the outputs; confirm
     against the kernel source).
     """
     scalar_t = ctype_to_dtype(self.c_type)
     kernel = self.helper.get_kernel("fill_particle_data",
                                     dim=self.dim,
                                     xvars=self.xvars)
     gpu = self.pa.gpu

     kernel_args = [getattr(gpu, name).dev for name in self.xvars]
     kernel_args.extend([
         scalar_t(self.cell_size),
         self.make_vec(*(self.xmin[k] for k in range(self.dim))),
         self.sfc.dev,
         self.pids.dev,
     ])
     kernel(*kernel_args)
예제 #5
0
    def find_neighbors(self, neighbor_cid_count, neighbor_cids, tree_src,
                       start_indices, neighbors, use_partitions=False):
        """Launch the ``find_neighbors`` kernel for this (destination) tree
        against ``tree_src``.

        Parameters
        ----------
        neighbor_cid_count, neighbor_cids, start_indices, neighbors
            Array wrappers handed to the kernel through their ``.dev``
            attribute.
        tree_src
            Source tree; must pass ``check_nnps_compatibility``.
        use_partitions : bool
            If true and the work-group size exceeds 32, try to split the
            work into two launches using ``get_leaf_size_partitions``. The
            split is used only when the first partition holds more than
            30% of the unique cell ids; otherwise a single launch is done.

        Notes
        -----
        The work-group size is ``leaf_size`` rounded up to the next
        multiple of 32.
        """
        self.check_nnps_compatibility(tree_src)

        wgs = self.leaf_size if self.leaf_size % 32 == 0 else \
            self.leaf_size + 32 - self.leaf_size % 32
        pa_gpu_dst = self.pa.gpu
        pa_gpu_src = tree_src.pa.gpu
        dtype = ctype_to_dtype(self.c_type)

        def find_neighbors_for_partition(partition_cids, partition_size,
                                         partition_wgs, q=None):
            # One work-group of `partition_wgs` items per entry in
            # `partition_cids`.
            find_neighbors = self.helper.get_kernel('find_neighbors',
                                                    sorted=self.sorted,
                                                    wgs=wgs)
            find_neighbors(partition_cids.dev, tree_src.pids.dev,
                           self.pids.dev,
                           self.cids.dev,
                           tree_src.pbounds.dev, self.pbounds.dev,
                           pa_gpu_src.x.dev, pa_gpu_src.y.dev, pa_gpu_src.z.dev,
                           pa_gpu_src.h.dev,
                           pa_gpu_dst.x.dev, pa_gpu_dst.y.dev, pa_gpu_dst.z.dev,
                           pa_gpu_dst.h.dev,
                           dtype(self.radius_scale),
                           neighbor_cid_count.dev,
                           neighbor_cids.dev,
                           start_indices.dev,
                           neighbors.dev,
                           gs=(partition_wgs * partition_size,),
                           ls=(partition_wgs,),
                           queue=(get_queue() if q is None else q))

        if use_partitions and wgs > 32:
            wgs1 = 32 if wgs < 128 else 64

            m1, n1 = self.get_leaf_size_partitions(0, wgs1)
            fraction = (n1 / int(self.unique_cid_count))

            # Only split when the first partition is large enough to be
            # worth a separate launch.
            if fraction > 0.3:
                find_neighbors_for_partition(m1, n1, wgs1)
                m2, n2 = self.get_leaf_size_partitions(wgs1, wgs)
                assert (n1 + n2 == self.unique_cid_count)
                find_neighbors_for_partition(m2, n2, wgs)
                return

        # Single launch over every unique cell id. This is also the
        # fall-through when partitioning was requested but rejected by the
        # fraction test; previously that path launched nothing and left
        # `neighbors` unfilled.
        find_neighbors_for_partition(
            self.unique_cids, self.unique_cid_count, wgs)