Пример #1
0
    def check_element_prop_threshold(self,
                                     element_property,
                                     threshold,
                                     refine_flags,
                                     debug,
                                     wait_for=None):
        """Flag elements whose *element_property* exceeds *threshold*.

        :arg element_property: per-element DOF array, flattened before use.
        :arg threshold: scalar cutoff passed to the checker kernel.
        :arg refine_flags: device array of refine flags; the kernel may
            update it (the debug path compares its sum before and after).
        :arg debug: if *True*, log how many new elements were flagged.
        :arg wait_for: optional list of events the kernel waits for.
        :returns: ``True`` if the kernel reported that it updated any flag.
        """
        actx = self.array_context
        knl = self.code_container.element_prop_threshold_checker()

        if debug:
            # Snapshot the flag count so the delta can be logged below.
            nelements_to_refine_prev = actx.to_numpy(
                actx.np.sum(refine_flags)).item()

        element_property = flatten(element_property, self.array_context)

        evt, out = knl(actx.queue,
                       element_property=element_property,
                       refine_flags=refine_flags,
                       refine_flags_updated=np.array(0),
                       threshold=np.array(threshold),
                       wait_for=wait_for)

        import pyopencl as cl
        cl.wait_for_events([evt])

        if debug:
            nelements_to_refine = actx.to_numpy(
                actx.np.sum(refine_flags)).item()
            if nelements_to_refine > nelements_to_refine_prev:
                logger.debug("refiner: found %d element(s) to refine",
                             nelements_to_refine - nelements_to_refine_prev)

        return out["refine_flags_updated"] == 1
Пример #2
0
    def check_expansion_disks_undisturbed_by_sources(self,
            stage1_density_discr, tree, peer_lists,
            expansion_disturbance_tolerance,
            refine_flags,
            debug, wait_for=None):
        """Run an area query around each QBX center and flag panels whose
        expansion disks are disturbed by nearby sources.

        :arg refine_flags: device array of per-panel refine flags; the
            kernel may set entries (debug path compares sums).
        :arg debug: if *True*, log how many new panels were flagged.
        :arg wait_for: optional list of events the kernel waits for.
        :returns: ``True`` if the kernel flagged at least one panel.
        """

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(
                tree.nlevels, MAX_LEVELS_INCREMENT)

        knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        if debug:
            # Snapshot the flag count so the delta can be logged below.
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        # Single-entry device flag the kernel sets when it refines anything.
        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        # Query radii: the expansion radii at center granularity.
        from pytential import bind, sym
        center_danger_zone_radii = flatten(
            bind(stage1_density_discr,
                sym.expansion_radii(stage1_density_discr.ambient_dim,
                    granularity=sym.GRANULARITY_CENTER))(self.array_context))

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_panel_to_center_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                center_danger_zone_radii,
                expansion_disturbance_tolerance,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxcenters),
            queue=self.queue,
            wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return found_panel_to_refine.get()[0] == 1
Пример #3
0
    def _write_time_shift(self, queues):
        """Estimate the time shift between devices with respect to a
        global clock. This is important for evaluating relative device
        runtimes with respect to each other.
        """
        # Get only a single command queue for a device on which we will
        # determine the zero time of a device.
        unique_queues = []
        devices = []
        for queue in queues:
            if queue.device not in devices:
                unique_queues.append(queue)
                devices.append(queue.device)

        # Enqueue one marker per device; the wall-clock duration of this
        # loop bounds the relative timing error between the devices.
        starts = {}
        start = time.time()
        for i in range(len(devices)):
            starts[devices[i]] = cl.enqueue_marker(unique_queues[i])
        d_t = (time.time() - start) * q.s  # NOTE(review): `q` looks like the `quantities` package -- confirm

        cl.wait_for_events(list(starts.values()))

        # Replace each marker event by its QUEUED profiling timestamp,
        # which serves as that device's zero time.
        for device in starts:
            starts[device] = starts[device].profile.queued

        # Write the zero time for every device into the profiling file.
        self._profile_file.write("# device\tinitial_time\n")
        for device in starts:
            self._cldevices[device] = self._cldevice_next()
            self._profile_file.write("%d\t%d\n" %
                                     (self._cldevices[device], starts[device]))
        self._profile_file.write("# END_INIT_T0\n")
        self._profile_file.write("# Relative device timing error\n%g\n" %
                                 d_t.rescale(q.ns))
        self._profile_file.write("# END_INIT\n")
Пример #4
0
    def check_element_prop_threshold(self,
                                     element_property,
                                     threshold,
                                     refine_flags,
                                     debug,
                                     wait_for=None):
        """Flag panels whose *element_property* exceeds *threshold*.

        :arg element_property: per-element data, flattened if needed.
        :arg refine_flags: device array of refine flags; the kernel may
            update it (debug path compares its sum before and after).
        :arg debug: if *True*, log how many new panels were flagged.
        :arg wait_for: optional list of events the kernel waits for.
        :returns: ``True`` iff every entry of the kernel's
            ``refine_flags_updated`` output equals 1.
        """
        knl = self.code_container.element_prop_threshold_checker()

        if debug:
            # Snapshot the flag count so the delta can be logged below.
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        from pytential.utils import flatten_if_needed
        element_property = flatten_if_needed(self.array_context,
                                             element_property)

        evt, out = knl(self.queue,
                       element_property=element_property,
                       refine_flags=refine_flags,
                       refine_flags_updated=np.array(0),
                       threshold=np.array(threshold),
                       wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return (out["refine_flags_updated"] == 1).all()
Пример #5
0
    def _write_time_shift(self, queues):
        """Estimate the time shift between devices with respect to a
        global clock. This is important for evaluating relative device
        runtimes with respect to each other.
        """
        # Get only a single command queue for a device on which we will
        # determine the zero time of a device.
        unique_queues = []
        devices = []
        for queue in queues:
            if queue.device not in devices:
                unique_queues.append(queue)
                devices.append(queue.device)

        # Enqueue one marker per device; the wall-clock duration of this
        # loop bounds the relative timing error between the devices.
        starts = {}
        start = time.time()
        for i in range(len(devices)):
            starts[devices[i]] = cl.enqueue_marker(unique_queues[i])
        d_t = (time.time() - start) * q.s  # NOTE(review): `q` looks like the `quantities` package -- confirm

        # FIX: materialize the dict view into a list before handing it to
        # pyopencl -- wait_for_events expects a proper sequence of events,
        # not a dict view.
        cl.wait_for_events(list(starts.values()))

        # Replace each marker event by its QUEUED profiling timestamp,
        # which serves as that device's zero time.
        for device in starts:
            starts[device] = starts[device].profile.queued

        # Write the zero time for every device into the profiling file.
        self._profile_file.write("# device\tinitial_time\n")
        for device in starts:
            self._cldevices[device] = self._cldevice_next()
            self._profile_file.write("%d\t%d\n" % (self._cldevices[device],
                                                   starts[device]))
        self._profile_file.write("# END_INIT_T0\n")
        self._profile_file.write("# Relative device timing error\n%g\n" %
                                 d_t.rescale(q.ns))
        self._profile_file.write("# END_INIT\n")
Пример #6
0
def test_area_query_elwise(ctx_getter, dims, do_plot=False):
    """Smoke-test the elementwise area-query template: build a tree from
    random particles, run an area query around random fixed-radius balls,
    and wait for the kernel to finish (no result assertions).
    """
    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    nparticles = 10**5
    dtype = np.float64

    particles = make_normal_particle_array(queue, nparticles, dims, dtype)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree, _ = tb(queue, particles, max_particles_in_box=30, debug=True)

    # Query balls: random centers, uniform radius 0.1.
    nballs = 10**4
    ball_centers = make_normal_particle_array(queue, nballs, dims, dtype)
    ball_radii = cl.array.empty(queue, nballs, dtype).fill(0.1)

    from boxtree.area_query import (
        AreaQueryElementwiseTemplate, PeerListFinder)

    # Template with per-ball radius/center arrays as extra kernel args;
    # leaf_found_op is empty, so only the traversal itself is exercised.
    template = AreaQueryElementwiseTemplate(
        extra_args="""
            coord_t *ball_radii,
            %for ax in AXIS_NAMES[:dimensions]:
                coord_t *ball_${ax},
            %endfor
        """,
        ball_center_and_radius_expr="""
            %for ax in AXIS_NAMES[:dimensions]:
                ${ball_center}.${ax} = ball_${ax}[${i}];
            %endfor
            ${ball_radius} = ball_radii[${i}];
        """,
        leaf_found_op="")

    peer_lists, evt = PeerListFinder(ctx)(queue, tree)

    kernel = template.generate(
        ctx,
        dims,
        tree.coord_dtype,
        tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype,
        tree.nlevels)

    # One work item per ball; gated on peer-list construction.
    evt = kernel(
        *template.unwrap_args(
            tree, peer_lists, ball_radii, *ball_centers),
        queue=queue,
        wait_for=[evt],
        range=slice(len(ball_radii)))

    cl.wait_for_events([evt])
Пример #7
0
    def find_peer_lists(self, tree):
        """Build and return the peer lists for *tree*, blocking until the
        device computation has finished."""
        import pyopencl as cl

        finder = self.code_container.peer_list_finder()
        peer_lists, event = finder(self.queue, tree)
        cl.wait_for_events([event])

        return peer_lists
Пример #8
0
    def check_sufficient_source_quadrature_resolution(self,
                                                      stage2_density_discr,
                                                      tree,
                                                      peer_lists,
                                                      refine_flags,
                                                      debug,
                                                      wait_for=None):
        """Run an area query to flag elements whose source quadrature
        resolution is insufficient near QBX centers.

        :arg refine_flags: device array of per-element refine flags; the
            kernel may set entries (debug path compares sums).
        :arg debug: if *True*, log how many new elements were flagged.
        :arg wait_for: optional list of events the kernel waits for.
        :returns: ``True`` if the kernel flagged at least one element.
        """
        actx = self.array_context

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(tree.nlevels,
                                                     MAX_LEVELS_INCREMENT)

        knl = self.code_container.sufficient_source_quadrature_resolution_checker(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, tree.particle_id_dtype,
            max_levels)
        if debug:
            # Snapshot the flag count so the delta can be logged below.
            nelements_to_refine_prev = actx.to_numpy(
                actx.np.sum(refine_flags)).item()

        # Single-entry device flag the kernel sets when it refines anything.
        found_element_to_refine = actx.zeros(1, dtype=np.int32)
        found_element_to_refine.finish()

        # Query radii: per-element source danger zone radii on stage-2.
        from pytential import bind, sym
        dd = sym.as_dofdesc(sym.GRANULARITY_ELEMENT).to_stage2()
        source_danger_zone_radii_by_element = flatten(
            bind(
                stage2_density_discr,
                sym._source_danger_zone_radii(stage2_density_discr.ambient_dim,
                                              dofdesc=dd))(self.array_context),
            self.array_context)
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        evt = knl(*unwrap_args(
            tree, peer_lists, tree.box_to_qbx_center_starts,
            tree.box_to_qbx_center_lists, tree.qbx_element_to_source_starts,
            tree.qbx_user_source_slice.start, tree.qbx_user_center_slice.start,
            tree.sorted_target_ids, source_danger_zone_radii_by_element,
            tree.nqbxelements, refine_flags, found_element_to_refine,
            *tree.sources),
                  range=slice(tree.nqbxsources),
                  queue=actx.queue,
                  wait_for=wait_for)

        import pyopencl as cl
        cl.wait_for_events([evt])

        if debug:
            nelements_to_refine = actx.to_numpy(
                actx.np.sum(refine_flags)).item()
            if nelements_to_refine > nelements_to_refine_prev:
                logger.debug("refiner: found %d element(s) to refine",
                             nelements_to_refine - nelements_to_refine_prev)

        return actx.to_numpy(found_element_to_refine)[0] == 1
Пример #9
0
    def check_expansion_disks_undisturbed_by_sources(self,
            lpot_source, tree, peer_lists,
            expansion_disturbance_tolerance,
            refine_flags,
            debug, wait_for=None):
        """Run an area query around each QBX center and flag panels whose
        expansion disks are disturbed by nearby sources.

        :arg refine_flags: device array of per-panel refine flags; the
            kernel may set entries (debug path compares sums).
        :arg debug: if *True*, log how many new panels were flagged.
        :arg wait_for: optional list of events the kernel waits for.
        :returns: ``True`` if the kernel flagged at least one panel.
        """

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(
                tree.nlevels, MAX_LEVELS_INCREMENT)

        knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        if debug:
            # Snapshot the flag count so the delta can be logged below.
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        # Single-entry device flag the kernel sets when it refines anything.
        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        # Query radii: one expansion radius per center.
        center_danger_zone_radii = lpot_source._expansion_radii("ncenters")

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_panel_to_center_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                center_danger_zone_radii,
                expansion_disturbance_tolerance,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxcenters),
            queue=self.queue,
            wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return found_panel_to_refine.get()[0] == 1
Пример #10
0
    def check_sufficient_source_quadrature_resolution(
            self, lpot_source, tree, peer_lists, refine_flags, debug,
            wait_for=None):
        """Run an area query to flag panels whose source quadrature
        resolution is insufficient near QBX centers.

        :arg refine_flags: device array of per-panel refine flags; the
            kernel may set entries (debug path compares sums).
        :arg debug: if *True*, log how many new panels were flagged.
        :arg wait_for: optional list of events the kernel waits for.
        :returns: ``True`` if the kernel flagged at least one panel.
        """

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(
                tree.nlevels, MAX_LEVELS_INCREMENT)

        knl = self.code_container.sufficient_source_quadrature_resolution_checker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)
        if debug:
            # Snapshot the flag count so the delta can be logged below.
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        # Single-entry device flag the kernel sets when it refines anything.
        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()

        # Query radii: per-element source danger zone radii.
        from pytential import bind, sym
        source_danger_zone_radii_by_panel = bind(lpot_source,
                sym._source_danger_zone_radii(
                    lpot_source.ambient_dim,
                    dofdesc=sym.GRANULARITY_ELEMENT))(self.queue)
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_center_starts,
                tree.box_to_qbx_center_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                source_danger_zone_radii_by_panel,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxsources),
            queue=self.queue,
            wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return found_panel_to_refine.get()[0] == 1
Пример #11
0
 def readReturn(self, idx):
     """Copy the nonce and map results for slot *idx* back to the host
     and return them once both transfers have completed."""
     # Start both device-to-host copies without blocking, then wait for
     # the pair of events together.
     nonce_evt = cl.enqueue_copy(self.queue_,
                                 src=self.ret_[idx],
                                 dest=self.retNonces_,
                                 is_blocking=False)
     map_evt = cl.enqueue_copy(self.queue_,
                               src=self.retMap_[idx],
                               dest=self.retMaps_,
                               is_blocking=False)
     cl.wait_for_events([nonce_evt, map_evt])
     return self.retMaps_, self.retNonces_
Пример #12
0
def test_area_query_elwise(ctx_factory, dims, do_plot=False):
    """Smoke-test the elementwise area-query template: build a tree from
    random particles, run an area query around random fixed-radius balls,
    and wait for the kernel to finish (no result assertions).
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nparticles = 10**5
    dtype = np.float64

    particles = make_normal_particle_array(queue, nparticles, dims, dtype)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree, _ = tb(queue, particles, max_particles_in_box=30, debug=True)

    # Query balls: random centers, uniform radius 0.1.
    nballs = 10**4
    ball_centers = make_normal_particle_array(queue, nballs, dims, dtype)
    ball_radii = cl.array.empty(queue, nballs, dtype).fill(0.1)

    from boxtree.area_query import (AreaQueryElementwiseTemplate,
                                    PeerListFinder)

    # Template with per-ball radius/center arrays as extra kernel args;
    # leaf_found_op is empty, so only the traversal itself is exercised.
    template = AreaQueryElementwiseTemplate(extra_args="""
            coord_t *ball_radii,
            %for ax in AXIS_NAMES[:dimensions]:
                coord_t *ball_${ax},
            %endfor
        """,
                                            ball_center_and_radius_expr="""
            %for ax in AXIS_NAMES[:dimensions]:
                ${ball_center}.${ax} = ball_${ax}[${i}];
            %endfor
            ${ball_radius} = ball_radii[${i}];
        """,
                                            leaf_found_op="")

    peer_lists, evt = PeerListFinder(ctx)(queue, tree)

    kernel = template.generate(ctx, dims, tree.coord_dtype, tree.box_id_dtype,
                               peer_lists.peer_list_starts.dtype, tree.nlevels)

    # One work item per ball; gated on peer-list construction.
    evt = kernel(*template.unwrap_args(tree, peer_lists, ball_radii,
                                       *ball_centers),
                 queue=queue,
                 wait_for=[evt],
                 range=slice(len(ball_radii)))

    cl.wait_for_events([evt])
Пример #13
0
 def run(self):
     """Run in a separate thread and serve the incoming events."""
     # Keep draining the queue until shutdown is requested *and* no work
     # remains, so pending events are still processed after `finish` is set.
     while not self.finish or not self._events.empty():
         try:
             event, kernel = self._events.get(timeout=0.1)
             cl.wait_for_events([event])
             self._process(event, kernel)
             self._events.task_done()
         except Empty:
             # Timed out waiting for work; loop to re-check the finish flag.
             pass
         except Exception as exc:
             # Deliberately broad: a failed event must not kill the
             # serving thread, so log it and continue.
             LOG.error(exc)
     self._profile_file.close()
Пример #14
0
 def run(self):
     """Run in a separate thread and serve the incoming events."""
     # Drain until shutdown is requested *and* the queue is empty, so
     # events enqueued before `finish` was set are still handled.
     while not self.finish or not self._events.empty():
         try:
             event, kernel = self._events.get(timeout=0.1)
             cl.wait_for_events([event])
             self._process(event, kernel)
             self._events.task_done()
         except Empty:
             # Poll timeout expired; loop back to re-check the finish flag.
             pass
         except Exception as exc:
             # Deliberately broad catch: one bad event should not stop
             # the serving thread. Log and carry on.
             LOG.error(exc)
     self._profile_file.close()
Пример #15
0
    def inspect_geo_data(insn, bound_expr, geo_data):
        """Count per-QBX-center neighborhood sizes via an area query.

        Writes results into the closed-over ``sizes``, ``nsources`` and
        ``ncenters`` (via ``nonlocal``) and returns ``False`` so the
        caller skips the actual FMM.
        """
        nonlocal sizes, nsources, ncenters
        tree = geo_data.tree().with_queue(queue)

        from boxtree.area_query import PeerListFinder
        plf = PeerListFinder(queue.context)
        pl, evt = plf(queue, tree)

        # Perform an area query around each QBX center, counting the
        # neighborhood sizes.
        knl = NeighborhoodCounter.generate(
                queue.context,
                tree.dimensions,
                tree.coord_dtype,
                tree.box_id_dtype,
                tree.box_id_dtype,
                tree.nlevels,
                extra_type_aliases=(('particle_id_t', tree.particle_id_dtype),))

        centers = geo_data.centers()
        # Query radius per center: `radius` (closed over) times the
        # expansion radius.
        search_radii = radius * geo_data.expansion_radii().with_queue(queue)

        ncenters = len(search_radii)
        nsources = tree.nsources
        sizes = cl.array.zeros(queue, ncenters, np.int32)

        assert nsources == lpot_source.quad_stage2_density_discr.nnodes

        # Coordinate arrays: sources first, then centers, as the kernel's
        # unwrap_args expects.
        coords = []
        coords.extend(tree.sources)
        coords.extend(centers)

        evt = knl(
                *NeighborhoodCounter.unwrap_args(
                    tree,
                    pl,
                    tree.box_source_starts,
                    tree.box_source_counts_cumul,
                    search_radii,
                    sizes,
                    *coords),
                range=slice(ncenters),
                queue=queue,
                wait_for=[evt])

        cl.wait_for_events([evt])

        return False  # no need to do the actual FMM
Пример #16
0
    def _finish_profile_events(self) -> None:
        """Wait for all outstanding profile events, convert each into a
        per-program :class:`ProfileResult`, and clear the pending list."""
        # Block until every recorded event has completed.
        if self.profile_events:
            cl.wait_for_events([pevt.cl_event for pevt in self.profile_events])

        # Fold each finished event into the per-program result list.
        for pevt in self.profile_events:
            prog = pevt.program
            stats = self._get_kernel_stats(prog, pevt.args_tuple)
            elapsed = pevt.cl_event.profile.end - pevt.cl_event.profile.start

            result = ProfileResult(elapsed, stats.flops, stats.bytes_accessed,
                                   stats.footprint_bytes)

            self.profile_results.setdefault(prog, []).append(result)

        self.profile_events = []
Пример #17
0
def get_interleaved_centers(queue, lpot_source):
    """
    Return an array of shape (dim, ncenters) in which interior centers are placed
    next to corresponding exterior centers.
    """
    knl = get_interleaver_kernel(lpot_source.density_discr.real_dtype)
    interior = get_centers_on_side(lpot_source, -1)
    exterior = get_centers_on_side(lpot_source, +1)

    interleaved = []
    events = []

    # Interleave each coordinate axis on the device, collecting the
    # events so all axes can be waited on at once afterwards.
    for axis_int, axis_ext in zip(interior, exterior):
        dst = cl.array.empty(queue, 2 * len(axis_int), axis_int.dtype)
        evt, _ = knl(queue, src1=axis_int, src2=axis_ext, dst=dst)
        interleaved.append(dst)
        events.append(evt)

    cl.wait_for_events(events)

    return interleaved
Пример #18
0
    def _wait_and_transfer_profile_events(self) -> None:
        """Wait for all pending profile events, record one
        ``SingleCallKernelProfile`` per event keyed by kernel name, and
        clear the pending-event list."""
        # First, wait for completion of all events
        if self.profile_events:
            cl.wait_for_events([pevt.cl_event for pevt in self.profile_events])

        # Then, collect all events and store them
        for t in self.profile_events:
            t_unit = t.translation_unit
            # The recorded unit is either a loopy translation unit or a
            # raw cl.Kernel; the name lives in different places.
            if isinstance(t_unit, lp.TranslationUnit):
                name = t_unit.default_entrypoint.name
            else:
                # It's actually a cl.Kernel
                name = t_unit.function_name

            r = self._get_kernel_stats(t_unit, t.args_tuple)
            time = t.cl_event.profile.end - t.cl_event.profile.start

            new = SingleCallKernelProfile(time, r.flops, r.bytes_accessed,
                                          r.footprint_bytes)

            self.profile_results.setdefault(name, []).append(new)

        self.profile_events = []
Пример #19
0
    def copy_outputs(self, exec_event, output_defs, output_buffers):
        """Copies outputs of a kernel execution to host, returning a list of `numpy.ndarray`s.

        You can obtain the first two arguments, `exec_event` and `output_buffers`,
        by calling the `run_kernel` method.

        Args:
            exec_event (pyopencl.Event): Kernel execution event
            output_defs (list): See documentation for `run_kernel`
            output_buffers (list): List of `pyopencl.Buffer`s containing kernel outputs.
        """
        # Allocate a zeroed host array for each declared output.
        output_arrays = []
        for length, dtype in output_defs:
            output_arrays.append(np.zeros(length, dtype=dtype))

        # Enqueue one device-to-host copy per output, each gated on the
        # kernel having finished, then block until every copy completes.
        copy_events = []
        for host_buf, device_buf in zip(output_arrays, output_buffers):
            copy_events.append(
                cl.enqueue_copy(self.cmd_queue,
                                host_buf,
                                device_buf,
                                wait_for=[exec_event]))
        cl.wait_for_events(copy_events)
        return output_arrays
Пример #20
0
    def check_element_prop_threshold(self, element_property, threshold, refine_flags,
            debug, wait_for=None):
        """Flag panels whose *element_property* exceeds *threshold*.

        :arg refine_flags: device array of refine flags; the kernel may
            update it (debug path compares its sum before and after).
        :arg debug: if *True*, log how many new panels were flagged.
        :arg wait_for: optional list of events the kernel waits for.
        :returns: ``True`` iff every entry of the kernel's
            ``refine_flags_updated`` output equals 1.
        """
        knl = self.code_container.element_prop_threshold_checker()

        if debug:
            # Snapshot the flag count so the delta can be logged below.
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        evt, out = knl(self.queue,
                       element_property=element_property,
                       # lpot_source._coarsest_quad_resolution("npanels")),
                       refine_flags=refine_flags,
                       refine_flags_updated=np.array(0),
                       threshold=np.array(threshold),
                       wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return (out["refine_flags_updated"].get() == 1).all()
Пример #21
0
    def __forward_ocl_vec4_interleaved(self, x, h0, c0, sm=False):
        """LSTM forward pass using the vec4 interleaved OpenCL kernels.

        :arg x: input sequence; assumes shape (seq_len, batch, input_size)
            -- inferred from the x.shape uses below, TODO confirm.
        :arg h0: initial hidden state.
        :arg c0: initial cell state.
        :arg sm: not referenced in this body -- TODO confirm intent.
        :returns: ``((hy[1:], hy[-1:], cy[-1:]), execution_time)`` where
            *execution_time* is the kernel wait time in milliseconds.
        """
        def interleave(matrix):
            # Rearrange the four gate sub-blocks (split column-wise into
            # quarters) so their columns alternate in groups of 4, a
            # layout suited to the vec4 kernel.
            if len(matrix.shape) == 1:
                return np.squeeze(interleave(np.expand_dims(matrix, axis=0)),
                                  axis=0).copy()
            new = np.zeros_like(matrix)
            a, b, c, d = np.hsplit(matrix, 4)
            simd = 4
            rng = np.arange(0, matrix.shape[1], simd)
            new[:, rng] = a
            new[:, rng + 1] = b
            new[:, rng + 2] = c
            new[:, rng + 3] = d
            return new.copy()

        seq_len = x.shape[0]
        batch_size = x.shape[1]
        # Stack input and hidden weights, interleave for the vec4 layout.
        weights = interleave(
            np.concatenate((np.transpose(self.Wi), np.transpose(self.Wh)),
                           0)).T.astype(np.float32)
        ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
        # hy/cy hold seq_len+1 steps; slot 0 is the initial state.
        hy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        cy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        hy[0] = h0
        cy[0] = c0

        platform = cl.get_platforms()[0]  # Select the first platform [0]
        device = platform.get_devices()[
            0]  # Select the first device on this platform [0]
        context = cl.Context([device])  # Create a context with your device
        queue = cl.CommandQueue(
            context)  # Create a command queue with your context

        # Allocate on device
        x_gpu = cl.Buffer(context,
                          cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=x.copy('C'))  # cuda_alloc(x)
        weights_gpu = cl.Buffer(
            context, cl.mem_flags.COPY_HOST_PTR,
            hostbuf=weights.copy('C'))  # cuda_alloc(weights)
        bias_gpu = cl.Buffer(context,
                             cl.mem_flags.COPY_HOST_PTR,
                             hostbuf=interleave(
                                 self.B).copy('C'))  # cuda_alloc(self.B)
        ifcos_gpu = cl.Buffer(context,
                              cl.mem_flags.READ_WRITE
                              | cl.mem_flags.COPY_HOST_PTR,
                              hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
        hy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=hy.copy('C'))  # cuda_alloc(hy)
        cy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=cy.copy('C'))  # cuda_alloc(cy)

        # Load and build the kernel program from file.
        kernelSource = ''
        kernelFilename = 'lstm_vec4_interleaved.cl'
        with open(kernelFilename, 'r') as file:
            kernelSource = file.read()

        program = cl.Program(context, kernelSource).build()

        # GEMM dimensions: (batch) x (input+hidden) x (4*hidden gates).
        M = np.int32(batch_size)
        K = np.int32(self.input_size + self.hidden_size)
        N = np.int32(4 * self.hidden_size)
        gemm_lws = (8, 8, 1)
        gemm_gws = int(M), int(N), gemm_lws[2]
        eltwise_lws = (8, 8, 1)
        eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
        events = []
        # One GEMM + one elementwise gate kernel per time step, with
        # barriers enforcing the step ordering.
        for i in range(0, seq_len):
            gemm_kernel = program.lstm_gemm
            gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                                 ifcos_gpu, M, K, N, np.int32(self.input_size),
                                 np.int32(self.hidden_size), np.int32(i))
            ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                             gemm_lws)
            events.append(ev1)
            cl.enqueue_barrier(queue)
            eltwise_kernel = program.lstm_eltwise
            eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                    np.int32(self.hidden_size),
                                    np.int32(batch_size), np.int32(i))
            ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                             eltwise_gws, eltwise_lws)
            events.append(ev2)
            cl.enqueue_barrier(queue)

        # NOTE: the timer only measures the wait, not the enqueue above,
        # so enqueue overhead is excluded from execution_time.
        timer_start = datetime.datetime.now()
        cl.wait_for_events(events)
        execution_time = (datetime.datetime.now() -
                          timer_start).total_seconds() * 1000

        cl.enqueue_copy(queue, ifcos, ifcos_gpu)
        cl.enqueue_copy(queue, hy, hy_gpu)
        cl.enqueue_copy(queue, cy, cy_gpu)

        queue.finish()
        # Copy the data for array c back to the host

        results = hy[1:], hy[-1:], cy[-1:]

        return results, execution_time
Пример #22
0
    def __forward_ocl_naive(self, x, h0, c0, acc):
        """LSTM forward pass using the naive OpenCL kernels.

        :arg x: input sequence; assumes shape (seq_len, batch, input_size)
            -- inferred from the x.shape uses below, TODO confirm.
        :arg h0: initial hidden state.
        :arg c0: initial cell state.
        :arg acc: if not ``False``, load the accelerated kernel file
            ('lstm_naive_acc.cl') instead of 'lstm_naive.cl'.
        :returns: ``((hy[1:], hy[-1:], cy[-1:]), execution_time)`` where
            *execution_time* is the kernel wait time in milliseconds.
        """
        seq_len = x.shape[0]
        batch_size = x.shape[1]
        # Stack input and hidden weights into one GEMM operand.
        weights = np.concatenate(
            (np.transpose(self.Wi), np.transpose(self.Wh)),
            0).astype(np.float32)
        ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
        # hy/cy hold seq_len+1 steps; slot 0 is the initial state.
        hy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        cy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        hy[0] = h0
        cy[0] = c0

        platform = cl.get_platforms()[0]  # Select the first platform [0]
        device = platform.get_devices()[
            0]  # Select the first device on this platform [0]
        context = cl.Context([device])  # Create a context with your device
        queue = cl.CommandQueue(
            context)  # Create a command queue with your context

        # Allocate on device
        x_gpu = cl.Buffer(context,
                          cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=x.copy('C'))  # cuda_alloc(x)
        weights_gpu = cl.Buffer(
            context, cl.mem_flags.COPY_HOST_PTR,
            hostbuf=weights.copy('C'))  # cuda_alloc(weights)
        bias_gpu = cl.Buffer(context,
                             cl.mem_flags.COPY_HOST_PTR,
                             hostbuf=self.B.copy('C'))  # cuda_alloc(self.B)
        ifcos_gpu = cl.Buffer(context,
                              cl.mem_flags.READ_WRITE
                              | cl.mem_flags.COPY_HOST_PTR,
                              hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
        hy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=hy.copy('C'))  # cuda_alloc(hy)
        cy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=cy.copy('C'))  # cuda_alloc(cy)

        # Load and build the kernel program from file.
        kernelSource = ''
        kernelFilename = 'lstm_naive.cl' if acc is False else 'lstm_naive_acc.cl'
        with open(kernelFilename, 'r') as file:
            kernelSource = file.read()

        program = cl.Program(context, kernelSource).build()

        # GEMM dimensions: (batch) x (input+hidden) x (4*hidden gates).
        M = np.int32(batch_size)
        K = np.int32(self.input_size + self.hidden_size)
        N = np.int32(4 * self.hidden_size)

        gemm_lws = (1, 1, 1)
        gemm_gws = int(M), int(N), gemm_lws[2]
        eltwise_lws = (1, 1, 1)
        eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
        events = []
        # One GEMM + one elementwise gate kernel per time step, with
        # barriers enforcing the step ordering.
        for i in range(0, seq_len):
            gemm_kernel = program.lstm_gemm
            gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                                 ifcos_gpu, M, K, N, np.int32(self.input_size),
                                 np.int32(self.hidden_size), np.int32(i))
            ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                             gemm_lws)
            events.append(ev1)
            cl.enqueue_barrier(queue)
            eltwise_kernel = program.lstm_eltwise
            eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                    np.int32(self.hidden_size),
                                    np.int32(batch_size), np.int32(i))
            ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                             eltwise_gws, eltwise_lws)
            events.append(ev2)
            cl.enqueue_barrier(queue)

        # NOTE: the timer only measures the wait, not the enqueue above,
        # so enqueue overhead is excluded from execution_time.
        timer_start = datetime.datetime.now()
        cl.wait_for_events(events)
        execution_time = (datetime.datetime.now() -
                          timer_start).total_seconds() * 1000

        cl.enqueue_copy(queue, ifcos, ifcos_gpu)
        cl.enqueue_copy(queue, hy, hy_gpu)
        cl.enqueue_copy(queue, cy, cy_gpu)

        queue.finish()

        results = hy[1:], hy[-1:], cy[-1:]

        return results, execution_time
Пример #23
0
 def find_peer_lists(self, tree):
     """Compute and return the peer lists for *tree*, blocking until the
     device computation has finished."""
     finder = self.code_container.peer_list_finder()
     peer_lists, event = finder(self.queue, tree)
     cl.wait_for_events([event])
     return peer_lists
Пример #24
0
def test_speed(ctx, rng):
    """Benchmark ragged gather-GEMV: nengo_ocl plans vs. optional clBLAS.

    Builds ``k`` random dense matrices and vectors of random sizes, times one
    pass of the nengo_ocl ``plan_ragged_gather_gemv`` program, and -- when
    ``pyopencl_blas`` is importable -- times the same products issued as
    individual clBLAS ``gemv`` calls spread over multiple command queues.

    :arg ctx: a ``pyopencl.Context``.
    :arg rng: numpy-style RNG providing ``randint`` and ``uniform``.
    """
    try:
        import pyopencl_blas
    except ImportError:
        # The clBLAS half of the benchmark is optional.
        pyopencl_blas = None

    # enable_out_of_order = (
    #     cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    # Number of (A, x, y) problem instances; smaller alternatives kept for
    # quick local runs.
    k = 300
    # k = 100
    # k = 32
    # k = 16
    ms = [rng.randint(100, 1000) for i in range(k)]
    ns = [rng.randint(100, 1000) for i in range(k)]
    # ms = [4096 for i in range(k)]
    # ns = [4096 for i in range(k)]

    aa = [
        rng.uniform(-1, 1, size=(m, n)).astype('float32')
        for m, n in zip(ms, ns)
    ]
    xx = [rng.uniform(-1, 1, size=n).astype('float32') for n in ns]
    yy = [rng.uniform(-1, 1, size=m).astype('float32') for m in ms]
    # Identity index maps: plan i gathers A[i] and x[i].
    ajs = [np.int32(i) for i in range(k)]
    xjs = [np.int32(i) for i in range(k)]
    # ajs = [rng.randint(k, size=p) for i in range(k)]
    # xjs = [rng.randint(k, size=p) for i in range(k)]

    # alpha = 0.5
    # beta = 0.1
    alpha = 1.0
    beta = 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    # queue = cl.CommandQueue(ctx, properties=enable_out_of_order)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(queue, alpha, clA, A_js, clX, X_js, beta,
                                   clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()

        def array(a):
            # Upload a host array into a fresh device array on `queue`.
            cla = cl.array.Array(queue, a.shape, a.dtype)
            cla.set(a)
            return cla

        clAs = [array(a) for a in aa]
        clXs = [array(x.ravel()) for x in xx]
        clYs = [array(y.ravel()) for y in yy]

        queues = [cl.CommandQueue(ctx) for _ in range(k)]
        # queues = [cl.CommandQueue(ctx, properties=enable_out_of_order)
        #           for _ in range(k)]

        # Drain pending work before timing the clBLAS pass.
        queue.finish()
        with Timer() as timer:
            if 0:
                # use a single queue
                for A, X, Y in zip(clAs, clXs, clYs):
                    pyopencl_blas.gemv(queue, A, X, Y)
                queue.finish()
            else:
                # use multiple parallel queues
                events = []
                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                    q = queues[i % len(queues)]
                    e = pyopencl_blas.gemv(q, A, X, Y)
                    events.append(e)
                for q in queues:
                    q.flush()
                cl.wait_for_events(events)
        print("clBLAS: %0.3f" % timer.duration)
Пример #25
0
def prefix_sum(cq, scanner, values_buf):
    """Run *scanner*'s prefix-sum kernel over *values_buf* on queue *cq*
    and block until the kernel's event has completed."""
    done = scanner.prefix_sum(cq, values_buf)
    cl.wait_for_events([done])
Пример #26
0
    def mark_targets(self,
                     places,
                     dofdesc,
                     tree,
                     peer_lists,
                     target_status,
                     debug,
                     wait_for=None):
        """Mark targets lying close to a QBX source panel.

        Sets entries of *target_status* to 1 for targets found inside some
        source's "near-source-detection tunnel" (see the TGTMARK comment
        below for the algorithm).

        :arg places: geometry collection with an ``ambient_dim`` attribute,
            usable with :func:`pytential.bind`.
        :arg dofdesc: DOF descriptor identifying the source geometry.
        :arg tree: QBX-extended particle tree over sources and targets.
        :arg peer_lists: peer-box lists matching *tree*.
        :arg target_status: device array of per-target flags, updated
            in place.
        :arg debug: if true, log how many targets were marked.
        :arg wait_for: optional list of events to wait on before starting.
        :returns: a host boolean -- whether any target was found close to
            a panel.
        """
        from pytential import bind, sym
        ambient_dim = places.ambient_dim

        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.target_marker(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, tree.particle_id_dtype,
            max_levels)

        # Single-element device flag the kernel raises to 1 on any hit.
        found_target_close_to_panel = cl.array.zeros(self.queue, 1, np.int32)
        found_target_close_to_panel.finish()

        # Perform a space invader query over the sources.
        source_slice = tree.sorted_target_ids[tree.qbx_user_source_slice]
        sources = [
            axis.with_queue(self.queue)[source_slice] for axis in tree.sources
        ]

        tunnel_radius_by_source = flatten(
            bind(places,
                 sym._close_target_tunnel_radii(ambient_dim, dofdesc=dofdesc))(
                     self.array_context))

        # Target-marking algorithm (TGTMARK):
        #
        # (1) Use a space invader query to tag each leaf box that intersects with the
        # "near-source-detection tunnel" with the distance to the closest source.
        #
        # (2) Do an area query around all targets with the radius resulting
        # from the space invader query, enumerate sources in that vicinity.
        # If a source is found whose distance to the target is less than the
        # source's tunnel radius, mark that target as pending.
        # (or below: mark the source for refinement)

        # Note that this comment is referred to below by "TGTMARK". If you
        # remove this comment or change the algorithm here, make sure that
        # the reference below is still accurate.

        # Trade off for space-invaders vs directly tagging targets in
        # endangered boxes:
        #
        # (-) More complicated
        # (-) More actual work
        # (+) Taking the point of view of the targets could potentially lead to
        # more parallelism, if you think of the targets as unbounded while the
        # sources are fixed (which sort of makes sense, given that the number
        # of targets per box is not bounded).

        box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            sources,
            tunnel_radius_by_source,
            peer_lists,
            wait_for=wait_for)
        wait_for = [evt]

        evt = knl(*unwrap_args(tree, peer_lists, tree.box_to_qbx_source_starts,
                               tree.box_to_qbx_source_lists,
                               tree.qbx_user_source_slice.start,
                               tree.qbx_user_target_slice.start,
                               tree.sorted_target_ids, tunnel_radius_by_source,
                               box_to_search_dist, target_status,
                               found_target_close_to_panel, *tree.sources),
                  range=slice(tree.nqbxtargets),
                  queue=self.queue,
                  wait_for=wait_for)

        if debug:
            target_status.finish()
            # Marked target = 1, 0 otherwise
            marked_target_count = cl.array.sum(target_status).get()
            logger.debug(
                "target association: {}/{} targets marked close to panels".
                format(marked_target_count, tree.nqbxtargets))

        cl.wait_for_events([evt])

        return (found_target_close_to_panel == 1).all().get()
Пример #27
0
    def try_find_centers(self, tree, peer_lists, lpot_source,
                         target_status, target_flags, target_assoc,
                         target_association_tolerance, debug, wait_for=None):
        """Try to associate marked targets with QBX expansion centers.

        For each target, searches for the closest eligible center whose
        (tolerance-scaled) expansion disk covers it and records the result
        in ``target_assoc.target_to_center``.  Per the debug accounting
        below, successfully associated targets end up with
        *target_status* == 2 (previously-marked targets carry 1).

        :arg target_association_tolerance: relative slack applied to each
            center's expansion radius.
        :arg wait_for: optional list of events to wait on before starting.
        :returns: ``None``; results are written into *target_assoc* and
            *target_status* on the device.
        """
        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.center_finder(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        if debug:
            target_status.finish()
            # Baseline count, so associated targets can be counted below.
            marked_target_count = int(cl.array.sum(target_status).get())

        # Perform a space invader query over the centers.
        center_slice = (
                tree.sorted_target_ids[tree.qbx_user_center_slice]
                .with_queue(self.queue))
        centers = [
                axis.with_queue(self.queue)[center_slice] for axis in tree.sources]
        expansion_radii_by_center = \
                lpot_source._expansion_radii("ncenters").with_queue(self.queue)
        expansion_radii_by_center_with_tolerance = \
                expansion_radii_by_center * (1 + target_association_tolerance)

        # Idea:
        #
        # (1) Tag leaf boxes around centers with max distance to usable center.
        # (2) Area query from targets with those radii to find closest eligible
        # center.

        box_to_search_dist, evt = self.code_container.space_invader_query()(
                self.queue,
                tree,
                centers,
                expansion_radii_by_center_with_tolerance,
                peer_lists,
                wait_for=wait_for)
        wait_for = [evt]

        # Per-target running minimum distance, seeded with infinity so any
        # eligible center improves on it.
        min_dist_to_center = cl.array.empty(
                self.queue, tree.nqbxtargets, tree.coord_dtype)
        min_dist_to_center.fill(np.inf)

        wait_for.extend(min_dist_to_center.events)

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_center_starts,
                tree.box_to_qbx_center_lists,
                tree.qbx_user_center_slice.start,
                tree.qbx_user_target_slice.start,
                tree.sorted_target_ids,
                expansion_radii_by_center_with_tolerance,
                box_to_search_dist,
                target_flags,
                target_status,
                target_assoc.target_to_center,
                min_dist_to_center,
                *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

        if debug:
            target_status.finish()
            # Associated target = 2, marked target = 1
            ntargets_associated = (
                int(cl.array.sum(target_status).get()) - marked_target_count)
            assert ntargets_associated >= 0
            logger.debug("target association: {} targets were assigned centers"
                         .format(ntargets_associated))

        cl.wait_for_events([evt])
Пример #28
0
 def event_waiter2(e, key):
     """Block until OpenCL event *e* completes, then record *key* as done
     in the shared ``status`` mapping."""
     e.wait()
     status[key] = True
Пример #29
0
    def mark_panels_for_refinement(self,
                                   places,
                                   dofdesc,
                                   tree,
                                   peer_lists,
                                   target_status,
                                   refine_flags,
                                   debug,
                                   wait_for=None):
        """Flag source panels for refinement after failed target association.

        Runs the failed-target-association refiner kernel, which raises
        entries of *refine_flags* for panels whose sources sit too close to
        still-unassociated targets.

        :arg places: geometry collection with an ``ambient_dim`` attribute,
            usable with :func:`pytential.bind`.
        :arg dofdesc: DOF descriptor identifying the source geometry.
        :arg target_status: per-target association status from earlier stages.
        :arg refine_flags: device array of per-panel flags, updated in place.
        :arg debug: if true, log how many panels were flagged.
        :arg wait_for: optional list of events to wait on before starting.
        :returns: a host boolean -- whether any panel was flagged.
        """
        from pytential import bind, sym
        ambient_dim = places.ambient_dim

        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.refiner_for_failed_target_association(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, tree.particle_id_dtype,
            max_levels)

        # Single-element device flag the kernel raises to 1 on any hit.
        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()

        # Perform a space invader query over the sources.
        source_slice = tree.user_source_ids[tree.qbx_user_source_slice]
        sources = [
            axis.with_queue(self.queue)[source_slice] for axis in tree.sources
        ]

        tunnel_radius_by_source = flatten(
            bind(places,
                 sym._close_target_tunnel_radii(ambient_dim, dofdesc=dofdesc))(
                     self.array_context))

        # see (TGTMARK) above for algorithm.

        box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            sources,
            tunnel_radius_by_source,
            peer_lists,
            wait_for=wait_for)
        wait_for = [evt]

        evt = knl(*unwrap_args(
            tree, peer_lists, tree.box_to_qbx_source_starts,
            tree.box_to_qbx_source_lists, tree.qbx_panel_to_source_starts,
            tree.qbx_user_source_slice.start, tree.qbx_user_target_slice.start,
            tree.nqbxpanels, tree.sorted_target_ids, tunnel_radius_by_source,
            target_status, box_to_search_dist, refine_flags,
            found_panel_to_refine, *tree.sources),
                  range=slice(tree.nqbxtargets),
                  queue=self.queue,
                  wait_for=wait_for)

        if debug:
            refine_flags.finish()
            # Marked panel = 1, 0 otherwise
            marked_panel_count = cl.array.sum(refine_flags).get()
            logger.debug(
                "target association: {} panels flagged for refinement".format(
                    marked_panel_count))

        cl.wait_for_events([evt])

        return (found_panel_to_refine == 1).all().get()
def gpu_generate_next_flock(step, queue, intermediary_events, kernels,
                            gpu_params, buffers, flocks, global_map):
    """
    Does one iteration of the computation. To be called in the increasing
    continuous order of the integer "step" argument.

    :arg step: iteration index; also selects the region of the generated-
        flocks device buffer to read back and the per-step map snapshot slot.
    :arg queue: the pyopencl CommandQueue all work is enqueued on.
    :arg intermediary_events: list of events the preprocessing kernels wait
        on; the host->device transfer events enqueued here are appended.
    :arg kernels: dict of prebuilt kernels, keyed by name.
    :arg gpu_params: dict with "preferred_multiple" and "max_work_group_size".
    :arg buffers: dict of device buffers, keyed by name.
    :arg flocks: host list the newly generated agents.Flock is appended to.
    :arg global_map: per-step host map arrays; written only when
        cfg.track_map_changes is enabled.
    """
    # Prepare host-side memory for the generated flock
    new_flock = agents.Flock(cfg.NumberOfBoids)
    new_flock.init_empty_array()

    events = {}

    # Transfer the iteration number
    iteration = np.uint16(step)
    intermediary_events.append(
        cl.enqueue_copy(queue, buffers["global_iteration"], iteration))

    # Transfer the random number (an angle in [0, 2*pi))
    random = np.float32(np.random.rand(1) * 2 * np.pi)
    intermediary_events.append(
        cl.enqueue_copy(queue, buffers["global_random"], random))

    # -------------------------------------------------------------------------

    # Global sizes are rounded up to a multiple of the work-group size so the
    # NDRange divides evenly.
    # Example workgroup/workitem pair: 7 groups of 64 items (400 boids)
    events["k_agent_reynolds_rules13_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_reynolds_rules13_preprocess"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"]) *
             gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],), global_work_offset=None,
        wait_for=intermediary_events)

    # Example workgroup/workitem pair: 625 groups of 256 items (400x400 boids)
    events["k_agent_reynolds_rule2_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_reynolds_rule2_preprocess"],
        (int(np.ceil(np.square(cfg.NumberOfBoids) / gpu_params["max_work_group_size"]) *
             gpu_params["max_work_group_size"]),),
        (gpu_params["max_work_group_size"],), global_work_offset=None,
        wait_for=intermediary_events)

    # The AI/simulation kernel depends on both preprocessing passes.
    # Example workgroup/workitem pair: 7 groups of 64 items (400 boids)
    events["k_agent_ai_and_sim"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_ai_and_sim"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"]) *
             gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],), global_work_offset=None,
        wait_for=[events["k_agent_reynolds_rules13_preprocess"],
                  events["k_agent_reynolds_rule2_preprocess"]])

    # transfer device -> host -------------------------------------------------
    # Second parameter size defines transfer size
    events["transfer_flocks"] = cl.enqueue_copy(
        queue, new_flock.np_arrays, buffers["global_generated_flocks"],
        device_offset=step * cfg.NumberOfBoids * agents.Boid.arraySize,
        wait_for=[events["k_agent_ai_and_sim"]])
    if cfg.track_map_changes:
        events["transfer_map"] = cl.enqueue_copy(
            queue, global_map[step], buffers["global_map"],
            wait_for=[events["k_agent_ai_and_sim"]])
    else:
        events["transfer_map"] = None

    # BUGFIX: when map tracking is disabled, events["transfer_map"] is None
    # and passing it to cl.wait_for_events() would raise -- wait only on the
    # transfers that were actually enqueued.
    cl.wait_for_events(
        [ev for ev in (events["transfer_flocks"], events["transfer_map"])
         if ev is not None])
    # print(new_flock.np_arrays)
    flocks.append(new_flock)

    # TEST
    if cfg.debug_on:
        global_test = np.zeros(10).astype(np.float32)
        events["transfer_test"] = cl.enqueue_copy(queue, global_test, buffers["global_test"])
        events["transfer_test"].wait()
        s = "%3d " % step
        for value in global_test:
            s += "%9.3f " % value
        print(s)
Пример #31
0
def main(mname):
    """Multi-GPU (one GPU per MPI rank) block-sparse matrix-vector benchmark.

    Loads index/value arrays from the ``.npz`` file *mname*, slices the rows
    belonging to this MPI rank, and runs ``LOOP_COUNT`` iterations of the
    ``matrix_dot_vector`` kernel on the rank's GPU.  Each iteration splits
    the local rows in half across two command queues so host->device copies
    and kernels on one half can overlap with the other.  Pinned
    (ALLOC_HOST_PTR) host buffers are used to enable async transfers.
    Prints the average per-iteration time in milliseconds.
    """

    # Readin the matrix
    data = np.load(mname)
    indptr = data['indptr']
    indices = data['indices']
    M = data['M']

    m = len(indptr) - 1
    # Replicate the matrix data across nSmp sample slots.
    nSmp = 1000
    M = np.tile(M, (1, nSmp, 1, 1))
    v = np.ones((m * 3, nSmp))
    y = np.zeros((m * 3, nSmp))

    # Get the ranks.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print(rank)

    # Slice the data to be processed.
    srow = rank * int(m / size)
    erow = (rank + 1) * int(m / size)
    if rank == size - 1:
        # Last rank absorbs the remainder rows.
        erow = m
    lm = erow - srow

    tM = M[indptr[srow]:indptr[erow]]
    ty = y[srow * 3:erow * 3]
    tIndptr = indptr[srow:erow + 1] - indptr[srow]
    tIndices = indices[indptr[srow]:indptr[erow]]

    platforms = cl.get_platforms()

    devices = platforms[0].get_devices(cl.device_type.GPU)
    ndevices = len(devices)
    if ndevices < size:
        print('GPUs is not enough! Actural size: {}, need: {}'.format(
            ndevices, size))
        return

    # One device per rank; two queues so the two row-halves can overlap.
    device = devices[rank]
    context = cl.Context([device])
    queues = [cl.CommandQueue(context) for i in range(2)]

    # Create the buffers.
    mem_flags = cl.mem_flags
    indptr_buf = cl.Buffer(context,
                           mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=tIndptr)
    indices_buf = cl.Buffer(context,
                            mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                            hostbuf=tIndices)

    # Allocate the OpenCL source and result buffer memory objects on GPU device GMEM.
    matrix_buf = cl.Buffer(context, mem_flags.READ_ONLY, tM.nbytes)
    vector_buf = cl.Buffer(context, mem_flags.READ_ONLY, v.nbytes)
    destination_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, ty.nbytes)

    # Allocate pinned source and result host buffers:
    #   Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
    pinnedM = cl.Buffer(context,
                        mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
                        tM.nbytes)
    pinnedV = cl.Buffer(context,
                        mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
                        v.nbytes)
    pinnedRes = cl.Buffer(context,
                          mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
                          ty.nbytes)

    # Get mapped pointers to pinned input host buffers.
    #   Note:  This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
    map_flags = cl.map_flags
    srcM, _eventSrcM = cl.enqueue_map_buffer(queues[0], pinnedM,
                                             map_flags.WRITE, 0, tM.shape,
                                             tM.dtype)
    srcV, _eventSrcV = cl.enqueue_map_buffer(queues[0], pinnedV,
                                             map_flags.WRITE, 0, v.shape,
                                             v.dtype)
    srcRes, _eventSrcRes = cl.enqueue_map_buffer(queues[0], pinnedRes,
                                                 map_flags.READ, 0, ty.shape,
                                                 ty.dtype)

    srcM[:, :, :, :] = tM
    srcV[:, :] = v

    halfSize = int(lm / 2)

    localWorkSize = 64
    num_compute_units = device.max_compute_units  # assumes all the devices have same number of computes unit.
    globalWorkSize = 8 * num_compute_units * localWorkSize
    print('gpu {} num of computing unites {}'.format(rank, num_compute_units))

    # Read and build the kernel.
    kernelsource = open("multiGPUsTest.cl").read()
    program = cl.Program(context, kernelsource).build()

    start = timer()

    for iloop in range(LOOP_COUNT):

        eventV = cl.enqueue_copy(queues[0], vector_buf, srcV)

        # Queue 0: first half of the rows.
        eventM0 = cl.enqueue_copy(queues[0], matrix_buf,
                                  srcM[:tIndptr[halfSize]])
        # Kernel.
        matrix_dot_vector_kernel_event0 = \
        program.matrix_dot_vector(queues[0], (globalWorkSize,), (localWorkSize,),
                                  np.int64(halfSize), np.int64(nSmp), np.int64(0),
                                  indptr_buf, indices_buf, matrix_buf, vector_buf, destination_buf,
                                  wait_for=[eventV, eventM0])

        # Queue 1: second half.  NOTE(review): the device_offset factor
        # `nSmp * 9 * 8` presumably reflects 3x3 float64 blocks per entry
        # -- confirm against the .cl kernel's data layout.
        eventM1 = cl.enqueue_copy(queues[1],
                                  matrix_buf,
                                  srcM[tIndptr[halfSize]:],
                                  device_offset=tIndptr[halfSize] * nSmp * 9 *
                                  8)
        # Kernel.
        matrix_dot_vector_kernel_event1 = \
        program.matrix_dot_vector(queues[1], (globalWorkSize,), (localWorkSize,),
                                  np.int64(lm), np.int64(nSmp), np.int64(halfSize),
                                  indptr_buf, indices_buf, matrix_buf, vector_buf, destination_buf,
                                  wait_for=[eventV, eventM1])

        ## Step #11. Move the kernel's output data to host memory.
        matrix_dot_vector_copy_event0 = \
        cl.enqueue_copy(queues[0], srcRes[:halfSize*3], destination_buf,
                        is_blocking=False,
                        wait_for=[matrix_dot_vector_kernel_event0])

        matrix_dot_vector_copy_event1 = \
        cl.enqueue_copy(queues[1], srcRes[halfSize*3:], destination_buf,
                        is_blocking=False,
                        wait_for=[matrix_dot_vector_kernel_event1],
                        device_offset=halfSize*3*nSmp*8)

        # matrix_dot_vector_copy_event0.wait()
        cl.wait_for_events(
            [matrix_dot_vector_copy_event0, matrix_dot_vector_copy_event1])

    end = timer()

    print('OK, \t\t\t time: {:10.5f} ms'.format(
        (end - start) / float(LOOP_COUNT) * 1000.0))
Пример #32
0
def wait():
    """Block until every event accumulated in the module-level ``_events``
    list has completed."""
    cl.wait_for_events(_events)
  def pairGroup(group, dirtyElements, pairDistanceLimit=None):
    """Hierarchically pair the closest elements of *group* on the GPU.

    Keeps a pairwise distance matrix on the OpenCL device and repeatedly:
    finds its minimum entry, merges that pair into a nested list
    (``group[i] = [group[i], group[j]]``), and shrinks the matrix --
    until no valid pair remains.  Elements flagged in *dirtyElements*
    never participate; when a found minimum involves a dirty element, the
    pair is marked dirty and crossed out of the matrix instead of merged.

    :arg group: iterable of elements to pair (copied into a list).
    :arg dirtyElements: per-element truthy flags excluding elements
        from pairing.
    :arg pairDistanceLimit: optional cutoff forwarded to
        ``calcDistanceMatrix``.
    :returns: the (partially nested) list of merged groups, with elements
        that started out dirty filtered away.
    """
    group = list(group)
    startedDirty = list(dirtyElements)
    isDirty = array(dirtyElements, dtype=uint8)

    ## Create a distance matrix (woohoo, will fit into memory now :D)
    distanceMatrix, clDistanceMatrix = calcDistanceMatrix(group, doIncludeFunc=lambda i: not isDirty[i], pairDistanceLimit=pairDistanceLimit)
    # Stride stays fixed at the original size; the logical size shrinks as
    # pairs are merged.
    distanceMatrixStride = len(group)
    distanceMatrixSize = len(group)

    # Got all data... pair!
    doPairing = True

    # Initialization of OpenCL buffers
    minPosition = ndarray((2), dtype=int32)
    minPosition[:] = 0
    clMinPosition = cl.Buffer(clContext, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=minPosition)

    clColBuffer =  cl.Buffer(clContext, cl.mem_flags.READ_WRITE, size=distanceMatrixStride * float32().nbytes)
    clRowBuffer =  cl.Buffer(clContext, cl.mem_flags.READ_WRITE, size=distanceMatrixStride * float32().nbytes)
    clDirtVector = cl.Buffer(clContext, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=isDirty)

    # `prevs` always holds the tuple of events the next kernel must wait on.
    prevs = ()
    while doPairing:
      ## Find minimum in distance matrix

      # Python way
      #i_py, j_py = unravel_index(distanceMatrix.argmin(), distanceMatrix.shape)

      # OpenCL way
      # Create minimum vectors
      if len(prevs) > 0:
        cl.wait_for_events(prevs)
      prevs = \
        pairerProg.minCompressCol(clQueue, [distanceMatrixSize], None, int32(distanceMatrixSize), int32(distanceMatrixStride), clDistanceMatrix, clColBuffer),

      # Compress these vectors to matrix offsets
      prevs = \
        pairerProg.reduceMinIndex(clQueue, [1], None, int32(distanceMatrixSize), clColBuffer, int32(0), clMinPosition, wait_for=prevs),
      prevs = \
        pairerProg.reduceMinRowIndex(clQueue, [1], None, int32(distanceMatrixSize), int32(distanceMatrixStride), clDistanceMatrix, clMinPosition, wait_for=prevs),

      # Load indexes into Python memory
      cl.enqueue_copy(clQueue, minPosition, clMinPosition, is_blocking=True, wait_for=prevs)
      i, j = minPosition

      ## Do pairing
      i, j = min(i, j), max(i, j)

      # Track if this is a valid pairing with other bins
      # (i < 0 signals that no minimum was found on the device)
      doPairing = (i >= 0) and (not any([isDirty[x] for x in (i, j)]))

      # !!!Can perhaps merge more points by marking points as dirty if they cannot be matched!!! <- testing right now
      if doPairing:
        ## Adapt group to do the paring
        group[i] = [group[i], group[j]]
        group.pop(j)
        isDirty[i] = False
        isDirty = delete(isDirty, j)
        startedDirty.pop(j)

        ## Matrix update

        # Py way
        #minCol = hstack((distanceMatrix[[i],:].T, distanceMatrix[:,[i]], distanceMatrix[[j],:].T, distanceMatrix[:,[j]])).min(1)
        #distanceMatrix[[i],:] = array([hstack((array([inf] * (i + 1)), minCol[i + 1:]))])
        #distanceMatrix[:,[i]] = array([hstack((minCol[:i], array([inf] * (distanceMatrix.shape[0] - i))))]).T
        #distanceMatrix = delete(delete(distanceMatrix, (j,), 1), (j,), 0)

        # Cl way
        prevs = (pairerProg.minUnifyDistances(clQueue, [distanceMatrixSize], None, int32(i), int32(j), int32(distanceMatrixStride), clDistanceMatrix),)
        prevs = (pairerProg.deleteRowAndCol(clQueue, [distanceMatrixSize], None, int32(j), int32(distanceMatrixSize), int32(distanceMatrixStride), clDistanceMatrix, wait_for=prevs),\
                 pairerProg.deleteBoolVectorElement(clQueue, [1], None, int32(j), int32(distanceMatrixSize), clDirtVector, wait_for=prevs))
        distanceMatrixSize -= 1

        doPairing = distanceMatrixSize > 1
      elif i < 0:
        pass # Nothing to do... nothing to match... quit!
      else:
        ## Mark pair as dirty
        isDirty[i] = True
        isDirty[j] = True

        ## Delete low value from matrix
        # Py way
        # - no py way yet - need to cross out a cross shape in matrix

        # Cl way
        prevs = (pairerProg.crossDirty(clQueue, [2], None, int32(i), int32(distanceMatrixStride), int32(distanceMatrixSize), clDistanceMatrix, clDirtVector, wait_for=prevs),)
        prevs = (pairerProg.crossDirty(clQueue, [2], None, int32(j), int32(distanceMatrixStride), int32(distanceMatrixSize), clDistanceMatrix, clDirtVector, wait_for=prevs),)

        doPairing = sum([1 for d in isDirty if not d]) > 1

      # Debug stuff
      #print "cl:",i, j, " py:",i_py, j_py
      #print distanceMatrix[i,j], distanceMatrix[i_py,j_py]
      #print distanceMatrix.argmin()

      #print "py:"
      #print distanceMatrix

      #distanceMatrixDeb = ndarray((distanceMatrixStride, distanceMatrixStride), dtype=float32)
      #cl.enqueue_copy(clQueue, distanceMatrixDeb, clDistanceMatrix, is_blocking=True, wait_for=prevs)
      #print "cl:"
      #print distanceMatrixDeb[:distanceMatrixSize,:distanceMatrixSize]

      #cl.enqueue_copy(clQueue, isDirty, clDirtVector, is_blocking=True, wait_for=prevs)
      #print "CopDirt",isDirty

    # Drain any still-pending device work before returning.
    cl.wait_for_events(prevs)

    return [g for i, g in enumerate(group) if not startedDirty[i]]
Пример #34
0
def find_offsets(cq, finder, *args):
    """Launch *finder*'s offset-search kernel on queue *cq* with *args*
    and block until the returned event completes."""
    evt = finder.find_offsets(cq, *args)
    cl.wait_for_events([evt])
Пример #35
0
def test_wait_for_events(ctx_factory):
    """Smoke test: cl.wait_for_events accepts a list of marker events."""
    context = ctx_factory()
    cq = cl.CommandQueue(context)
    markers = [cl.enqueue_marker(cq) for _ in range(2)]
    cl.wait_for_events(markers)
Пример #36
0
    def mark_targets(self, tree, peer_lists, lpot_source, target_status,
                     debug, wait_for=None):
        """Mark targets that lie close to QBX source panels.

        Performs a space invader query over the sources, then an
        area-query-based marker kernel. *target_status* is updated in
        place on the device.

        :arg tree: QBX-augmented particle tree.
        :arg peer_lists: peer box lists matching *tree*.
        :arg lpot_source: supplies the per-source "close target tunnel"
            radii via ``_close_target_tunnel_radius``.
        :arg target_status: device array; entries set to 1 for marked
            targets, 0 otherwise.
        :arg debug: if true, log how many targets were marked.
        :arg wait_for: optional list of events to serialize behind.
        :returns: host-side truth value: whether any target was found
            close to a panel.
        """
        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.target_marker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        # Device-side flag, set by the kernel when it marks any target.
        found_target_close_to_panel = cl.array.zeros(self.queue, 1, np.int32)
        found_target_close_to_panel.finish()

        # Perform a space invader query over the sources.
        source_slice = tree.sorted_target_ids[tree.qbx_user_source_slice]
        sources = [
                axis.with_queue(self.queue)[source_slice] for axis in tree.sources]
        tunnel_radius_by_source = (
                lpot_source._close_target_tunnel_radius("nsources")
                .with_queue(self.queue))

        # Target-marking algorithm (TGTMARK):
        #
        # (1) Use a space invader query to tag each leaf box that intersects with the
        # "near-source-detection tunnel" with the distance to the closest source.
        #
        # (2) Do an area query around all targets with the radius resulting
        # from the space invader query, enumerate sources in that vicinity.
        # If a source is found whose distance to the target is less than the
        # source's tunnel radius, mark that target as pending.
        # (or below: mark the source for refinement)

        # Note that this comment is referred to below by "TGTMARK". If you
        # remove this comment or change the algorithm here, make sure that
        # the reference below is still accurate.

        # Trade off for space-invaders vs directly tagging targets in
        # endangered boxes:
        #
        # (-) More complicated
        # (-) More actual work
        # (+) Taking the point of view of the targets could potentially lead to
        # more parallelism, if you think of the targets as unbounded while the
        # sources are fixed (which sort of makes sense, given that the number
        # of targets per box is not bounded).

        box_to_search_dist, evt = self.code_container.space_invader_query()(
                self.queue,
                tree,
                sources,
                tunnel_radius_by_source,
                peer_lists,
                wait_for=wait_for)
        wait_for = [evt]

        # Re-fetched (without binding to a queue) for passing to the kernel.
        tunnel_radius_by_source = lpot_source._close_target_tunnel_radius("nsources")

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_target_slice.start,
                tree.sorted_target_ids,
                tunnel_radius_by_source,
                box_to_search_dist,
                target_status,
                found_target_close_to_panel,
                *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

        if debug:
            target_status.finish()
            # Marked target = 1, 0 otherwise
            marked_target_count = cl.array.sum(target_status).get()
            logger.debug("target association: {}/{} targets marked close to panels"
                         .format(marked_target_count, tree.nqbxtargets))

        cl.wait_for_events([evt])

        return (found_target_close_to_panel == 1).all().get()
Пример #37
0
    def find_centers(self,
                     places,
                     dofdesc,
                     tree,
                     peer_lists,
                     target_status,
                     target_flags,
                     target_assoc,
                     target_association_tolerance,
                     debug,
                     wait_for=None):
        """Assign an eligible QBX expansion center to each marked target.

        Performs a space invader query over the centers, then an
        area-query-based kernel that records, per target, the closest
        eligible center on each side. *target_status* and
        *target_assoc.target_to_center* are updated in place on the
        device.

        :arg places: geometry collection used to bind expansion radii.
        :arg dofdesc: DOF descriptor for the expansion-radii bind.
        :arg tree: QBX-augmented particle tree.
        :arg peer_lists: peer box lists matching *tree*.
        :arg target_status: device array; per the debug accounting below,
            1 = marked, 2 = associated.
        :arg target_flags: per-target flags passed through to the kernel.
        :arg target_assoc: association result structure, updated in place.
        :arg target_association_tolerance: relative slack added to each
            center's expansion radius.
        :arg debug: if true, log how many targets were assigned centers.
        :arg wait_for: optional list of events to serialize behind.
        """
        from pytential import bind, sym
        ambient_dim = places.ambient_dim

        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.center_finder(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, tree.particle_id_dtype,
            max_levels)

        if debug:
            target_status.finish()
            # Snapshot so the post-kernel count can be turned into a delta.
            marked_target_count = int(cl.array.sum(target_status).get())

        # Perform a space invader query over the centers.
        center_slice = (
            tree.sorted_target_ids[tree.qbx_user_center_slice].with_queue(
                self.queue))
        centers = [
            axis.with_queue(self.queue)[center_slice] for axis in tree.sources
        ]
        expansion_radii_by_center = bind(
            places,
            sym.expansion_radii(ambient_dim,
                                granularity=sym.GRANULARITY_CENTER,
                                dofdesc=dofdesc))(self.array_context)
        expansion_radii_by_center_with_tolerance = flatten(
            expansion_radii_by_center * (1 + target_association_tolerance))

        # Idea:
        #
        # (1) Tag leaf boxes around centers with max distance to usable center.
        # (2) Area query from targets with those radii to find closest eligible
        # center in terms of relative distance.

        box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            centers,
            expansion_radii_by_center_with_tolerance,
            peer_lists,
            wait_for=wait_for)
        wait_for = [evt]

        def make_target_field(fill_val, dtype=tree.coord_dtype):
            # Per-target scratch array, pre-filled with *fill_val*; the fill
            # events are chained into wait_for so the kernel sees
            # initialized data.
            arr = cl.array.empty(self.queue, tree.nqbxtargets, dtype)
            arr.fill(fill_val)
            wait_for.extend(arr.events)
            return arr

        target_to_center_plus = make_target_field(-1, np.int32)
        target_to_center_minus = make_target_field(-1, np.int32)

        min_dist_to_center_plus = make_target_field(np.inf)
        min_dist_to_center_minus = make_target_field(np.inf)

        min_rel_dist_to_center_plus = make_target_field(np.inf)
        min_rel_dist_to_center_minus = make_target_field(np.inf)

        evt = knl(*unwrap_args(
            tree, peer_lists, tree.box_to_qbx_center_starts,
            tree.box_to_qbx_center_lists, tree.qbx_user_center_slice.start,
            tree.qbx_user_target_slice.start, tree.sorted_target_ids,
            expansion_radii_by_center_with_tolerance, box_to_search_dist,
            target_flags, target_status, target_assoc.target_to_center,
            target_to_center_plus, target_to_center_minus,
            min_dist_to_center_plus, min_dist_to_center_minus,
            min_rel_dist_to_center_plus, min_rel_dist_to_center_minus,
            *tree.sources),
                  range=slice(tree.nqbxtargets),
                  queue=self.queue,
                  wait_for=wait_for)

        if debug:
            target_status.finish()
            # Associated target = 2, marked target = 1
            ntargets_associated = (int(cl.array.sum(target_status).get()) -
                                   marked_target_count)
            assert ntargets_associated >= 0
            logger.debug(
                "target association: {} targets were assigned centers".format(
                    ntargets_associated))

        cl.wait_for_events([evt])
Пример #38
0
    def mark_panels_for_refinement(self, tree, peer_lists, lpot_source,
                                   target_status, refine_flags, debug,
                                   wait_for=None):
        """Flag source panels for refinement after failed target association.

        Uses the same space-invader-plus-area-query structure as target
        marking (see the TGTMARK comment referenced below); *refine_flags*
        is updated in place on the device.

        :arg tree: QBX-augmented particle tree.
        :arg peer_lists: peer box lists matching *tree*.
        :arg lpot_source: supplies the per-source close-target tunnel radii.
        :arg target_status: device array of per-target association status.
        :arg refine_flags: device array; entries set to 1 for panels to
            refine, 0 otherwise.
        :arg debug: if true, log how many panels were flagged.
        :arg wait_for: optional list of events to serialize behind.
        :returns: host-side truth value: whether any panel was flagged.
        """
        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.refiner_for_failed_target_association(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        # Device-side flag, set by the kernel when it flags any panel.
        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()

        # Perform a space invader query over the sources.
        source_slice = tree.user_source_ids[tree.qbx_user_source_slice]
        sources = [
                axis.with_queue(self.queue)[source_slice] for axis in tree.sources]
        tunnel_radius_by_source = (
                lpot_source._close_target_tunnel_radius("nsources")
                .with_queue(self.queue))

        # See (TGTMARK) above for algorithm.

        box_to_search_dist, evt = self.code_container.space_invader_query()(
                self.queue,
                tree,
                sources,
                tunnel_radius_by_source,
                peer_lists,
                wait_for=wait_for)
        wait_for = [evt]

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_target_slice.start,
                tree.nqbxpanels,
                tree.sorted_target_ids,
                lpot_source._close_target_tunnel_radius("nsources"),
                target_status,
                box_to_search_dist,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

        if debug:
            refine_flags.finish()
            # Marked panel = 1, 0 otherwise
            marked_panel_count = cl.array.sum(refine_flags).get()
            logger.debug("target association: {} panels flagged for refinement"
                         .format(marked_panel_count))

        cl.wait_for_events([evt])

        return (found_panel_to_refine == 1).all().get()
def gpu_generate_next_flock(step, queue, intermediary_events, kernels, gpu_params, buffers, flocks):
    """Run one GPU iteration of the flock simulation and collect the result.

    Must be called with consecutive, increasing values of *step*: the
    device-to-host copies below index into the generated-output buffers
    with a byte offset proportional to *step*.

    :arg step: iteration number; transferred to the device as a uint16.
    :arg queue: command queue used for all kernel launches and copies.
    :arg intermediary_events: events the first kernel waits on; the
        iteration-number copy event is appended to this list.
    :arg kernels: dict of pre-built kernels, keyed by name.
    :arg gpu_params: dict providing "preferred_multiple" and
        "max_work_group_size" for work-group sizing.
    :arg buffers: dict of device buffers (iteration number and outputs).
    :arg flocks: host-side list; the freshly generated flock dict
        ({"flock": ..., "predators": ...}) is appended to it.
    """
    # Prepare memory for the generated flock
    new_flock = {}
    new_flock["flock"] = agents.Flock(cfg.NumberOfBoids)
    new_flock["flock"].np_arrays = np.zeros(
        (cfg.NumberOfBoids, cfg.ARRDIM, cfg.Dimensions), dtype=np.float32)
    new_flock["predators"] = agents.Flock(cfg.NumberOfPredators)
    new_flock["predators"].np_arrays = np.zeros(
        (cfg.NumberOfPredators, cfg.ARRDIM, cfg.Dimensions), dtype=np.float32)

    events = {}

    # Transfer the iteration number
    iteration = np.uint16(step)
    intermediary_events.append(cl.enqueue_copy(queue, buffers["global_iteration"], iteration))

    # 7 groups of 64 items (400 boids)
    events["k_predator_ai_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_predator_ai_preprocess"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"]) * gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],), global_work_offset=None,
        wait_for=intermediary_events)

    # 1 group of 5 items (5 predators)
    events["k_predator_ai"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_predator_ai"],
        (cfg.NumberOfPredators,), (cfg.NumberOfPredators,), global_work_offset=None,
        wait_for=(events["k_predator_ai_preprocess"],))

    # transfer device -> host -------------------------------------------------
    # Second parameter size defines transfer size
    events["transfer_predators"] = cl.enqueue_copy(
        queue, new_flock["predators"].np_arrays, buffers["generated_predators"],
        device_offset=step * cfg.NumberOfPredators * cfg.ARRDIM *
        cfg.Dimensions * np.dtype(np.float32).itemsize,
        wait_for=(events["k_predator_ai"],))
    # -------------------------------------------------------------------------

    # 625 groups of 256 items (400x400 boids)
    events["k_agent_ai_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_ai_preprocess"],
        (int(np.ceil(np.square(cfg.NumberOfBoids) / gpu_params["max_work_group_size"]) * gpu_params[
            "max_work_group_size"]),),
        (gpu_params["max_work_group_size"],), global_work_offset=None,
        wait_for=(events["k_predator_ai"],))
    # 7 groups of 64 items (400 boids)
    events["k_agent_ai"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_ai"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"]) * gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],), global_work_offset=None,
        wait_for=(events["k_agent_ai_preprocess"],))

    # transfer device -> host -------------------------------------------------
    # Second parameter size defines transfer size
    events["transfer_flocks"] = cl.enqueue_copy(
        queue, new_flock["flock"].np_arrays, buffers["generated_flocks"],
        device_offset=step * cfg.NumberOfBoids * cfg.ARRDIM *
        cfg.Dimensions * np.dtype(np.float32).itemsize,
        wait_for=(events["k_agent_ai"],))

    # Block until both result copies have landed in host memory, then
    # publish the new flock.
    cl.wait_for_events([events["transfer_predators"], events["transfer_flocks"]])
    flocks.append(new_flock)
Пример #40
0
 def finish(self):
     """Block until every outstanding event completes, then drop them all."""
     pending = self.events
     cl.wait_for_events(pending)
     del pending[:]
Пример #41
0
# Out-of-order queue: enqueued commands may execute as soon as their
# event dependencies are satisfied. (`context`, `dev`, `num_kernel`,
# `global_work_size` and `local_work_size` are defined earlier in this
# script.)
queue = cl.CommandQueue(
    context,
    dev,
    properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

source = '''
__kernel void hello_world(const int id) {
  printf(\"Hello world from id %d \\n", id);
}
'''

program = cl.Program(context, source)
program.build()
kernel = cl.Kernel(program, "hello_world")

events = []

# Chain the launches: every kernel after the first waits on the previous
# launch's event, forcing serial order despite the out-of-order queue.
for i in range(num_kernel):
    kernel.set_args(np.int32(i))
    if i == 0:
        event = cl.enqueue_nd_range_kernel(queue, kernel, [global_work_size],
                                           [local_work_size])
    else:
        event = cl.enqueue_nd_range_kernel(queue,
                                           kernel, [global_work_size],
                                           [local_work_size],
                                           wait_for=[events[-1]])
    events.append(event)

# Block until every launched kernel has finished.
cl.wait_for_events(events)
Пример #42
0
def test_speed(rng):
    """Benchmark ragged gather-gemv against pyopencl_blas gemv.

    Builds *k* random (m, n) float32 matrices and matching vectors, times
    the nengo_ocl ``plan_ragged_gather_gemv`` plans, and — if
    ``pyopencl_blas`` is importable — times the same gemvs issued across
    multiple parallel command queues. Results are printed, not returned.

    :arg rng: numpy-style RNG providing ``randint`` and ``uniform``.
    """
    try:
        import pyopencl_blas
    except ImportError:
        # BLAS comparison is optional; skip that half of the benchmark.
        pyopencl_blas = None

    # enable_out_of_order = (
    #     cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    k = 300
    # k = 100
    # k = 32
    # k = 16
    ms = [rng.randint(100, 1000) for i in range(k)]
    ns = [rng.randint(100, 1000) for i in range(k)]
    # ms = [4096 for i in range(k)]
    # ns = [4096 for i in range(k)]

    aa = [rng.uniform(-1, 1, size=(m, n)).astype('float32')
          for m, n in zip(ms, ns)]
    xx = [rng.uniform(-1, 1, size=n).astype('float32') for n in ns]
    yy = [rng.uniform(-1, 1, size=m).astype('float32') for m in ms]
    ajs = [np.int32(i) for i in range(k)]
    xjs = [np.int32(i) for i in range(k)]
    # ajs = [rng.randint(k, size=p) for i in range(k)]
    # xjs = [rng.randint(k, size=p) for i in range(k)]

    # alpha = 0.5
    # beta = 0.1
    alpha = 1.0
    beta = 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    # queue = cl.CommandQueue(ctx, properties=enable_out_of_order)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()

        def array(a):
            # Upload a host numpy array into a device cl.array.Array.
            cla = cl.array.Array(queue, a.shape, a.dtype)
            cla.set(a)
            return cla

        clAs = [array(a) for a in aa]
        clXs = [array(x.ravel()) for x in xx]
        clYs = [array(y.ravel()) for y in yy]

        queues = [cl.CommandQueue(ctx) for _ in range(k)]
        # queues = [cl.CommandQueue(ctx, properties=enable_out_of_order)
        #           for _ in range(k)]

        queue.finish()
        with Timer() as timer:
            if 0:
                # use a single queue
                for A, X, Y in zip(clAs, clXs, clYs):
                    pyopencl_blas.gemv(queue, A, X, Y)
                queue.finish()
            else:
                # use multiple parallel queues
                events = []
                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                    q = queues[i % len(queues)]
                    e = pyopencl_blas.gemv(q, A, X, Y)
                    events.append(e)
                for q in queues:
                    q.flush()
                cl.wait_for_events(events)
        print("clBLAS: %0.3f" % timer.duration)
Пример #43
0
        # spMVOverlapping
        matrix_dot_vector_kernel_event0 = \
        program.matrix_dot_vector(queues[0], (globalWorkSize,), (localWorkSize,), np.int64(halfSize), np.int64(nSmp), np.int64(0), indptr_buf, indices_buf, matrix_buf, vector_buf, destination_buf, wait_for=[eventM0])

        eventM1 = cl.enqueue_copy(queues[1],
                                  matrix_buf,
                                  srcM[indptr[halfSize]:],
                                  is_blocking=False,
                                  device_offset=indptr[halfSize] * nSmp * 9 *
                                  8)
        # spMVOverlapping
        matrix_dot_vector_kernel_event1 = \
        program.matrix_dot_vector(queues[1], (globalWorkSize,), (localWorkSize,), np.int64(m), np.int64(nSmp), np.int64(halfSize), indptr_buf, indices_buf, matrix_buf, vector_buf, destination_buf, wait_for=[eventM1])

        ## Step #11. Move the kernel's output data to host memory.
        matrix_dot_vector_copy_event0 = \
        cl.enqueue_copy(queues[0], srcRes[:halfSize*3], destination_buf, is_blocking=False, wait_for=[matrix_dot_vector_kernel_event0])
        matrix_dot_vector_copy_event1 = \
        cl.enqueue_copy(queues[1], srcRes[halfSize*3:], destination_buf, is_blocking=False, wait_for=[matrix_dot_vector_kernel_event1], device_offset=halfSize*3*nSmp*8)

        cl.wait_for_events(
            [matrix_dot_vector_copy_event0, matrix_dot_vector_copy_event1])

    end = timer()

    print('OK, \t\t\t time: {:10.5f} ms'.format(
        (end - start) / float(LOOP_COUNT) * 1000.0))

    # print(srcRes)
    # print(srcM[2:])
Пример #44
0
def test_wait_for_events(ctx_factory):
    """Verify that waiting on a pair of marker events succeeds."""
    cl_ctx = ctx_factory()
    command_queue = cl.CommandQueue(cl_ctx)
    first = cl.enqueue_marker(command_queue)
    second = cl.enqueue_marker(command_queue)
    cl.wait_for_events([first, second])
Пример #45
0
def collide(cq, collider, *args):
    """Enqueue the collision query on *cq* and block until it finishes."""
    collision_event = collider.get_collisions(cq, *args)
    cl.wait_for_events([collision_event])
Пример #46
0
 def elapsed(self):
     # Make sure both profiling events have completed before reading their
     # timestamps (profile info is only valid for finished events).
     cl.wait_for_events([self.start_event, self.stop_event])
     # NOTE(review): both terms read .profile.end, so this measures the
     # span between the two events' *completions* -- confirm that .end
     # (rather than .start) on start_event is intended. Also confirm the
     # division by SECONDS_PER_NANOSECOND (defined in the enclosing
     # module) produces seconds as the name of this method suggests.
     return (
             self.stop_event.profile.end
             - self.start_event.profile.end) / SECONDS_PER_NANOSECOND
Пример #47
0
def nn3d(pts, values,
	minima=np.zeros((3,), dtype=np.float32),
	maxima=np.ones((3,), dtype=np.float32),
	res=np.ones((3,), dtype=np.int32) * 32, ):
	# GPU natural-neighbour-style gridding (Python 2 code: `print`
	# statements and `xrange`).
	#
	# Builds a KD-tree over `pts` (with `values` appended as a 4th float
	# per point), flattens it for the device, and for each node of a
	# res[0] x res[1] x res[2] grid finds the closest input point, then
	# splats that point's value into every grid node within the
	# closest-point radius. Returns (accum, cnt): per-node value sums and
	# hit counts (caller divides accum by cnt to average).
	#
	# NOTE(review): the default arguments are mutable numpy arrays shared
	# across calls; they are only read here, but this is fragile.
	# buildKDTree/flattenKDTree are defined elsewhere in this file.
	
	ctx = cl.create_some_context()
	queue = cl.CommandQueue(ctx)
	mf = cl.mem_flags

	# Append each point's value as a 4th component of its record.
	pts2 = np.hstack((pts, np.reshape(values, (len(values),1))))
	print 'pts2:', pts2
	node = buildKDTree(pts2)
	flat = flattenKDTree(node)
	flat_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = flat)
	
	minima_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = minima)
	maxima_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = maxima)
	res_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = res)
	
	out = np.zeros(res, dtype=np.float32)
	out_g = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf = out)
	
	# values_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = values.astype(np.float32))
	
	accum = np.zeros(res, dtype=np.float32)
	cnt = np.zeros(res, dtype=np.float32)
	
	accum_g = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=accum)
	cnt_g = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=cnt)
	
	# Device code: closestPoint/closestPoint2 walk the flattened KD-tree;
	# the nn3d kernel splats the closest point's value using a
	# compare-and-swap emulation of a float atomic add.
	prg = cl.Program(ctx, """
		void atomic_add_global(volatile __global float *source, const float operand) {
		
			// *source += operand;
			union {
				unsigned int intVal;
				float floatVal;
			} newVal;
			union {
				unsigned int intVal;
				float floatVal;
			} prevVal;

			do {
				prevVal.floatVal = *source;
				newVal.floatVal = prevVal.floatVal + operand;
			} while (atomic_cmpxchg((volatile global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
		}
		
		__global const float* closestPoint2(__global const float *flat_g, int cnt, float *pt) {
			float minDistSq = 1.0 / 0.0;
			int minIdx = -1;
			__global const float *p = flat_g;
		
			for (int i = 0; i < cnt; i++) {
				float x = *p++;
				float y = *p++;
				float z = *p++;
				float val = *p++; // skip
				
				float distSq = pow(x - pt[0], 2) + pow(y - pt[1], 2) + pow(z - pt[2], 2);
				
				if (distSq < minDistSq) {
					minDistSq = distSq;
					minIdx = i;
				}
			}
			
			return flat_g + minIdx * 4;
		}
		
		__global const float* closestPoint(__global const float *flat_g, float *pt) {
			
			__global const float *p = flat_g;
			int ax = (int) p[0];
			float split = p[1];
			int n = (int) p[2];
			int childCnt = (int) p[3];
			int nL = (int) p[4];
			int nR = (int) p[5];
			__global const float *current_best = 0;
			__global const float *new_best = 0;
			float best_dist;
			float dist;
			int crossBorder = 0;
			
			p += 6;
			
			for (;;) {
				if (pt[ax] <= split || crossBorder == -1) { // left
					if (nL >= 0) {
						new_best = closestPoint2(p, nL, pt);
					} else {
						new_best = closestPoint(p, pt);
					}
					
					dist = sqrt(pow(new_best[0] - pt[0], 2) +
						pow(new_best[1] - pt[1], 2) +
						pow(new_best[2] - pt[2], 2));
						
					if (current_best == 0 || dist < best_dist) {
						current_best = new_best;
						best_dist = dist;
					}
						
					if (dist + pt[ax] > split && crossBorder == 0) {
						crossBorder = 1;
					} else {
						return current_best;
					}
				}
				
				if (pt[ax] > split || crossBorder == 1) { // right
					if (nL >= 0) {
						p += nL * 4;
					} else {
						nL = (int) p[2];
						int childCntL = (int) p[3];
						
						p += childCntL * 6 + nL * 4;
					}
					
					if (nR >= 0) {
						new_best = closestPoint2(p, nR, pt);
					} else {
						new_best = closestPoint(p, pt);
					}
					
					dist = sqrt(pow(new_best[0] - pt[0], 2) +
						pow(new_best[1] - pt[1], 2) +
						pow(new_best[2] - pt[2], 2));
						
					if (current_best == 0 || dist < best_dist) {
						current_best = new_best;
						best_dist = dist;
					}
						
					if (pt[ax] - dist <= split && crossBorder == 0) {
						crossBorder = -1;
						p = flat_g + 6;
					} else {
						return current_best;
					}
				}
			}
		}
	
		__kernel void nn3d(__global const float *flat_g,
			__global const float *minima_g,
			__global const float *maxima_g,
			__global const int *res,
			__global float *out_g,
			__global float *accum_g,
			__global float *cnt_g,
			int ofs_x,
			int ofs_y) {
			
			float span[3] = {maxima_g[0] - minima_g[0],
				maxima_g[1] - minima_g[1],
				maxima_g[2] - minima_g[2]};
			
			int x = ofs_x + get_global_id(0);
			int y = ofs_y + get_global_id(1);
			int z = get_global_id(2);
			
			int ofs = (z * res[1] + y) * res[0] + x;
			// *(out_g + ofs) = (float)(x + y + z);
			
			// return;
			
			float pt[3];
			pt[0] = minima_g[0] + (maxima_g[0] - minima_g[0]) * x / ((float) res[0] - 1.0f);
			pt[1] = minima_g[1] + (maxima_g[1] - minima_g[1]) * y / ((float) res[1] - 1.0f);
			pt[2] = minima_g[2] + (maxima_g[2] - minima_g[2]) * z / ((float) res[2] - 1.0f);
			
			__global const float *closest = closestPoint(flat_g, pt);
			
			// *(out_g + ofs) = closest[3];
			
			// return;
			
			float radiusSq = pow(closest[0] - pt[0], 2) +
				pow(closest[1] - pt[1], 2) +
				pow(closest[2] - pt[2], 2);
			float radius = sqrt(radiusSq);
			float val = closest[3];
			
			int radius_x = ceil(radius * res[0] / (maxima_g[0] - minima_g[0]));
			int radius_y = ceil(radius * res[1] / (maxima_g[1] - minima_g[1]));
			int radius_z = ceil(radius * res[2] / (maxima_g[2] - minima_g[2]));
			
			for (int x2 = max(0, x - radius_x); x2 <= min(res[0] - 1, x + radius_x); x2++) {
				float x3 = minima_g[0] + span[0] * x2 / ((float) res[0] - 1.0f);
				float dxSq = pow(x3 - pt[0], 2);
				
				for (int y2 = max(0, y - radius_y); y2 <= min(res[1] - 1, y + radius_y); y2++) {
					float y3 = minima_g[1] + span[1] * y2 / ((float) res[1] - 1.0f);
					float dySq = pow(y3 - pt[1], 2);
					
					for (int z2 = max(0, z - radius_z); z2 <= min(res[2] - 1, z + radius_z); z2++) {
						float z3 = minima_g[2] + span[2] * z2 / ((float) res[2] - 1.0f);
						float dzSq = pow(z3 - pt[2], 2);
						
						float distSq = dxSq + dySq + dzSq;
						
						if (distSq <= radiusSq) {
							int ofs = (z2 * res[1] + y2) * res[0] + x2;
							
							atomic_add_global(accum_g + ofs, val);
							atomic_add_global(cnt_g + ofs, 1.0f);
						}
					}
				}
			}
		}
		""").build()
		
	# One launch per (x, y) grid position, covering the full z column;
	# waiting on each event keeps a single launch in flight at a time.
	for x in xrange(res[0]):
		for y in xrange(res[1]):
			print 'x:', x, 'y:', y
		
			ev = prg.nn3d(queue, [1, 1, out.shape[2]], None, flat_g, minima_g, maxima_g,
				res_g, out_g, accum_g, cnt_g, np.int32(x), np.int32(y))
			print 'ev:', ev
			# cl.enqueue_barrier(queue, wait_for=[ev])
			# ev.wait()
			cl.wait_for_events([ev])
			
		
	# Blocking copies of the accumulated sums and counts back to the host.
	cl.enqueue_copy(queue, accum, accum_g)
	cl.enqueue_copy(queue, cnt, cnt_g)
	return (accum, cnt)

	# return out
Пример #48
0
 def event_waiter2(e, key):
     # Block until event *e* completes, then record completion under *key*
     # in `status` (a container from the enclosing scope, not visible here).
     cl.wait_for_events([e])
     status[key] = True
Пример #49
0
def radix_sort(cq, sorter, *args):
    """Enqueue the sorter's sort kernel on *cq* and block until it completes."""
    sort_event = sorter.sort(cq, *args)
    cl.wait_for_events([sort_event])