Example #1
    def run_aes32(self, str1, lens, crypted_data, printspeed=False):

        mf = cl.mem_flags  # opencl memflag enum

        npstr1 = np.array(str1, dtype=np.void)
        str1_g = cl.Buffer(self.ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=npstr1)

        strlen = np.int32(len(str1[0]))

        result = np.empty(npstr1.shape, dtype=np.dtype('V' + str(lens)))
        result_s = cl.Buffer(
            self.ctx, mf.WRITE_ONLY, result.nbytes
        )  # buffer size is given in bytes

        start_event = cl.enqueue_marker(self.queue)

        finish_event = self.prg.substr(self.queue, npstr1.shape, None, str1_g,
                                       result_s, strlen)
        finish_event.wait()

        #end substr start aes
        mf = cl.mem_flags  # opencl memflag enum
        nAESblocks = np.int32(len(crypted_data) // 16)  # number of 16-byte AES blocks

        crypted_data = [crypted_data]  #*len(hashkey)
        cryp = np.array(crypted_data, dtype=np.void)
        data_g = cl.Buffer(self.ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=cryp)

        #result = np.zeros(len(hashkey), dtype=pyopencl.cltypes.long)# np.zeros(numberOftheElement,elementType) name should be sync with the "result variable" in OpenCL code
        result = np.empty(npstr1.shape, dtype=cryp.dtype)
        result_g = cl.Buffer(
            self.ctx, mf.WRITE_ONLY, result.nbytes
        )  # buffer size is given in bytes
        gdim = (npstr1.shape[0] * nAESblocks, )
        start_event = cl.enqueue_marker(self.queue)

        finish_event = self.prg.func_pbkdf2(self.queue, gdim, None, result_s,
                                            data_g, result_g, nAESblocks)
        finish_event.wait()
        #****************** End Kernel Call ****************** - with the local size set to None, the runtime picks the work-group size automatically

        #		if(printspeed):
        #			print("OpenCL Speed: "+str(password_step/1e-9/(finish_event.profile.END-start_event.profile.START)/1000)+" K passphrase/s")
        rt = cl.enqueue_copy(
            self.queue, result, result_g
        )  # copy the result from device back to the host

        rbs = [rs.tobytes() for rs in result]

        return rbs
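
The start_event / finish_event pair above only yields numbers if the queue was created with profiling enabled, which is what the commented-out printspeed branch would rely on. A minimal, self-contained sketch of that timing pattern follows; the context, the double_it kernel and the array size are illustrative assumptions, not parts of the example above.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

a = np.arange(1 << 20, dtype=np.float32)
mf = cl.mem_flags
a_g = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    __kernel void double_it(__global float *a)
    { a[get_global_id(0)] *= 2.0f; }
    """).build()

start_event = cl.enqueue_marker(queue)          # timestamped on the device
finish_event = prg.double_it(queue, a.shape, None, a_g)
finish_event.wait()

# profile.* is only valid because the queue was created with PROFILING_ENABLE
elapsed = 1e-9 * (finish_event.profile.END - start_event.profile.START)
print("device time: %.6f s" % elapsed)
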
Example #2
def test_enqueue_barrier_marker(ctx_factory):
    ctx = ctx_factory()
    _skip_if_pocl(ctx.devices[0].platform, 'pocl crashes on enqueue_barrier')
    queue = cl.CommandQueue(ctx)
    cl.enqueue_barrier(queue)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue, wait_for=[evt1])
    cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
Example #3
def test_enqueue_barrier_marker(ctx_factory):
    ctx = ctx_factory()
    _skip_if_pocl(ctx.devices[0].platform, 'pocl crashes on enqueue_barrier')
    queue = cl.CommandQueue(ctx)
    cl.enqueue_barrier(queue)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue, wait_for=[evt1])
    cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
Example #4
    def wrapper(self, insn, profile_data):
        if profile_data is None:
            return f(self, insn, profile_data)

        start = cl.enqueue_marker(self.array_context.queue)
        retval = f(self, insn, profile_data)
        end = cl.enqueue_marker(self.array_context.queue)
        profile_data\
                .setdefault(time_field_name, TimingFutureList())\
                .append(TimingFuture(start, end))

        return retval
Example #5
    def map_profiled_operator_binding(self, expr, profile_data):
        if profile_data is None:
            return self.inner_mapper.map_operator_binding(expr)

        start = cl.enqueue_marker(self.array_context.queue)
        retval = self.inner_mapper.map_operator_binding(expr)
        end = cl.enqueue_marker(self.array_context.queue)
        time_field_name = "time_op_%s" % expr.op.mapper_method
        profile_data\
                .setdefault(time_field_name, TimingFutureList())\
                .append(TimingFuture(start, end))

        return retval
Example #6
    def run_eqs(self, str1, str2, printspeed=False):
        mf = cl.mem_flags  # opencl memflag enum

        npstr1 = np.array(str1, dtype=np.void)
        str1_g = cl.Buffer(self.ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=npstr1)

        npstr2 = np.array(str2, dtype=np.void)
        str2_g = cl.Buffer(self.ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=npstr2)

        result = np.empty(npstr1.shape, dtype=[('a', 'V64'), ('b', np.bool_)])  # np.bool_ (np.bool was removed from NumPy)
        result_g = cl.Buffer(
            self.ctx, mf.WRITE_ONLY, result.nbytes
        )  # buffer size is given in bytes

        #******************Call Kernel******************
        start_event = cl.enqueue_marker(self.queue)

        finish_event = self.prg.concat_str(self.queue, npstr1.shape, None,
                                           str1_g, str2_g, result_g)
        finish_event.wait()
        rt = cl.enqueue_copy(
            self.queue, result, result_g
        )  # copy the result from device back to the host

        rs = np.where(result == False)

        return rs
Example #7
    def _write_time_shift(self, queues):
        """Estimate the time shift between devices with respect to a
        global clock. This is important for evaluating relative device
        runtimes with respect to each other.
        """
        # Get only a single command queue for a device on which we will
        # determine the zero time of a device.
        unique_queues = []
        devices = []
        for queue in queues:
            if queue.device not in devices:
                unique_queues.append(queue)
                devices.append(queue.device)

        starts = {}
        start = time.time()
        for i in range(len(devices)):
            starts[devices[i]] = cl.enqueue_marker(unique_queues[i])
        d_t = (time.time() - start) * q.s

        cl.wait_for_events(list(starts.values()))

        for device in starts:
            starts[device] = starts[device].profile.queued

        # Write the zero time for every device into the profiling file.
        self._profile_file.write("# device\tinitial_time\n")
        for device in starts:
            self._cldevices[device] = self._cldevice_next()
            self._profile_file.write("%d\t%d\n" %
                                     (self._cldevices[device], starts[device]))
        self._profile_file.write("# END_INIT_T0\n")
        self._profile_file.write("# Relative device timing error\n%g\n" %
                                 d_t.rescale(q.ns))
        self._profile_file.write("# END_INIT\n")
Example #8
    def run(self, password_start, password_step, printspeed=True):

        pwdim = (password_step,)  # 1-D global size: launch password_step work-items

        mf = cl.mem_flags  # opencl memflag enum

        pass_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.array(password_start, dtype=np.uint64))
        salt_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.pbkdf_salt)
        iv_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.aes_iv)
        data_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.encrypted_data)

        result = np.zeros(password_step, dtype=np.bool_)  # one flag per candidate; must match the result buffer in the OpenCL kernel
        result_g = cl.Buffer(self.ctx, mf.WRITE_ONLY, result.nbytes)  # buffer size is given in bytes

        # The total GPU time is the span between start_event and finish_event.
        #******************Call Kernel******************
        start_event = cl.enqueue_marker(self.queue)
        finish_event = self.prg.func_pbkdf2(self.queue, pwdim, (512,), pass_g, salt_g, iv_g, data_g, result_g)
        finish_event.wait()
        #****************** End Kernel Call ****************** - if the local size (512,) were set to None, the runtime would pick the work-group size automatically

        if printspeed:
            print("OpenCL Speed: " + str(password_step / 1e-9 / (finish_event.profile.END - start_event.profile.START) / 1000) + " K passphrase/s")

        cl.enqueue_copy(self.queue, result, result_g)  # copy the result from device back to the host

        return np.transpose(np.nonzero(result)) + password_start  # indices of the nonzero flags, offset by password_start
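
For reference, a small illustration (with made-up values) of what the return expression produces: np.nonzero yields the positions of the True flags, np.transpose turns them into a column of indices, and adding password_start shifts them back to absolute candidate numbers.

import numpy as np

password_start = 1000
result = np.array([False, True, False, True], dtype=np.bool_)
print(np.transpose(np.nonzero(result)) + password_start)
# [[1001]
#  [1003]]
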
Example #9
def test_enqueue_barrier_marker(ctx_factory):
    ctx = ctx_factory()
    # Still relevant on pocl 1.0RC1.
    _xfail_if_pocl(
            ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier")

    queue = cl.CommandQueue(ctx)

    if queue._get_cl_version() >= (1, 2) and cl.get_cl_header_version() <= (1, 1):
        pytest.skip("CL impl version >= 1.2, header version <= 1.1--cannot be sure "
                "that clEnqueueWaitForEvents is implemented")

    cl.enqueue_barrier(queue)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue, wait_for=[evt1])
    cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
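
The skip above guards the pre-1.2 fallback path: a marker with a wait list maps to clEnqueueMarkerWithWaitList on OpenCL 1.2, while older headers push PyOpenCL onto the deprecated clEnqueueWaitForEvents route the comment mentions. A minimal sketch of the pattern itself, assuming a 1.2-capable platform; the buffers and sizes are illustrative.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host_a = np.zeros(1024, dtype=np.float32)
host_b = np.zeros(1024, dtype=np.float32)
buf_a = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, host_a.nbytes)
buf_b = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, host_b.nbytes)

evt_a = cl.enqueue_copy(queue, buf_a, host_a)
evt_b = cl.enqueue_copy(queue, buf_b, host_b)

# A single marker that completes only after both copies have completed.
done = cl.enqueue_marker(queue, wait_for=[evt_a, evt_b])
done.wait()
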
Example #10
    def run_substr(self, str1, lens, printspeed=False):
        mf = cl.mem_flags  # opencl memflag enum

        npstr1 = np.array(str1, dtype=np.void)
        str1_g = cl.Buffer(self.ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=npstr1)

        strlen = np.int32(len(str1[0]))

        result = np.empty(npstr1.shape, dtype=np.dtype('V' + str(lens)))
        result_g = cl.Buffer(
            self.ctx, mf.WRITE_ONLY, result.nbytes
        )  # buffer size is given in bytes

        #******************Call Kernel******************
        start_event = cl.enqueue_marker(self.queue)

        finish_event = self.prg.concat_str(self.queue, npstr1.shape, None,
                                           str1_g, result_g, strlen)
        finish_event.wait()
        rt = cl.enqueue_copy(
            self.queue, result, result_g
        )  # copy the result from device back to the host

        rbs = [rs.tobytes() for rs in result]

        return rbs
Example #11
File: profiling.py  Project: ufo-kit/syris
    def _write_time_shift(self, queues):
        """Estimate the time shift between devices with respect to a
        global clock. This is important for evaluating relative device
        runtimes with respect to each other.
        """
        # Get only a single command queue for a device on which we will
        # determine the zero time of a device.
        unique_queues = []
        devices = []
        for queue in queues:
            if queue.device not in devices:
                unique_queues.append(queue)
                devices.append(queue.device)

        starts = {}
        start = time.time()
        for i in range(len(devices)):
            starts[devices[i]] = cl.enqueue_marker(unique_queues[i])
        d_t = (time.time() - start) * q.s

        cl.wait_for_events(list(starts.values()))

        for device in starts:
            starts[device] = starts[device].profile.queued

        # Write the zero time for every device into the profiling file.
        self._profile_file.write("# device\tinitial_time\n")
        for device in starts:
            self._cldevices[device] = self._cldevice_next()
            self._profile_file.write("%d\t%d\n" % (self._cldevices[device],
                                                   starts[device]))
        self._profile_file.write("# END_INIT_T0\n")
        self._profile_file.write("# Relative device timing error\n%g\n" %
                                 d_t.rescale(q.ns))
        self._profile_file.write("# END_INIT\n")
Example #12
def _enqueue_barrier(queue, wait_for):
    if queue.device.platform.name == "Portable Computing Language":
        # pocl 0.13 and below crash on clEnqueueBarrierWithWaitList
        evt = cl.enqueue_marker(queue, wait_for=wait_for)
        queue.finish()
        return evt
    else:
        return cl.enqueue_barrier(queue, wait_for=wait_for)
Example #13
def _enqueue_barrier(queue, wait_for):
    if queue.device.platform.name == "Portable Computing Language":
        # pocl 0.13 and below crash on clEnqueueBarrierWithWaitList
        evt = cl.enqueue_marker(queue, wait_for=wait_for)
        queue.finish()
        return evt
    else:
        return cl.enqueue_barrier(queue, wait_for=wait_for)
Example #14
def test_int_ptr(ctx_factory):
    def do_test(obj):
        new_obj = type(obj).from_int_ptr(obj.int_ptr)
        assert obj == new_obj
        assert type(obj) is type(new_obj)

    ctx = ctx_factory()
    device, = ctx.devices
    platform = device.platform
    do_test(device)
    do_test(platform)
    do_test(ctx)

    queue = cl.CommandQueue(ctx)
    do_test(queue)

    evt = cl.enqueue_marker(queue)
    do_test(evt)

    prg = cl.Program(ctx, """
        __kernel void sum(__global float *a)
        { a[get_global_id(0)] *= 2; }
        """).build()

    do_test(prg)
    do_test(prg.sum)

    n = 2000
    a_buf = cl.Buffer(ctx, 0, n*4)
    do_test(a_buf)

    # crashes on intel...
    # and pocl does not support CL_ADDRESS_CLAMP
    if device.image_support and platform.vendor not in [
            "Intel(R) Corporation",
            "The pocl project",
            ]:
        smp = cl.Sampler(ctx, False,
                cl.addressing_mode.CLAMP,
                cl.filter_mode.NEAREST)
        do_test(smp)

        img_format = cl.get_supported_image_formats(
                ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0]

        img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256))
        do_test(img)
Example #15
def test_int_ptr(ctx_factory):
    def do_test(obj):
        new_obj = type(obj).from_int_ptr(obj.int_ptr)
        assert obj == new_obj
        assert type(obj) is type(new_obj)

    ctx = ctx_factory()
    device, = ctx.devices
    platform = device.platform
    do_test(device)
    do_test(platform)
    do_test(ctx)

    queue = cl.CommandQueue(ctx)
    do_test(queue)

    evt = cl.enqueue_marker(queue)
    do_test(evt)

    prg = cl.Program(
        ctx, """
        __kernel void sum(__global float *a)
        { a[get_global_id(0)] *= 2; }
        """).build()

    do_test(prg)
    do_test(prg.sum)

    n = 2000
    a_buf = cl.Buffer(ctx, 0, n * 4)
    do_test(a_buf)

    # crashes on intel...
    # and pocl does not support CL_ADDRESS_CLAMP
    if device.image_support and platform.vendor not in [
            "Intel(R) Corporation",
            "The pocl project",
    ]:
        smp = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP,
                         cl.filter_mode.NEAREST)
        do_test(smp)

        img_format = cl.get_supported_image_formats(
            ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0]

        img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256))
        do_test(img)
Example #16
    def run_concat(self, str1, str2, printspeed=False):
        mf = cl.mem_flags  # opencl memflag enum

        npstr1 = np.array(str1, dtype=np.void)
        str1_g = cl.Buffer(self.ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=npstr1)

        npstr2 = np.array(str2, dtype=np.void)
        str2_g = cl.Buffer(self.ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=npstr2)

        result = np.empty(npstr1.shape, dtype=np.dtype('V64'))
        result_g = cl.Buffer(
            self.ctx, mf.WRITE_ONLY, result.nbytes
        )  # buffer size is given in bytes

        #******************Call Kernel******************
        start_event = cl.enqueue_marker(self.queue)

        finish_event = self.prg.concat_str(self.queue, npstr1.shape, None,
                                           str1_g, str2_g, result_g)
        finish_event.wait()
        #****************** End Kernel Call ****************** - with the local size set to None, the runtime picks the work-group size automatically

        #		if(printspeed):
        #			print("OpenCL Speed: "+str(password_step/1e-9/(finish_event.profile.END-start_event.profile.START)/1000)+" K passphrase/s")
        rt = cl.enqueue_copy(
            self.queue, result, result_g
        )  # copy the result from device back to the host

        rbs = [rs.tobytes() for rs in result]

        return rbs
Example #17
def auto_test_vs_ref(ref_knl,
                     ctx,
                     test_knl=None,
                     op_count=[],
                     op_label=[],
                     parameters={},
                     print_ref_code=False,
                     print_code=True,
                     warmup_rounds=2,
                     dump_binary=False,
                     fills_entire_output=None,
                     do_check=True,
                     check_result=None,
                     max_test_kernel_count=1,
                     quiet=False,
                     blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: stop testing after this many *test_knl*
        variants
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                         "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated",
             DeprecationWarning,
             stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
        isinstance(arg, ImageArg) for arg in ref_knl.args)

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors,
                                                  need_ref_image_support):

        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
            ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev, 75 * "-",
                    traceback.format_exc(), 75 * "-"
                ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END -
                                    ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                         "reference computation.\n"
                         "These errors were encountered:\n" +
                         "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED, KernelState.LINEARIZED
    ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.type_inference import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel, kernel_info.implemented_data_info, queue,
                         ref_arg_data, parameters)

        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                        arg_desc.ref_storage_array.get(),
                        shape=arg_desc.ref_shape,
                        strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                        arg_desc.test_storage_array.get(),
                        shape=arg_desc.test_shape,
                        strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = max(warmup_rounds, 1)

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START -
                                         1e-9 * evt_start.profile.START) /
                                        timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl)

        if not quiet:

            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                  "(%d rounds)%s" %
                  (format_float_or_none(elapsed_event),
                   format_float_or_none(elapsed_event_marker),
                   format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                    ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
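
The elapsed_event_marker figure above comes from bracketing the whole batch of launches between two markers and dividing by the number of rounds; the try/except exists because some implementations do not attach profiling info to marker commands. A stripped-down sketch of just that bracketing, with an illustrative bump kernel standing in for the compiled loopy kernel.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

a = np.zeros(1 << 18, dtype=np.float32)
a_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)
prg = cl.Program(ctx, """
    __kernel void bump(__global float *a)
    { a[get_global_id(0)] += 1.0f; }
    """).build()

timing_rounds = 10
evt_start = cl.enqueue_marker(queue)
for _ in range(timing_rounds):
    prg.bump(queue, a.shape, None, a_g)
evt_end = cl.enqueue_marker(queue)
queue.finish()

try:
    elapsed_event_marker = (
        1e-9 * (evt_end.profile.START - evt_start.profile.START) / timing_rounds)
except cl.RuntimeError:
    elapsed_event_marker = None  # marker profiling not available on this platform
print("per-round marker time:", elapsed_event_marker)
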
Example #18
    def make_event(self, stream, timing=False):
        return EventWrapper(cl.enqueue_marker(stream.queue))
Example #19
    def wait_for_event(self, event):
        cl.enqueue_marker(self.queue, [event.event])
Example #20
    def make_event(self, stream, timing=False):
        return EventWrapper(cl.enqueue_marker(stream.queue))
Example #21
    def wait_for_event(self, event):
        cl.enqueue_marker(self.queue, [event.event])
Example #22
    def run_aes32_concat(self,
                         str1,
                         lens,
                         crypted_data1,
                         crypted_data2,
                         printspeed=False):
        def aes32(npstr1, lens, crypted):
            mf = cl.mem_flags  # opencl memflag enum
            str1_g = cl.Buffer(self.ctx,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=npstr1)
            result = np.empty(npstr1.shape, dtype=np.dtype('V' + str(lens)))
            result_s = cl.Buffer(
                self.ctx, mf.WRITE_ONLY, result.nbytes
            )  # buffer size is given in bytes

            finish_event = self.prg.substr(self.queue, npstr1.shape, None,
                                           str1_g, result_s, strlen)
            finish_event.wait()

            #end substr start aes
            mf = cl.mem_flags  # opencl memflag enum

            data_g = cl.Buffer(self.ctx,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=crypted)

            #result = np.zeros(len(hashkey), dtype=pyopencl.cltypes.long)# np.zeros(numberOftheElement,elementType) name should be sync with the "result variable" in OpenCL code
            result = np.empty(npstr1.shape, dtype=crypted.dtype)
            result_g = cl.Buffer(
                self.ctx, mf.WRITE_ONLY, result.nbytes
            )  # buffer size is given in bytes
            gdim = (npstr1.shape[0] * nAESblocks, )

            finish_event = self.prg.func_pbkdf2(self.queue, gdim, None,
                                                result_s, data_g, result_g,
                                                nAESblocks)
            finish_event.wait()
            return result_g

        #****************** End helper function ****************** - with the local size set to None, the runtime picks the work-group size automatically
        mf = cl.mem_flags
        start_event = cl.enqueue_marker(self.queue)
        npstr1 = np.array(str1, dtype=np.void)

        strlen = np.int32(len(npstr1[0]))
        nAESblocks = np.int32(len(crypted_data1) // 16)  # number of 16-byte AES blocks

        crypted_data = [crypted_data1]  #*len(hashkey)
        cryp1 = np.array(crypted_data1, dtype=np.void)
        result_g1 = aes32(npstr1, lens, cryp1)

        nAESblocks = np.int32(len(crypted_data2) // 16)

        crypted_data = [crypted_data2]  #*len(hashkey)
        cryp2 = np.array(crypted_data2, dtype=np.void)
        result_g2 = aes32(npstr1, lens, cryp2)

        result = np.empty(npstr1.shape, dtype=np.dtype('V64'))
        result_g = cl.Buffer(
            self.ctx, mf.WRITE_ONLY, result.nbytes
        )  # buffer size is given in bytes

        finish_event = self.prg.concat_str(self.queue, npstr1.shape, None,
                                           result_g1, result_g2, result_g)
        finish_event.wait()

        rt = cl.enqueue_copy(
            self.queue, result, result_g
        )  # copy the result from device back to the host

        rbs = [rs.tobytes() for rs in result]

        return rbs
Example #23
def test_wait_for_events(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue)
    cl.wait_for_events([evt1, evt2])
Example #24
    def __call__(self, arr, idx=None, queue=None, wait_for=None, axis=0):
        """
        :arg arr: the array to be sorted. Will be overwritten with the sorted array.
        :arg idx: an array of indices to be tracked along with the sorting of *arr*
        :arg queue: a :class:`pyopencl.CommandQueue`, defaults to the array's queue
            if None
        :arg wait_for: a list of :class:`pyopencl.Event` instances or None
        :arg axis: the axis of the array by which to sort

        :returns: a tuple (sorted_array, event)
        """

        if queue is None:
            queue = arr.queue

        if wait_for is None:
            wait_for = []
        wait_for = wait_for + arr.events

        last_evt = cl.enqueue_marker(queue, wait_for=wait_for)

        if arr.shape[axis] == 0:
            return arr, last_evt

        if not _is_power_of_2(arr.shape[axis]):
            raise ValueError("sorted array axis length must be a power of 2")

        if idx is None:
            argsort = 0
        else:
            argsort = 1

        run_queue = self.sort_b_prepare_wl(
            argsort, arr.dtype, idx.dtype if idx is not None else None,
            arr.shape, axis)

        knl, nt, wg, aux = run_queue[0]

        if idx is not None:
            if aux:
                last_evt = knl(queue, (nt, ),
                               wg,
                               arr.data,
                               idx.data,
                               cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] *
                                              arr.dtype.itemsize),
                               cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] *
                                              idx.dtype.itemsize),
                               wait_for=[last_evt])
            for knl, nt, wg, _ in run_queue[1:]:
                last_evt = knl(queue, (nt, ),
                               wg,
                               arr.data,
                               idx.data,
                               wait_for=[last_evt])

        else:
            if aux:
                last_evt = knl(queue, (nt, ),
                               wg,
                               arr.data,
                               cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] *
                                              4 * arr.dtype.itemsize),
                               wait_for=[last_evt])
            for knl, nt, wg, _ in run_queue[1:]:
                last_evt = knl(queue, (nt, ),
                               wg,
                               arr.data,
                               wait_for=[last_evt])

        return arr, last_evt
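
The launches above form a dependency chain: a marker over wait_for plus the array's pending events seeds last_evt, and every subsequent kernel passes wait_for=[last_evt]. A minimal sketch of that chaining, with an illustrative add_one kernel in place of the bitonic-sort kernels.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

a = np.arange(256, dtype=np.float32)
a_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    __kernel void add_one(__global float *a)
    { a[get_global_id(0)] += 1.0f; }
    """).build()

last_evt = cl.enqueue_marker(queue)      # head of the dependency chain
for _ in range(4):                       # four passes, each waiting on the previous one
    last_evt = prg.add_one(queue, a.shape, None, a_g, wait_for=[last_evt])

cl.enqueue_copy(queue, a, a_g, wait_for=[last_evt])  # blocking copy back to the host
print(a[:4])  # [4. 5. 6. 7.]
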
Example #25
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: stop testing after this many *test_knl*
        variants
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)

    # {{{ compile and run reference code

    from loopy.preprocess import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("%s (ref): trying %s for the reference calculation" % (
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75*"-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_cl_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" % (
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    from loopy.kernel import kernel_state
    if test_knl.state not in [
            kernel_state.PREPROCESSED,
            kernel_state.SCHEDULED]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.preprocess import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        if args is None:
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel,
                    cl_kernel_info.implemented_data_info,
                    queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75*"-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = warmup_rounds

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
Example #26
#!/usr/bin/env python

import pyopencl as cl
from fdtd3d import Fdtd3d
from datetime import datetime
import sys

nx = int(sys.argv[1])
tmax = int(sys.argv[2])
ny, nz = nx, nx

s = Fdtd3d(nx, ny, nz, target_device='gpu0', print_verbose=False)
#ez = s.eh_fieldss[0][2]
#ez[:] = np.random.rand(nx,nx,nx).astype(np.float32)
#print ez.shape

t0 = datetime.now()
for tstep in xrange(1, tmax + 1):
    s.update_h()
    s.update_e()
    #ez[2*nx/3,ny/2,:] += np.sin(0.1*tstep)

cl.enqueue_marker(s.queues[0]).wait()
dt0 = datetime.now() - t0
dt = dt0.seconds + dt0.microseconds * 1e-6

print dt
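
Here cl.enqueue_marker(...).wait() serves purely as a synchronization point: the host timer only stops once everything previously enqueued on that queue has completed (queue.finish() would do the same job). A minimal sketch of the idea with an illustrative bump kernel.

import numpy as np
import pyopencl as cl
from datetime import datetime

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

a = np.zeros(1 << 18, dtype=np.float32)
a_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)
prg = cl.Program(ctx, """
    __kernel void bump(__global float *a)
    { a[get_global_id(0)] += 1.0f; }
    """).build()

t0 = datetime.now()
for _ in range(100):
    prg.bump(queue, a.shape, None, a_g)
cl.enqueue_marker(queue).wait()   # completes only after all previously queued work
dt = datetime.now() - t0
print(dt.total_seconds())
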
Example #27
    def enqueue_marker(self) -> Event:
        return Event(pyopencl.enqueue_marker(self._pyopencl_command_queue))
Example #28
    def __call__(self, arr, idx=None, queue=None, wait_for=None, axis=0):
        """
        :arg arr: the array to be sorted. Will be overwritten with the sorted array.
        :arg idx: an array of indices to be tracked along with the sorting of *arr*
        :arg queue: a :class:`pyopencl.CommandQueue`, defaults to the array's queue
            if None
        :arg wait_for: a list of :class:`pyopencl.Event` instances or None
        :arg axis: the axis of the array by which to sort

        :returns: a tuple (sorted_array, event)
        """

        if queue is None:
            queue = arr.queue

        if wait_for is None:
            wait_for = []
        wait_for = wait_for + arr.events

        last_evt = cl.enqueue_marker(queue, wait_for=wait_for)

        if arr.shape[axis] == 0:
            return arr, last_evt

        if not _is_power_of_2(arr.shape[axis]):
            raise ValueError("sorted array axis length must be a power of 2")

        if idx is None:
            argsort = 0
        else:
            argsort = 1

        run_queue = self.sort_b_prepare_wl(argsort, arr.dtype, idx.dtype if idx is not None else None, arr.shape, axis)

        knl, nt, wg, aux = run_queue[0]

        if idx is not None:
            if aux:
                last_evt = knl(
                    queue,
                    (nt,),
                    wg,
                    arr.data,
                    idx.data,
                    cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * arr.dtype.itemsize),
                    cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * idx.dtype.itemsize),
                    wait_for=[last_evt],
                )
            for knl, nt, wg, _ in run_queue[1:]:
                last_evt = knl(queue, (nt,), wg, arr.data, idx.data, wait_for=[last_evt])

        else:
            if aux:
                last_evt = knl(
                    queue,
                    (nt,),
                    wg,
                    arr.data,
                    cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * 4 * arr.dtype.itemsize),
                    wait_for=[last_evt],
                )
            for knl, nt, wg, _ in run_queue[1:]:
                last_evt = knl(queue, (nt,), wg, arr.data, wait_for=[last_evt])

        return arr, last_evt
Example #29
#!/usr/bin/env python

import pyopencl as cl
from fdtd3d import Fdtd3d
from datetime import datetime
import sys

nx = int(sys.argv[1])
tmax = int(sys.argv[2])
ny, nz = nx, nx

s = Fdtd3d(nx, ny, nz, target_device='gpu0', print_verbose=False)
#ez = s.eh_fieldss[0][2]
#ez[:] = np.random.rand(nx,nx,nx).astype(np.float32)
#print ez.shape

t0 = datetime.now()
for tstep in xrange(1, tmax+1):
	s.update_h()
	s.update_e()
	#ez[2*nx/3,ny/2,:] += np.sin(0.1*tstep)

cl.enqueue_marker(s.queues[0]).wait()
dt0 = datetime.now() - t0
dt = dt0.seconds + dt0.microseconds * 1e-6

print dt
Example #30
import time

import numpy as np
import pyopencl as cl

from config import SIGNIFICANT_LENGTH, SIZE, MT_N, M, STATE_SIZE, TEST_ITERATIONS
# gen_kernel() is provided elsewhere in the project

MT_state_result = np.zeros((SIGNIFICANT_LENGTH, SIZE)).astype(np.uint32)

ctx = cl.create_some_context()
queue_instruction = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
queue_data = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

mf = cl.mem_flags

MT_state_buf = cl.Buffer(ctx, mf.WRITE_ONLY, SIZE * MT_N * 4)
MT_state_res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, MT_state_result.nbytes)

prg = cl.Program(ctx, gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH)).build()
z = cl.enqueue_marker(queue_instruction)

zzz = time.time()
instr_event = prg.mt_brute(queue_instruction, (SIZE, ), (STATE_SIZE, ), np.uint32(0), MT_state_buf, MT_state_res_buf)#, g_times_l=True)
data_event = cl.enqueue_copy(queue_instruction, MT_state_result, MT_state_res_buf, wait_for=[instr_event,])

for i in xrange(TEST_ITERATIONS):#2**31 / SIZE):
    instr_event = prg.mt_brute(queue_instruction, (SIZE, ), (STATE_SIZE, ), np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, wait_for=[data_event,])#, g_times_l=True)
    data_event = cl.enqueue_copy(queue_instruction, MT_state_result, MT_state_res_buf, wait_for=[instr_event,])
    data_event.wait()
        #for row in (tmp for tmp in MT_state_result[0]):
        #    f.write('{0}\n'.format(row))


z2 = cl.enqueue_marker(queue_instruction)
z2.wait()
Example #31
def test_wait_for_events(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue)
    cl.wait_for_events([evt1, evt2])