def run_aes32(self, str1, lens, crypted_data, printspeed=False): mf = cl.mem_flags # opencl memflag enum npstr1 = np.array(str1, dtype=np.void) str1_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=npstr1) strlen = np.int32(len(str1[0])) result = np.empty(npstr1.shape, dtype=np.dtype('V' + str(lens))) result_s = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte start_event = cl.enqueue_marker(self.queue) finish_event = self.prg.substr(self.queue, npstr1.shape, None, str1_g, result_s, strlen) finish_event.wait() #end substr start aes mf = cl.mem_flags # opencl memflag enum nAESblocks = np.int32(len(crypted_data) / 16) crypted_data = [crypted_data] #*len(hashkey) cryp = np.array(crypted_data, dtype=np.void) data_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cryp) #result = np.zeros(len(hashkey), dtype=pyopencl.cltypes.long)# np.zeros(numberOftheElement,elementType) name should be sync with the "result variable" in OpenCL code result = np.empty(npstr1.shape, dtype=cryp.dtype) result_g = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte gdim = (npstr1.shape[0] * nAESblocks, ) start_event = cl.enqueue_marker(self.queue) finish_event = self.prg.func_pbkdf2(self.queue, gdim, None, result_s, data_g, result_g, nAESblocks) finish_event.wait() #******************Call Kernel****************** ,if set localsize (512,) to None,the runtime will automatically takes care of block/grid distribution # if(printspeed): # print("OpenCL Speed: "+str(password_step/1e-9/(finish_event.profile.END-start_event.profile.START)/1000)+" K passphrase/s") rt = cl.enqueue_copy( self.queue, result, result_g ) # copy the result from device to host,type of "result" is a list of unsigned integer(32bit,4byte) rbs = [] i = 0 for rs in result: rbs.append(rs.tobytes()) #print(rbs[i].hex()) i += 1 return rbs
def test_enqueue_barrier_marker(ctx_factory): ctx = ctx_factory() _skip_if_pocl(ctx.devices[0].platform, 'pocl crashes on enqueue_barrier') queue = cl.CommandQueue(ctx) cl.enqueue_barrier(queue) evt1 = cl.enqueue_marker(queue) evt2 = cl.enqueue_marker(queue, wait_for=[evt1]) cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
def wrapper(self, insn, profile_data): if profile_data is None: return f(self, insn, profile_data) start = cl.enqueue_marker(self.array_context.queue) retval = f(self, insn, profile_data) end = cl.enqueue_marker(self.array_context.queue) profile_data\ .setdefault(time_field_name, TimingFutureList())\ .append(TimingFuture(start, end)) return retval
def map_profiled_operator_binding(self, expr, profile_data): if profile_data is None: return self.inner_mapper.map_operator_binding(expr) start = cl.enqueue_marker(self.array_context.queue) retval = self.inner_mapper.map_operator_binding(expr) end = cl.enqueue_marker(self.array_context.queue) time_field_name = "time_op_%s" % expr.op.mapper_method profile_data\ .setdefault(time_field_name, TimingFutureList())\ .append(TimingFuture(start, end)) return retval
def run_eqs(self, str1, str2, printspeed=False): mf = cl.mem_flags # opencl memflag enum npstr1 = np.array(str1, dtype=np.void) str1_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=npstr1) npstr2 = np.array(str2, dtype=np.void) str2_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=npstr2) result = np.empty(npstr1.shape, dtype=[('a', 'V64'), ('b', np.bool)]) result_g = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte #******************Call Kernel****************** start_event = cl.enqueue_marker(self.queue) finish_event = self.prg.concat_str(self.queue, npstr1.shape, None, str1_g, str2_g, result_g) finish_event.wait() rt = cl.enqueue_copy( self.queue, result, result_g ) # copy the result from device to host,type of "result" is a list of unsigned integer(32bit,4byte) rs = np.where(result == False) return rs
def _write_time_shift(self, queues): """Estimate the time shift between devices with respect to a global clock. This is important for evaluating relative device runtimes with respect to each other. """ # Get only a single command queue for a device on which we will # determine the zero time of a device. unique_queues = [] devices = [] for queue in queues: if queue.device not in devices: unique_queues.append(queue) devices.append(queue.device) starts = {} start = time.time() for i in range(len(devices)): starts[devices[i]] = cl.enqueue_marker(unique_queues[i]) d_t = (time.time() - start) * q.s cl.wait_for_events(list(starts.values())) for device in starts: starts[device] = starts[device].profile.queued # Write the zero time for every device into the profiling file. self._profile_file.write("# device\tinitial_time\n") for device in starts: self._cldevices[device] = self._cldevice_next() self._profile_file.write("%d\t%d\n" % (self._cldevices[device], starts[device])) self._profile_file.write("# END_INIT_T0\n") self._profile_file.write("# Relative device timing error\n%g\n" % d_t.rescale(q.ns)) self._profile_file.write("# END_INIT\n")
def run(self,password_start,password_step,printspeed=True): pwdim = (password_step,)# set a 1-dimension tuple to tell the runtime to generate a totalpws of kernel execution mf = cl.mem_flags# opencl memflag enum pass_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.array(password_start,dtype=np.uint64)) salt_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.pbkdf_salt) iv_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.aes_iv) data_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.encrypted_data) result = np.zeros(password_step, dtype=np.bool)# np.zeros(numberOftheElement,elementType) name should be sync with the "result variable" in OpenCL code result_g = cl.Buffer(self.ctx, mf.WRITE_ONLY, result.nbytes)# size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte # The total time GPU used can be indicated by measuring the finish_event-start_event #******************Call Kernel****************** start_event=cl.enqueue_marker(self.queue) finish_event=self.prg.func_pbkdf2(self.queue, pwdim,(512,), pass_g, salt_g, iv_g, data_g, result_g) finish_event.wait() #******************Call Kernel****************** ,if set localsize (512,) to None,the runtime will automatically takes care of block/grid distribution if(printspeed): print("OpenCL Speed: "+str(password_step/1e-9/(finish_event.profile.END-start_event.profile.START)/1000)+" K passphrase/s") cl.enqueue_copy(self.queue, result, result_g)# copy the result from device to host,type of "result" is a list of unsigned integer(32bit,4byte) return np.transpose(np.nonzero(result))+password_start# the array number of nonzero value is written to a list added by password_start value
def test_enqueue_barrier_marker(ctx_factory): ctx = ctx_factory() # Still relevant on pocl 1.0RC1. _xfail_if_pocl( ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier") queue = cl.CommandQueue(ctx) if queue._get_cl_version() >= (1, 2) and cl.get_cl_header_version() <= (1, 1): pytest.skip("CL impl version >= 1.2, header version <= 1.1--cannot be sure " "that clEnqueueWaitForEvents is implemented") cl.enqueue_barrier(queue) evt1 = cl.enqueue_marker(queue) evt2 = cl.enqueue_marker(queue, wait_for=[evt1]) cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
def run_substr(self, str1, lens, printspeed=False): mf = cl.mem_flags # opencl memflag enum npstr1 = np.array(str1, dtype=np.void) str1_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=npstr1) strlen = np.int32(len(str1[0])) result = np.empty(npstr1.shape, dtype=np.dtype('V' + str(lens))) result_g = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte #******************Call Kernel****************** start_event = cl.enqueue_marker(self.queue) finish_event = self.prg.concat_str(self.queue, npstr1.shape, None, str1_g, result_g, strlen) finish_event.wait() rt = cl.enqueue_copy( self.queue, result, result_g ) # copy the result from device to host,type of "result" is a list of unsigned integer(32bit,4byte) rbs = [] i = 0 for rs in result: rbs.append(rs.tobytes()) #print(rbs[i].hex()) i += 1 return rbs
def _write_time_shift(self, queues): """Estimate the time shift between devices with respect to a global clock. This is important for evaluating relative device runtimes with respect to each other. """ # Get only a single command queue for a device on which we will # determine the zero time of a device. unique_queues = [] devices = [] for queue in queues: if queue.device not in devices: unique_queues.append(queue) devices.append(queue.device) starts = {} start = time.time() for i in range(len(devices)): starts[devices[i]] = cl.enqueue_marker(unique_queues[i]) d_t = (time.time() - start) * q.s cl.wait_for_events(starts.values()) for device in starts: starts[device] = starts[device].profile.queued # Write the zero time for every device into the profiling file. self._profile_file.write("# device\tinitial_time\n") for device in starts: self._cldevices[device] = self._cldevice_next() self._profile_file.write("%d\t%d\n" % (self._cldevices[device], starts[device])) self._profile_file.write("# END_INIT_T0\n") self._profile_file.write("# Relative device timing error\n%g\n" % d_t.rescale(q.ns)) self._profile_file.write("# END_INIT\n")
def _enqueue_barrier(queue, wait_for): if queue.device.platform.name == "Portable Computing Language": # pocl 0.13 and below crash on clEnqueueBarrierWithWaitList evt = cl.enqueue_marker(queue, wait_for=wait_for) queue.finish() return evt else: return cl.enqueue_barrier(queue, wait_for=wait_for)
def test_int_ptr(ctx_factory): def do_test(obj): new_obj = type(obj).from_int_ptr(obj.int_ptr) assert obj == new_obj assert type(obj) is type(new_obj) ctx = ctx_factory() device, = ctx.devices platform = device.platform do_test(device) do_test(platform) do_test(ctx) queue = cl.CommandQueue(ctx) do_test(queue) evt = cl.enqueue_marker(queue) do_test(evt) prg = cl.Program(ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg) do_test(prg.sum) n = 2000 a_buf = cl.Buffer(ctx, 0, n*4) do_test(a_buf) # crashes on intel... # and pocl does not support CL_ADDRESS_CLAMP if device.image_support and platform.vendor not in [ "Intel(R) Corporation", "The pocl project", ]: smp = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) do_test(img)
def test_int_ptr(ctx_factory): def do_test(obj): new_obj = type(obj).from_int_ptr(obj.int_ptr) assert obj == new_obj assert type(obj) is type(new_obj) ctx = ctx_factory() device, = ctx.devices platform = device.platform do_test(device) do_test(platform) do_test(ctx) queue = cl.CommandQueue(ctx) do_test(queue) evt = cl.enqueue_marker(queue) do_test(evt) prg = cl.Program( ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg) do_test(prg.sum) n = 2000 a_buf = cl.Buffer(ctx, 0, n * 4) do_test(a_buf) # crashes on intel... # and pocl does not support CL_ADDRESS_CLAMP if device.image_support and platform.vendor not in [ "Intel(R) Corporation", "The pocl project", ]: smp = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) do_test(img)
def run_concat(self, str1, str2, printspeed=False): mf = cl.mem_flags # opencl memflag enum npstr1 = np.array(str1, dtype=np.void) str1_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=npstr1) npstr2 = np.array(str2, dtype=np.void) str2_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=npstr2) result = np.empty(npstr1.shape, dtype=np.dtype('V64')) result_g = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte #******************Call Kernel****************** start_event = cl.enqueue_marker(self.queue) finish_event = self.prg.concat_str(self.queue, npstr1.shape, None, str1_g, str2_g, result_g) finish_event.wait() #******************Call Kernel****************** ,if set localsize (512,) to None,the runtime will automatically takes care of block/grid distribution # if(printspeed): # print("OpenCL Speed: "+str(password_step/1e-9/(finish_event.profile.END-start_event.profile.START)/1000)+" K passphrase/s") rt = cl.enqueue_copy( self.queue, result, result_g ) # copy the result from device to host,type of "result" is a list of unsigned integer(32bit,4byte) rbs = [] i = 0 for rs in result: rbs.append(rs.tobytes()) #print(rbs[i].hex()) i += 1 return rbs
def auto_test_vs_ref(ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, max_test_kernel_count=1, quiet=False, blacklist_ref_vendors=[]): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. :arg check_result: a callable with :class:`numpy.ndarray` arguments *(result, reference_result)* returning a a tuple (class:`bool`, message) indicating correctness/acceptability of the result :arg max_test_kernel_count: Stop testing after this many *test_knl* """ import pyopencl as cl if test_knl is None: test_knl = ref_knl do_check = False if len(ref_knl.args) != len(test_knl.args): raise LoopyError("ref_knl and test_knl do not have the same number " "of arguments") for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)): if ref_arg.name != test_arg.name: raise LoopyError( "ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i + 1)) if ref_arg.dtype != test_arg.dtype: raise LoopyError( "ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i + 1)) from loopy.compiled import CompiledKernel from loopy.target.execution import get_highlighted_code if isinstance(op_count, (int, float)): warn("op_count should be a list", stacklevel=2) op_count = [op_count] if isinstance(op_label, str): warn("op_label should be a list", stacklevel=2) op_label = [op_label] from time import time if check_result is None: check_result = _default_check_result if fills_entire_output is not None: warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2) # {{{ compile and run reference code from loopy.type_inference import infer_unknown_types ref_knl = infer_unknown_types(ref_knl, expect_completion=True) found_ref_device = False ref_errors = [] from loopy.kernel.data import ImageArg need_ref_image_support = any( isinstance(arg, ImageArg) for arg in ref_knl.args) for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors, need_ref_image_support): ref_ctx = cl.Context([dev]) ref_queue = cl.CommandQueue( ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) pp_ref_knl = lp.preprocess_kernel(ref_knl) for knl in lp.generate_loop_schedules(pp_ref_knl): ref_sched_kernel = knl break logger.info("{} (ref): trying {} for the reference calculation".format( ref_knl.name, dev)) ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) if not quiet and print_ref_code: print(75 * "-") print("Reference Code:") print(75 * "-") print(get_highlighted_code(ref_compiled.get_code())) print(75 * "-") ref_kernel_info = ref_compiled.kernel_info(frozenset()) try: ref_args, ref_arg_data = \ make_ref_args(ref_sched_kernel, ref_kernel_info.implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: import traceback ref_errors.append("\n".join([ 75 * "-", "On %s:" % dev, 75 * "-", traceback.format_exc(), 75 * "-" ])) continue else: raise found_ref_device = True if not do_check: break ref_queue.finish() logger.info("{} (ref): using {} for the reference calculation".format( ref_knl.name, dev)) logger.info("%s (ref): run" % ref_knl.name) ref_start = time() if not AUTO_TEST_SKIP_RUN: ref_evt, _ = ref_compiled(ref_queue, **ref_args) else: ref_evt = cl.enqueue_marker(ref_queue) ref_queue.finish() ref_stop = time() ref_elapsed_wall = ref_stop - ref_start logger.info("%s (ref): run done" % ref_knl.name) ref_evt.wait() ref_elapsed_event = 1e-9 * (ref_evt.profile.END - ref_evt.profile.START) break if not found_ref_device: raise LoopyError("could not find a suitable device for the " "reference computation.\n" "These errors were encountered:\n" + "\n".join(ref_errors)) # }}} # {{{ compile and run parallel code need_check = do_check queue = cl.CommandQueue( ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget if test_knl.state not in [ KernelState.PREPROCESSED, KernelState.LINEARIZED ]: if isinstance(test_knl.target, PyOpenCLTarget): test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) test_knl = lp.preprocess_kernel(test_knl) if not test_knl.schedule: test_kernels = lp.generate_loop_schedules(test_knl) else: test_kernels = [test_knl] test_kernel_count = 0 from loopy.type_inference import infer_unknown_types for i, kernel in enumerate(test_kernels): test_kernel_count += 1 if test_kernel_count > max_test_kernel_count: break kernel = infer_unknown_types(kernel, expect_completion=True) compiled = CompiledKernel(ctx, kernel) kernel_info = compiled.kernel_info(frozenset()) args = make_args(kernel, kernel_info.implemented_data_info, queue, ref_arg_data, parameters) args["out_host"] = False if not quiet: print(75 * "-") print("Kernel #%d:" % i) print(75 * "-") if print_code: print(compiled.get_highlighted_code()) print(75 * "-") if dump_binary: # {{{ find cl program for name in dir(kernel_info.cl_kernels): if name.startswith("__"): continue cl_kernel = getattr(kernel_info.cl_kernels, name) cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM) break else: assert False, "could not find cl_program" # }}} print(type(cl_program)) if hasattr(cl_program, "binaries"): print(cl_program.binaries[0]) print(75 * "-") logger.info("%s: run warmup" % (knl.name)) for i in range(warmup_rounds): if not AUTO_TEST_SKIP_RUN: compiled(queue, **args) if need_check and not AUTO_TEST_SKIP_RUN: for arg_desc in ref_arg_data: if arg_desc is None: continue if not arg_desc.needs_checking: continue from pyopencl.compyte.array import as_strided ref_ary = as_strided( arg_desc.ref_storage_array.get(), shape=arg_desc.ref_shape, strides=arg_desc.ref_numpy_strides).flatten() test_ary = as_strided( arg_desc.test_storage_array.get(), shape=arg_desc.test_shape, strides=arg_desc.test_numpy_strides).flatten() common_len = min(len(ref_ary), len(test_ary)) ref_ary = ref_ary[:common_len] test_ary = test_ary[:common_len] error_is_small, error = check_result(test_ary, ref_ary) if not error_is_small: raise AutomaticTestFailure(error) need_check = False events = [] queue.finish() logger.info("%s: warmup done" % (knl.name)) logger.info("%s: timing run" % (knl.name)) timing_rounds = max(warmup_rounds, 1) while True: from time import time start_time = time() evt_start = cl.enqueue_marker(queue) for i in range(timing_rounds): if not AUTO_TEST_SKIP_RUN: evt, _ = compiled(queue, **args) events.append(evt) else: events.append(cl.enqueue_marker(queue)) evt_end = cl.enqueue_marker(queue) queue.finish() stop_time = time() for evt in events: evt.wait() evt_start.wait() evt_end.wait() elapsed_event = (1e-9*events[-1].profile.END - 1e-9*events[0].profile.START) \ / timing_rounds try: elapsed_event_marker = ((1e-9 * evt_end.profile.START - 1e-9 * evt_start.profile.START) / timing_rounds) except cl.RuntimeError: elapsed_event_marker = None elapsed_wall = (stop_time - start_time) / timing_rounds if elapsed_wall * timing_rounds < 0.3: timing_rounds *= 4 else: break logger.info("%s: timing run done" % (knl.name)) rates = "" for cnt, lbl in zip(op_count, op_label): rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl) if not quiet: def format_float_or_none(v): if v is None: return "<unavailable>" else: return "%g" % v print("elapsed: %s s event, %s s marker-event %s s wall " "(%d rounds)%s" % (format_float_or_none(elapsed_event), format_float_or_none(elapsed_event_marker), format_float_or_none(elapsed_wall), timing_rounds, rates)) if do_check: ref_rates = "" for cnt, lbl in zip(op_count, op_label): ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl) if not quiet: print("ref: elapsed: {:g} s event, {:g} s wall{}".format( ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} result_dict = {} result_dict["elapsed_event"] = elapsed_event result_dict["elapsed_event_marker"] = elapsed_event_marker result_dict["elapsed_wall"] = elapsed_wall result_dict["timing_rounds"] = timing_rounds if do_check: result_dict["ref_elapsed_event"] = ref_elapsed_event result_dict["ref_elapsed_wall"] = ref_elapsed_wall return result_dict
def make_event(self, stream, timing=False): return EventWrapper(cl.enqueue_marker(stream.queue))
def wait_for_event(self, event): cl.enqueue_marker(self.queue, [event.event])
def run_aes32_concat(self, str1, lens, crypted_data1, crypted_data2, printspeed=False): def aes32(npstr1, lens, crypted): mf = cl.mem_flags # opencl memflag enum str1_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=npstr1) result = np.empty(npstr1.shape, dtype=np.dtype('V' + str(lens))) result_s = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte finish_event = self.prg.substr(self.queue, npstr1.shape, None, str1_g, result_s, strlen) finish_event.wait() #end substr start aes mf = cl.mem_flags # opencl memflag enum data_g = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=crypted) #result = np.zeros(len(hashkey), dtype=pyopencl.cltypes.long)# np.zeros(numberOftheElement,elementType) name should be sync with the "result variable" in OpenCL code result = np.empty(npstr1.shape, dtype=crypted.dtype) result_g = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte gdim = (npstr1.shape[0] * nAESblocks, ) finish_event = self.prg.func_pbkdf2(self.queue, gdim, None, result_s, data_g, result_g, nAESblocks) finish_event.wait() #******************End function Kernel****************** ,if set localsize (512,) to None,the runtime will automatically takes care of block/grid distribution mf = cl.mem_flags start_event = cl.enqueue_marker(self.queue) npstr1 = np.array(str1, dtype=np.void) strlen = np.int32(len(npstr1[0])) nAESblocks = np.int32(len(crypted_data1) / 16) crypted_data = [crypted_data1] #*len(hashkey) cryp1 = np.array(crypted_data1, dtype=np.void) result_g1 = aes32(npstr1, lens, cryp1) nAESblocks = np.int32(len(crypted_data2) / 16) crypted_data = [crypted_data2] #*len(hashkey) cryp2 = np.array(crypted_data2, dtype=np.void) result_g2 = aes32(npstr1, lens, cryp2) result = np.empty(npstr1.shape, dtype=np.dtype('V64')) result_g = cl.Buffer( self.ctx, mf.WRITE_ONLY, result.nbytes ) # size should be in byte, 1byte=8bit; notice that in python, bool=8bit=1byte finish_event = self.prg.concat_str(self.queue, npstr1.shape, None, result_g1, result_g2, result_g) finish_event.wait() rt = cl.enqueue_copy( self.queue, result, result_g ) # copy the result from device to host,type of "result" is a list of unsigned integer(32bit,4byte) rbs = [] i = 0 for rs in result: rbs.append(rs.tobytes()) #print(rbs[i].hex()) i += 1 return rbs
def test_wait_for_events(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) evt1 = cl.enqueue_marker(queue) evt2 = cl.enqueue_marker(queue) cl.wait_for_events([evt1, evt2])
def __call__(self, arr, idx=None, queue=None, wait_for=None, axis=0): """ :arg arr: the array to be sorted. Will be overwritten with the sorted array. :arg idx: an array of indices to be tracked along with the sorting of *arr* :arg queue: a :class:`pyopencl.CommandQueue`, defaults to the array's queue if None :arg wait_for: a list of :class:`pyopencl.Event` instances or None :arg axis: the axis of the array by which to sort :returns: a tuple (sorted_array, event) """ if queue is None: queue = arr.queue if wait_for is None: wait_for = [] wait_for = wait_for + arr.events last_evt = cl.enqueue_marker(queue, wait_for=wait_for) if arr.shape[axis] == 0: return arr, last_evt if not _is_power_of_2(arr.shape[axis]): raise ValueError("sorted array axis length must be a power of 2") if idx is None: argsort = 0 else: argsort = 1 run_queue = self.sort_b_prepare_wl( argsort, arr.dtype, idx.dtype if idx is not None else None, arr.shape, axis) knl, nt, wg, aux = run_queue[0] if idx is not None: if aux: last_evt = knl(queue, (nt, ), wg, arr.data, idx.data, cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * arr.dtype.itemsize), cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * idx.dtype.itemsize), wait_for=[last_evt]) for knl, nt, wg, _ in run_queue[1:]: last_evt = knl(queue, (nt, ), wg, arr.data, idx.data, wait_for=[last_evt]) else: if aux: last_evt = knl(queue, (nt, ), wg, arr.data, cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * 4 * arr.dtype.itemsize), wait_for=[last_evt]) for knl, nt, wg, _ in run_queue[1:]: last_evt = knl(queue, (nt, ), wg, arr.data, wait_for=[last_evt]) return arr, last_evt
def auto_test_vs_ref( ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, max_test_kernel_count=1, quiet=False, blacklist_ref_vendors=[]): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. :arg check_result: a callable with :class:`numpy.ndarray` arguments *(result, reference_result)* returning a a tuple (class:`bool`, message) indicating correctness/acceptability of the result :arg max_test_kernel_count: Stop testing after this many *test_knl* """ import pyopencl as cl if test_knl is None: test_knl = ref_knl do_check = False if len(ref_knl.args) != len(test_knl.args): raise LoopyError("ref_knl and test_knl do not have the same number " "of arguments") for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)): if ref_arg.name != test_arg.name: raise LoopyError("ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i+1)) if ref_arg.dtype != test_arg.dtype: raise LoopyError("ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i+1)) from loopy.compiled import CompiledKernel, get_highlighted_cl_code if isinstance(op_count, (int, float)): warn("op_count should be a list", stacklevel=2) op_count = [op_count] if isinstance(op_label, str): warn("op_label should be a list", stacklevel=2) op_label = [op_label] from time import time if check_result is None: check_result = _default_check_result if fills_entire_output is not None: warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2) # {{{ compile and run reference code from loopy.preprocess import infer_unknown_types ref_knl = infer_unknown_types(ref_knl, expect_completion=True) found_ref_device = False ref_errors = [] for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors): ref_ctx = cl.Context([dev]) ref_queue = cl.CommandQueue(ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) pp_ref_knl = lp.preprocess_kernel(ref_knl) for knl in lp.generate_loop_schedules(pp_ref_knl): ref_sched_kernel = knl break logger.info("%s (ref): trying %s for the reference calculation" % ( ref_knl.name, dev)) ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) if not quiet and print_ref_code: print(75*"-") print("Reference Code:") print(75*"-") print(get_highlighted_cl_code(ref_compiled.code)) print(75*"-") ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset()) try: ref_args, ref_arg_data = \ make_ref_args(ref_sched_kernel, ref_cl_kernel_info.implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: import traceback ref_errors.append("\n".join([ 75*"-", "On %s:" % dev, 75*"-", traceback.format_exc(), 75*"-"])) continue else: raise found_ref_device = True if not do_check: break ref_queue.finish() logger.info("%s (ref): using %s for the reference calculation" % ( ref_knl.name, dev)) logger.info("%s (ref): run" % ref_knl.name) ref_start = time() if not AUTO_TEST_SKIP_RUN: ref_evt, _ = ref_compiled(ref_queue, **ref_args) else: ref_evt = cl.enqueue_marker(ref_queue) ref_queue.finish() ref_stop = time() ref_elapsed_wall = ref_stop-ref_start logger.info("%s (ref): run done" % ref_knl.name) ref_evt.wait() ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) break if not found_ref_device: raise LoopyError("could not find a suitable device for the " "reference computation.\n" "These errors were encountered:\n"+"\n".join(ref_errors)) # }}} # {{{ compile and run parallel code need_check = do_check queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) args = None from loopy.kernel import kernel_state if test_knl.state not in [ kernel_state.PREPROCESSED, kernel_state.SCHEDULED]: test_knl = lp.preprocess_kernel(test_knl) if not test_knl.schedule: test_kernels = lp.generate_loop_schedules(test_knl) else: test_kernels = [test_knl] test_kernel_count = 0 from loopy.preprocess import infer_unknown_types for i, kernel in enumerate(test_kernels): test_kernel_count += 1 if test_kernel_count > max_test_kernel_count: break kernel = infer_unknown_types(kernel, expect_completion=True) compiled = CompiledKernel(ctx, kernel) if args is None: cl_kernel_info = compiled.cl_kernel_info(frozenset()) args = make_args(kernel, cl_kernel_info.implemented_data_info, queue, ref_arg_data, parameters) args["out_host"] = False if not quiet: print(75*"-") print("Kernel #%d:" % i) print(75*"-") if print_code: print(compiled.get_highlighted_code()) print(75*"-") if dump_binary: print(type(compiled.cl_program)) print(compiled.cl_program.binaries[0]) print(75*"-") logger.info("%s: run warmup" % (knl.name)) for i in range(warmup_rounds): if not AUTO_TEST_SKIP_RUN: compiled(queue, **args) if need_check and not AUTO_TEST_SKIP_RUN: for arg_desc in ref_arg_data: if arg_desc is None: continue if not arg_desc.needs_checking: continue from pyopencl.compyte.array import as_strided ref_ary = as_strided( arg_desc.ref_storage_array.get(), shape=arg_desc.ref_shape, strides=arg_desc.ref_numpy_strides).flatten() test_ary = as_strided( arg_desc.test_storage_array.get(), shape=arg_desc.test_shape, strides=arg_desc.test_numpy_strides).flatten() common_len = min(len(ref_ary), len(test_ary)) ref_ary = ref_ary[:common_len] test_ary = test_ary[:common_len] error_is_small, error = check_result(test_ary, ref_ary) if not error_is_small: raise AutomaticTestFailure(error) need_check = False events = [] queue.finish() logger.info("%s: warmup done" % (knl.name)) logger.info("%s: timing run" % (knl.name)) timing_rounds = warmup_rounds while True: from time import time start_time = time() evt_start = cl.enqueue_marker(queue) for i in range(timing_rounds): if not AUTO_TEST_SKIP_RUN: evt, _ = compiled(queue, **args) events.append(evt) else: events.append(cl.enqueue_marker(queue)) evt_end = cl.enqueue_marker(queue) queue.finish() stop_time = time() for evt in events: evt.wait() evt_start.wait() evt_end.wait() elapsed_event = (1e-9*events[-1].profile.END - 1e-9*events[0].profile.START) \ / timing_rounds try: elapsed_event_marker = ((1e-9*evt_end.profile.START - 1e-9*evt_start.profile.START) / timing_rounds) except cl.RuntimeError: elapsed_event_marker = None elapsed_wall = (stop_time-start_time)/timing_rounds if elapsed_wall * timing_rounds < 0.3: timing_rounds *= 4 else: break logger.info("%s: timing run done" % (knl.name)) rates = "" for cnt, lbl in zip(op_count, op_label): rates += " %g %s/s" % (cnt/elapsed_wall, lbl) if not quiet: def format_float_or_none(v): if v is None: return "<unavailable>" else: return "%g" % v print("elapsed: %s s event, %s s marker-event %s s wall " "(%d rounds)%s" % ( format_float_or_none(elapsed_event), format_float_or_none(elapsed_event_marker), format_float_or_none(elapsed_wall), timing_rounds, rates)) if do_check: ref_rates = "" for cnt, lbl in zip(op_count, op_label): ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) if not quiet: print("ref: elapsed: %g s event, %g s wall%s" % ( ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} result_dict = {} result_dict["elapsed_event"] = elapsed_event result_dict["elapsed_event_marker"] = elapsed_event_marker result_dict["elapsed_wall"] = elapsed_wall result_dict["timing_rounds"] = timing_rounds if do_check: result_dict["ref_elapsed_event"] = ref_elapsed_event result_dict["ref_elapsed_wall"] = ref_elapsed_wall return result_dict
#!/usr/bin/env python import pyopencl as cl from fdtd3d import Fdtd3d from datetime import datetime import sys nx = int(sys.argv[1]) tmax = int(sys.argv[2]) ny, nz = nx, nx s = Fdtd3d(nx, ny, nz, target_device='gpu0', print_verbose=False) #ez = s.eh_fieldss[0][2] #ez[:] = np.random.rand(nx,nx,nx).astype(np.float32) #print ez.shape t0 = datetime.now() for tstep in xrange(1, tmax + 1): s.update_h() s.update_e() #ez[2*nx/3,ny/2,:] += np.sin(0.1*tstep) cl.enqueue_marker(s.queues[0]).wait() dt0 = datetime.now() - t0 dt = dt0.seconds + dt0.microseconds * 1e-6 print dt
def enqueue_marker(self) -> Event: return Event(pyopencl.enqueue_marker(self._pyopencl_command_queue))
def __call__(self, arr, idx=None, queue=None, wait_for=None, axis=0): """ :arg arr: the array to be sorted. Will be overwritten with the sorted array. :arg idx: an array of indices to be tracked along with the sorting of *arr* :arg queue: a :class:`pyopencl.CommandQueue`, defaults to the array's queue if None :arg wait_for: a list of :class:`pyopencl.Event` instances or None :arg axis: the axis of the array by which to sort :returns: a tuple (sorted_array, event) """ if queue is None: queue = arr.queue if wait_for is None: wait_for = [] wait_for = wait_for + arr.events last_evt = cl.enqueue_marker(queue, wait_for=wait_for) if arr.shape[axis] == 0: return arr, last_evt if not _is_power_of_2(arr.shape[axis]): raise ValueError("sorted array axis length must be a power of 2") if idx is None: argsort = 0 else: argsort = 1 run_queue = self.sort_b_prepare_wl(argsort, arr.dtype, idx.dtype if idx is not None else None, arr.shape, axis) knl, nt, wg, aux = run_queue[0] if idx is not None: if aux: last_evt = knl( queue, (nt,), wg, arr.data, idx.data, cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * arr.dtype.itemsize), cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * idx.dtype.itemsize), wait_for=[last_evt], ) for knl, nt, wg, _ in run_queue[1:]: last_evt = knl(queue, (nt,), wg, arr.data, idx.data, wait_for=[last_evt]) else: if aux: last_evt = knl( queue, (nt,), wg, arr.data, cl.LocalMemory(_tmpl.LOCAL_MEM_FACTOR * wg[0] * 4 * arr.dtype.itemsize), wait_for=[last_evt], ) for knl, nt, wg, _ in run_queue[1:]: last_evt = knl(queue, (nt,), wg, arr.data, wait_for=[last_evt]) return arr, last_evt
#!/usr/bin/env python import pyopencl as cl from fdtd3d import Fdtd3d from datetime import datetime import sys nx = int(sys.argv[1]) tmax = int(sys.argv[2]) ny, nz = nx, nx s = Fdtd3d(nx, ny, nz, target_device='gpu0', print_verbose=False) #ez = s.eh_fieldss[0][2] #ez[:] = np.random.rand(nx,nx,nx).astype(np.float32) #print ez.shape t0 = datetime.now() for tstep in xrange(1, tmax+1): s.update_h() s.update_e() #ez[2*nx/3,ny/2,:] += np.sin(0.1*tstep) cl.enqueue_marker(s.queues[0]).wait() dt0 = datetime.now() - t0 dt = dt0.seconds + dt0.microseconds * 1e-6 print dt
from config import SIGNIFICANT_LENGTH, SIZE, MT_N, M, STATE_SIZE, TEST_ITERATIONS MT_state_result = np.zeros((SIGNIFICANT_LENGTH, SIZE)).astype(np.uint32) ctx = cl.create_some_context() queue_instruction = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) queue_data = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) mf = cl.mem_flags MT_state_buf = cl.Buffer(ctx, mf.WRITE_ONLY, SIZE * MT_N * 4) MT_state_res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, MT_state_result.nbytes) prg = cl.Program(ctx, gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH)).build() z = cl.enqueue_marker(queue_instruction) zzz = time.time() instr_event = prg.mt_brute(queue_instruction, (SIZE, ), (STATE_SIZE, ), np.uint32(0), MT_state_buf, MT_state_res_buf)#, g_times_l=True) data_event = cl.enqueue_copy(queue_instruction, MT_state_result, MT_state_res_buf, wait_for=[instr_event,]) for i in xrange(TEST_ITERATIONS):#2**31 / SIZE): instr_event = prg.mt_brute(queue_instruction, (SIZE, ), (STATE_SIZE, ), np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, wait_for=[data_event,])#, g_times_l=True) data_event = cl.enqueue_copy(queue_instruction, MT_state_result, MT_state_res_buf, wait_for=[instr_event,]) data_event.wait() #for row in (tmp for tmp in MT_state_result[0]): # f.write('{0}\n'.format(row)) z2 = cl.enqueue_marker(queue_instruction) z2.wait()