def __init__(self, ctx=None, queue=None):
    self._PlanckConstantReduced = 1.0545717e-34
    # wavelength of cooling laser
    lam = 313.0e-9
    # wave vector
    self.k0 = numpy.array([0, 0, 2.0 * numpy.pi / lam], dtype=numpy.float32)
    self.x0 = numpy.array([0, 0, 0], dtype=numpy.float32)
    # 1/e radius of cooling laser
    self.sigma = 1.0e-3
    # line width (unsaturated)
    self.gamma = 2.0 * numpy.pi * 19.0e6
    # Detuning at zero velocity
    self.delta0 = -0.5 * self.gamma
    # Saturation parameter
    self.S = 0.1

    self.ctx = ctx
    self.queue = queue
    if self.ctx is None:
        self.ctx = cl.create_some_context()
    if self.queue is None:
        self.queue = cl.CommandQueue(
            self.ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    absolutePathToKernels = os.path.dirname(os.path.realpath(__file__))
    src = open(absolutePathToKernels + '/cooling_laser_advance.cl', 'r').read()
    self.program = cl.Program(self.ctx, src)
    try:
        self.program.build()
    except:
        print("Error:")
        print(self.program.get_build_info(self.ctx.devices[0],
                                          cl.program_build_info.LOG))
        raise

    self.program.compute_mean_scattered_photons_homogeneous_beam.set_scalar_arg_dtypes(
        [None, None, None, None, None, None,
         numpy.float32, numpy.float32, numpy.float32, numpy.float32,
         numpy.float32, numpy.float32, numpy.float32,
         numpy.int32, None])
    self.program.compute_mean_scattered_photons_gaussian_beam.set_scalar_arg_dtypes(
        [None, None, None, None, None, None,
         numpy.float32, numpy.float32, numpy.float32, numpy.float32,
         numpy.float32, numpy.float32, numpy.float32, numpy.float32,
         numpy.float32, numpy.float32, numpy.float32,
         numpy.int32, None])
    self.program.countEmissions.set_scalar_arg_dtypes(
        [None, None, numpy.int32, None, numpy.int32])
    self.program.computeKicks.set_scalar_arg_dtypes(
        [None, None, numpy.int32, None,
         numpy.float32, numpy.float32, numpy.float32, numpy.float32,
         numpy.float32, None, None, None, numpy.int32])

    self.generator = cl_random.RanluxGenerator(self.queue,
                                               num_work_items=128,
                                               luxury=1,
                                               seed=None,
                                               no_warmup=False,
                                               use_legacy_init=False,
                                               max_work_items=None)
def loadProgram(self):
    from functools import reduce  # built-in on Python 2; needed on Python 3
    src = reduce(
        lambda accum, filename: accum + open(filename, "r").read(),
        ["gpu_md5lib.cl", "gpu_brute.cl"], "")
    self.program = cl.Program(self.ctx, src).build()
def loadKernel(self, device):
    """Load the kernel and initialize the device."""
    self.context = cl.Context([device], None, None)

    # These definitions are required for the kernel to function.
    self.defines += (' -DOUTPUT_SIZE=' + str(self.OUTPUT_SIZE))
    self.defines += (' -DOUTPUT_MASK=' + str(self.OUTPUT_SIZE - 1))

    # If the user wants to mine with vectors, enable the appropriate code
    # in the kernel source.
    if self.VECTORS:
        self.defines += ' -DVECTORS'

    # Some AMD devices support a special "bitalign" instruction that makes
    # bitwise rotation (required for SHA-256) much faster.
    if (device.extensions.find('cl_amd_media_ops') != -1):
        self.defines += ' -DBITALIGN'
        # Enable the experimental BFI_INT instruction optimization.
        if self.BFI_INT:
            self.defines += ' -DBFI_INT'
    else:
        # Since BFI_INT requires cl_amd_media_ops, disable it.
        if self.BFI_INT:
            self.BFI_INT = False

    # Locate and read the OpenCL source code in the kernel's directory.
    kernelFileDir, pyfile = os.path.split(__file__)
    kernelFilePath = os.path.join(kernelFileDir, 'kernel.cl')
    kernelFile = open(kernelFilePath, 'r')
    kernel = kernelFile.read()
    kernelFile.close()

    # For fast startup, we cache the compiled OpenCL code. The name of the
    # cache is determined as the hash of a few important,
    # compilation-specific pieces of information.
    m = md5()
    m.update(device.platform.name)
    m.update(device.platform.version)
    m.update(device.name)
    m.update(self.defines)
    m.update(kernel)
    cacheName = '%s.elf' % m.hexdigest()
    fileName = os.path.join(kernelFileDir, cacheName)

    # Finally, the actual work of loading the kernel...
    try:
        binary = open(fileName, 'rb')
    except IOError:
        binary = None

    try:
        if binary is None:
            self.kernel = cl.Program(self.context, kernel).build(self.defines)
            # Apply BFI_INT if enabled.
            if self.BFI_INT:
                # Patch the binary output from the compiler.
                patcher = BFIPatcher(self.interface)
                binaryData = patcher.patch(self.kernel.binaries[0])
                self.interface.debug("Applied BFI_INT patch")
                # Reload the kernel with the patched binary.
                self.kernel = cl.Program(self.context, [device],
                                         [binaryData]).build(self.defines)
            # Write the kernel binaries to file.
            binaryW = open(fileName, 'wb')
            binaryW.write(self.kernel.binaries[0])
            binaryW.close()
        else:
            binaryData = binary.read()
            self.kernel = cl.Program(self.context, [device],
                                     [binaryData]).build(self.defines)
    except cl.LogicError:
        self.interface.fatal("Failed to compile OpenCL kernel!")
        return
    except PatchError:
        self.interface.fatal('Failed to apply BFI_INT patch to kernel! '
                             'Is BFI_INT supported on this hardware?')
        return
    finally:
        if binary:
            binary.close()

    cl.unload_compiler()

    # If the user didn't specify their own worksize, use the maximum
    # supported by the device.
    maxSize = self.kernel.search.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

    if self.WORKSIZE is None:
        self.WORKSIZE = maxSize
    else:
        if self.WORKSIZE > maxSize:
            self.interface.log('Warning: Worksize exceeds the maximum of '
                               + str(maxSize) + ', using default.')
        if self.WORKSIZE < 1:
            self.interface.log('Warning: Invalid worksize, using default.')

        self.WORKSIZE = min(self.WORKSIZE, maxSize)
        self.WORKSIZE = max(self.WORKSIZE, 1)
        # If the worksize is not a power of 2, round down to the nearest one.
        if (self.WORKSIZE & (self.WORKSIZE - 1)) != 0:
            self.WORKSIZE = 1 << int(math.floor(
                math.log(self.WORKSIZE) / math.log(2)))

    self.interface.setWorkFactor(self.WORKSIZE)
my_gpu_devices = [platform[0].get_devices(device_type=cl.device_type.GPU)[1]]
context = cl.Context(devices=my_gpu_devices)
#context = cl.create_some_context()
queue = cl.CommandQueue(context)

# Create OpenCL buffers
buffer_a = cl.Buffer(context,
                     cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                     hostbuf=mat_a)
buffer_b = cl.Buffer(context,
                     cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                     hostbuf=mat_b)
buffer_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, mat_c.nbytes)

# Program
program = cl.Program(context, c_dot_product_kernel).build()
program.dotProduct.set_scalar_arg_dtypes([np.int32, None, None, None])

start_time = time()
program.dotProduct(queue, (1024, ), (1024 // 16, ),
                   widthA, buffer_a, buffer_b, buffer_c)
queue.finish()
run_time = time()

# Move the kernel's output data to host memory.
cl.enqueue_copy(queue, mat_c, buffer_c)
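# A minimal validation sketch (not part of the original source): before the
# launch above, confirm the chosen local size is legal for this kernel.
# `program` and `context` are the objects built in the preceding lines.
device = context.devices[0]
max_wg = program.dotProduct.get_work_group_info(
    cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
global_size, local_size = 1024, 1024 // 16
assert local_size <= max_wg, "local size exceeds the kernel's limit"
assert global_size % local_size == 0, "local size must divide global size"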
def build_kernel(self, src):
    self.program = cl.Program(self.context, src).build(self.compiler_args)
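# A hedged usage sketch (not from the original source) of the same pattern
# outside the class: compile OpenCL C with explicit compiler arguments.
# The kernel and the option string are illustrative only.
import pyopencl as cl

ctx = cl.create_some_context()
src = "__kernel void scale(__global float *x, float s) { x[get_global_id(0)] *= s; }"
prg = cl.Program(ctx, src).build(["-cl-fast-relaxed-math"])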
aux_h = np.complex64(1 + 1j * 1)
RES_h = np.empty_like(X1_h)

dados_h = []
for i in range(3):
    dados_h.append(
        np.array([X1_h[i], X2_h[i], X3_h[i],
                  Y1_h[i], Y2_h[i], Y3_h[i]]).astype(np.complex64))
dados_h = np.array(dados_h).astype(np.complex64)
print dados_h

aux_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=aux_h)
dados_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados_h)
RES_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=RES_h)

Source = """
__kernel void soma(__global float2 *dados, __global float2 *res, int rowWidth){
    const int gid_x = get_global_id(0);
    res[gid_x] = dados[gid_x*rowWidth+3];
}
"""

prg = cl.Program(ctx, Source).build()
completeEvent = prg.soma(queue, (M, ), None, dados_d, RES_d, np.int32(6))
completeEvent.wait()
cl.enqueue_copy(queue, RES_h, RES_d)
print "GPU RES"
print RES_h
def predict_opencl_atom(self, X, predict_class=False,
                        single_cpu=conf.SINGLE_CPU,
                        opencl_config=conf.OPENCL_CONFIG):
    ''' PyOpenCL implementation of the iPSM approach
        return: a vector of predictions, each for a row in X
    '''
    print 'predict_opencl_atom() was called'
    try:
        t0 = time.time()
        c_evs = np.int32(self.__tileRasterReader.nbands)
        # standard deviation of each variable (over the whole study area)
        Std_evs = self.__tileRasterReader.statistics[:, 3]
        SD_evs = Std_evs.reshape(c_evs).astype(np.float32)

        r, c = np.shape(X)
        nrows_X = np.int32(r)
        ncols_X = np.int32(c)
        X = X.reshape(nrows_X * ncols_X).astype(np.float32)

        MSRLEVES = self.__tileRasterReader.measurement_level_ints.reshape(c_evs).astype(np.int32)

        if not self.__samples_stats_collected:
            samples_X = self.__soilsamples.covariates_at_points.T
            nrows_samples = np.int32(samples_X.shape[1])
            self.__nrows_samples = nrows_samples

            samples_SD_evs = np.zeros((nrows_samples, c_evs))
            AVG_evs = self.__tileRasterReader.statistics[:, 2]
            for i in range(nrows_samples):
                delta = samples_X[:, i].T - AVG_evs
                tmp = Std_evs**2 + delta**2
                samples_SD_evs[i] = np.sqrt(tmp)
            self.__samples_SD_evs = np.array(samples_SD_evs).reshape(nrows_samples * c_evs).astype(np.float32)

            self.__samples_X = np.array(samples_X).T.reshape(nrows_samples * c_evs).astype(np.float32)
            # sample weights
            self.__sample_weights = self.__soilsamples.weights.reshape(nrows_samples).astype(np.float32)
            # sample attributes
            self.__sample_attributes = self.__soilsamples.attributes.reshape(nrows_samples).astype(np.float32)
            self.__samples_stats_collected = True

        # hold predictions for instances in X
        X_predictions = np.zeros(nrows_X).astype(np.float32)
        # hold prediction uncertainties for instances in X
        X_uncertainties = np.zeros(nrows_X).astype(np.float32)

        print 'preparation on HOST took', time.time() - t0, 's'

        ##### config computing platform and device
        for platform in cl.get_platforms():
            #print platform.name
            if platform.name == conf.OPENCL_CONFIG['Platform']:
                PLATFORM = platform
                # Print each device per-platform
                for device in platform.get_devices():
                    #print device.name
                    if device.name == conf.OPENCL_CONFIG['Device']:
                        DEVICE = device
                        break
        # opencl context
        ctx = cl.Context([DEVICE])
        # opencl command queue
        queue = cl.CommandQueue(ctx)

        ##### allocate memory space on device
        mf = cl.mem_flags
        t0 = time.time()
        #evs_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=evs)
        SD_evs_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=SD_evs)
        X_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=X)
        MSRLEVES_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=MSRLEVES)
        sample_X_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__samples_X)
        ## added 09/06/2017
        samples_SD_evs_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__samples_SD_evs)
        sample_weights_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__sample_weights)
        sample_attributes_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__sample_attributes)
        X_predictions_g = cl.Buffer(ctx, mf.WRITE_ONLY, X_predictions.nbytes)
        X_uncertainties_g = cl.Buffer(ctx, mf.WRITE_ONLY, X_uncertainties.nbytes)
        queue.finish()
        t1 = time.time() - t0
        conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(t1)
        print 'allocate and copy from HOST to DEVICE took', t1, 's'

        X = None

        ##### build opencl kernel from code in the file
        f = open(conf.iPSM_KERNEL_FN, 'r')
        fstr = "".join(f.readlines())
        fstr = fstr.replace("#define N_SAMPLES 100",
                            "#define N_SAMPLES " + str(self.__nrows_samples))
        prg = cl.Program(ctx, fstr).build()

        ##### opencl computation
        threshold = np.float32(self.__uncthreshold)
        if predict_class:
            mode = np.int32(1)
        else:
            mode = np.int32(0)
        print X_predictions.shape

        ## improved version, 09/06/2017
        if not single_cpu:
            t0 = time.time()
            completeEvent = prg.iPSM_Predict(
                queue, X_predictions.shape, None,
                nrows_X, ncols_X, self.__nrows_samples, mode,
                threshold, MSRLEVES_g, samples_SD_evs_g, SD_evs_g,
                X_g, sample_X_g, sample_weights_g, sample_attributes_g,
                X_predictions_g, X_uncertainties_g)
            queue.finish()
            t1 = time.time() - t0
            conf.TIME_KEEPING_DICT['parts']['compute'].append(t1)
            print 'kernel took', t1, 's'
        ## added on Oct. 7, 2018 [sequential version - CPU]
        else:
            print 'SINGLE_CPU iPSM.predict_opencl() called'
            t0 = time.time()
            completeEvent = prg.iPSM_Predict_Sequential(
                queue, (1,), (1,),
                nrows_X, ncols_X, self.__nrows_samples, mode,
                threshold, MSRLEVES_g, samples_SD_evs_g, SD_evs_g,
                X_g, sample_X_g, sample_weights_g, sample_attributes_g,
                X_predictions_g, X_uncertainties_g)
            queue.finish()
            t1 = time.time() - t0
            conf.TIME_KEEPING_DICT['parts']['compute'].append(t1)
            print 'kernel took', t1, 's'

        #### wait until completion
        events = [completeEvent]
        queue.finish()
        print 'up to events finished kernel took', time.time() - t0, 's'

        ##### copy result data
        t0 = time.time()
        cl.enqueue_copy(queue, X_predictions, X_predictions_g, wait_for=events)
        cl.enqueue_copy(queue, X_uncertainties, X_uncertainties_g)
        queue.finish()
        t1 = time.time() - t0
        conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(t1)
        print 'copy from DEVICE to HOST took', t1, 's'

        y = np.vstack((X_predictions, X_uncertainties)).T
        return y
    except Exception as e:
        raise
program = False
try:
    import numpy
    import pyopencl as cl

    hash_dt = numpy.dtype([('target', numpy.uint64), ('v', numpy.str_, 73)])
    gpus = []
    for platform in cl.get_platforms():
        gpus.extend(platform.get_devices(device_type=cl.device_type.GPU))
    if (len(gpus) > 0):
        ctx = cl.Context(devices=gpus)
        queue = cl.CommandQueue(ctx)
        full_path = os.path.dirname(os.path.realpath(__file__))
        f = open(os.path.join(full_path, "bitmsghash", 'bitmsghash.cl'), 'r')
        fstr = ''.join(f.readlines())
        program = cl.Program(ctx, fstr).build(options="")
    else:
        print "No OpenCL GPUs found"
        ctx = False
except Exception as e:
    print "opencl fail: " + str(e)
    ctx = False

def has_opencl():
    return (ctx != False)

def do_opencl_pow(hash, target):
    output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if (ctx == False):
        return output[0][0]
def compile(self, bufferStructsObj, library_file, footer_file=None, N=15,
            invMemoryDensity=2):
    assert type(N) == int
    assert N < 20, "N >= 20 won't fit in a single buffer, so is unsupported. " + \
        "Nothing sane should use 20, is this wickr?"
    self.N = N
    assert bufferStructsObj is not None, "need to supply a bufferStructsObj : set all to 0 if necessary"
    assert bufferStructsObj.code is not None, "bufferStructsObj should be initialised"
    bufStructs = bufferStructsObj
    self.wordSize = bufStructs.wordSize

    # set the np word type, for use in .run
    npType = {
        4: np.uint32,
        8: np.uint64,
    }
    self.wordType = npType[self.wordSize]

    if footer_file is not None:
        src = bufStructs.code
    else:
        src = ""

    if library_file:
        with open(os.path.join(current_dir, "worker", "generic",
                               library_file), "r") as rf:
            src += rf.read()

    if footer_file:
        with open(os.path.join(current_dir, "worker", "generic",
                               footer_file), "r") as rf:
            src += rf.read()

    # Standardise to using no \r's, move to bytes to stop trickery
    src = src.encode("ascii")
    src = src.replace(b"\r\n", b"\n")

    # Debugging
    if self.write_combined_file:
        with open("combined_" + library_file, "wb") as wf:
            wf.write(src)

    # Convert back to text!
    src = src.decode("ascii")

    # Check that it starts with 2 newlines, for adding our defines
    if src.startswith("\n\n"):
        src = "\n\n" + src
        src = src[len("\n\n"):]

    # Prepend define N and invMemoryDensity
    defines = "#define N {}\n#define invMemoryDensity {}\n".format(
        N, invMemoryDensity)
    src = defines + src

    # Kernel function instantiation. Build returns self.
    prg = cl.Program(self.ctx, src).build()
    return prg
def initialise_opencl_object(self,
                             program_src='',
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_threshold=None,
                             transpose_block_dim=16,
                             size_heuristics=[],
                             required_types=[],
                             all_sizes={},
                             user_sizes={}):
    self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
    self.queue = cl.CommandQueue(self.ctx)
    self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
    # XXX: Assuming just a single device here.
    self.platform = self.ctx.get_info(cl.context_info.DEVICES)[0].platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
    device_type = self.device.type

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0
    self.free_list = {}

    default_sizes = apply_size_heuristics(
        self, size_heuristics, {
            'group_size': default_group_size,
            'tile_size': default_tile_size,
            'num_groups': default_num_groups,
            'lockstep_width': None,
            'threshold': default_threshold
        })
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        sys.stderr.write(
            'Note: Device limits group size to {} (down from {})\n'.format(
                max_group_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        sys.stderr.write(
            'Note: Device limits tile size to {} (down from {})\n'.format(
                max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}'.format(k))

    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size  # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'] == 'threshold':
            max_value = None
            default_value = default_threshold
        else:
            raise Exception('Unknown size class for size \'{}\': {}'.format(
                k, v['class']))
        if v['value'] is None:
            self.sizes[k] = default_value
        elif max_value is not None and v['value'] > max_value:
            sys.stderr.write(
                'Note: Device limits {} to {} (down from {})\n'.format(
                    k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    if (len(program_src) >= 0):
        return cl.Program(self.ctx, program_src).build(
            ["-DFUT_BLOCK_DIM={}".format(transpose_block_dim),
             "-DLOCKSTEP_WIDTH={}".format(lockstep_width)]
            + ["-D{}={}".format(s, v) for (s, v) in self.sizes.items()])
#print(s)
#exit()

hs = np.empty(nsamp, dtype=np.uint)  # Distribution of sorted indexes to new genome
hs.fill(0)
for x in range(0, len(s) - 1):
    sx = np.arange(s[x], s[x + 1]).astype(np.uint)
    for sxi in sx:
        if sxi < len(hs):
            hs[sxi] = x
print("hs == ", hs)

defines = \
    "#define nvarsd " + str(nvarsd) + "\n" + \
    "#define nvarsg " + str(nvarsg) + "\n" + \
    "#define ninpt " + str(ninpt) + "\n\n"

kernels = genn.genkern2(tosumr, topology, lambda x: cl.Program(ctx, x).build())
print(kernels)

#uint hs["""+str(len(hs))+"""] = {"""+", ".join([str(hh) for hh in hs])+"""}; //Indexes for allocate cutted population to full
prsrc = """
__kernel void copy_inp(__global float *inpt, __global float *dnr){
    uint gid = get_global_id(0);
    dnr[gid] = inpt[gid];
}

__kernel void replicate_mutate(__global float *_gms, __global float *_tmpgms,
                               __global uint *srt_idxs, __global float *res_g,
                               __global float *_rnd, __global uint *_nvarsg,
                               __global uint *_shiftsg, __constant uint *hs)
{
    uint gid = get_global_id(0);
    uint h = hs[gid];
def __init__(self, queue, num_work_items=None,
             luxury=None, seed=None, no_warmup=False,
             use_legacy_init=False, max_work_items=None):
    """
    :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
    :param luxury: the "luxury value" of the generator, and should be 0-4,
        where 0 is fastest and 4 produces the best numbers. It can also be
        >=24, in which case it directly sets the p-value of RANLUXCL.
    :param num_work_items: is the number of generators to initialize,
        usually corresponding to the number of work-items in the NDRange
        RANLUXCL will be used with. May be `None`, in which case a default
        value is used.
    :param max_work_items: should reflect the maximum number of work-items
        that will be used on any parallel instance of RANLUXCL. So for
        instance if we are launching 5120 work-items on GPU1 and 10240
        work-items on GPU2, GPU1's RANLUXCLTab would be generated by calling
        ranluxcl_initialization with numWorkitems = 5120 while GPU2's
        RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems
        must be at least 10240 for both GPU1 and GPU2, and it must be set
        to the same value for both. (may be `None`)

    .. versionchanged:: 2013.1
        Added default value for `num_work_items`.
    """
    from warnings import warn
    warn("Ranlux random number generation is deprecated and will go away "
         "in 2022.", DeprecationWarning, stacklevel=2)

    if luxury is None:
        luxury = 4

    if num_work_items is None:
        if queue.device.type & cl.device_type.CPU:
            num_work_items = 8 * queue.device.max_compute_units
        else:
            num_work_items = 64 * queue.device.max_compute_units

    if seed is None:
        from time import time
        seed = int(time() * 1e6) % 2 << 30

    self.context = queue.context
    self.luxury = luxury
    self.num_work_items = num_work_items

    from pyopencl.characterize import has_double_support
    self.support_double = has_double_support(queue.device)

    self.no_warmup = no_warmup
    self.use_legacy_init = use_legacy_init
    self.max_work_items = max_work_items

    src = """
        %(defines)s

        #include <pyopencl-ranluxcl.cl>

        kernel void init_ranlux(unsigned seeds,
            global ranluxcl_state_t *ranluxcltab)
        {
            if (get_global_id(0) < %(num_work_items)d)
                ranluxcl_initialization(seeds, ranluxcltab);
        }
        """ % {
            "defines": self.generate_settings_defines(),
            "num_work_items": num_work_items
        }
    prg = cl.Program(queue.context, src).build()

    # {{{ compute work group size

    wg_size = None

    import sys
    import platform
    if ("darwin" in sys.platform
            and "Apple" in queue.device.platform.vendor
            and platform.mac_ver()[0].startswith("10.7")
            and queue.device.type & cl.device_type.CPU):
        wg_size = (1, )

    self.wg_size = wg_size

    # }}}

    self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
    self.state.fill(17)

    prg.init_ranlux(queue, (num_work_items, ), self.wg_size,
                    np.uint32(seed), self.state.data)
def get_gen_kernel(self, dtype, distribution):
    size_multiplier = 1
    arg_dtype = dtype

    rng_key = (distribution, dtype)

    if rng_key in [("uniform", np.float64), ("normal", np.float64)]:
        c_type = "double"
        scale1_const = "((double) %r)" % (1 / 2**32)
        scale2_const = "((double) %r)" % (1 / 2**64)
        if distribution == "normal":
            transform = "box_muller"
        else:
            transform = ""

        rng_expr = ("shift + scale * "
                    "%s( %s * convert_double4(gen)"
                    "+ %s * convert_double4(gen))"
                    % (transform, scale1_const, scale2_const))
        counter_multiplier = 2

    elif rng_key in [(dist, cmp_dtype)
                     for dist in ["normal", "uniform"]
                     for cmp_dtype in [
                         np.float32,
                         cltypes.float2,
                         cltypes.float3,
                         cltypes.float4,
                     ]]:
        c_type = "float"
        scale_const = "((float) %r)" % (1 / 2**32)
        if distribution == "normal":
            transform = "box_muller"
        else:
            transform = ""

        rng_expr = ("shift + scale * %s(%s * convert_float4(gen))"
                    % (transform, scale_const))
        counter_multiplier = 1
        arg_dtype = np.float32
        try:
            _, size_multiplier = cltypes.vec_type_to_scalar_and_count[dtype]
        except KeyError:
            pass

    elif rng_key == ("uniform", np.int32):
        c_type = "int"
        rng_expr = ("shift + convert_int4((convert_long4(gen) * scale) / %s)"
                    % (str(2**32) + "l"))
        counter_multiplier = 1

    elif rng_key == ("uniform", np.int64):
        c_type = "long"
        rng_expr = ("shift"
                    "+ convert_long4(gen) * (scale/two32) "
                    "+ ((convert_long4(gen) * scale) / two32)"
                    .replace("two32", (str(2**32) + "l")))
        counter_multiplier = 2

    else:
        raise TypeError(
            "unsupported RNG distribution/data type combination '%s/%s'"
            % rng_key)

    kernel_name = f"rng_gen_{self.generator_name}_{distribution}"
    src = """//CL//
        #include <{header_name}>

        #ifndef M_PI
        #ifdef M_PI_F
        #define M_PI M_PI_F
        #else
        #define M_PI 3.14159265359f
        #endif
        #endif

        typedef {output_t} output_t;
        typedef {output_t}4 output_vec_t;
        typedef {gen_name}_ctr_t ctr_t;
        typedef {gen_name}_key_t key_t;

        uint4 gen_bits(key_t *key, ctr_t *ctr)
        {{
            union {{
                ctr_t ctr_el;
                uint4 vec_el;
            }} u;

            u.ctr_el = {gen_name}(*ctr, *key);
            if (++ctr->v[0] == 0)
                if (++ctr->v[1] == 0)
                    ++ctr->v[2];

            return u.vec_el;
        }}

        #if {include_box_muller}
        output_vec_t box_muller(output_vec_t x)
        {{
            #define BOX_MULLER(I, COMPA, COMPB) \
                output_t r##I = sqrt(-2*log(x.COMPA)); \
                output_t c##I; \
                output_t s##I = sincos((output_t) (2*M_PI) * x.COMPB, &c##I);

            BOX_MULLER(0, x, y);
            BOX_MULLER(1, z, w);
            return (output_vec_t) (r0*c0, r0*s0, r1*c1, r1*s1);
        }}
        #endif

        #define GET_RANDOM_NUM(gen) {rng_expr}

        kernel void {kernel_name}(
            int k1,
            #if {key_length} > 2
            int k2, int k3,
            #endif
            int c0, int c1, int c2, int c3,
            global output_t *output,
            long out_size,
            output_t scale,
            output_t shift)
        {{
            #if {key_length} == 2
            key_t k = {{{{get_global_id(0), k1}}}};
            #else
            key_t k = {{{{get_global_id(0), k1, k2, k3}}}};
            #endif

            ctr_t c = {{{{c0, c1, c2, c3}}}};

            // output bulk
            unsigned long idx = get_global_id(0)*4;
            while (idx + 4 < out_size)
            {{
                output_vec_t ran = GET_RANDOM_NUM(gen_bits(&k, &c));
                vstore4(ran, 0, &output[idx]);
                idx += 4*get_global_size(0);
            }}

            // output tail
            output_vec_t tail_ran = GET_RANDOM_NUM(gen_bits(&k, &c));
            if (idx < out_size)
                output[idx] = tail_ran.x;
            if (idx+1 < out_size)
                output[idx+1] = tail_ran.y;
            if (idx+2 < out_size)
                output[idx+2] = tail_ran.z;
            if (idx+3 < out_size)
                output[idx+3] = tail_ran.w;
        }}
        """.format(kernel_name=kernel_name,
                   gen_name=self.generator_name,
                   header_name=self.header_name,
                   output_t=c_type,
                   key_length=self.key_length,
                   include_box_muller=int(distribution == "normal"),
                   rng_expr=rng_expr)

    prg = cl.Program(self.context, src).build()
    knl = getattr(prg, kernel_name)
    knl.set_scalar_arg_dtypes(
        [np.int32] * (self.key_length - 1 + 4)
        + [None, np.int64, arg_dtype, arg_dtype])

    return knl, counter_multiplier, size_multiplier
def get_gen_kernel(self, dtype, distribution="uniform"):
    size_multiplier = 1
    arg_dtype = dtype

    if dtype == np.float64:
        bits = 64
        c_type = "double"
        rng_expr = "(shift + scale * gen)"
    elif dtype == np.float32:
        bits = 32
        c_type = "float"
        rng_expr = "(shift + scale * gen)"
    elif dtype == cltypes.float2:
        bits = 32
        c_type = "float"
        rng_expr = "(shift + scale * gen)"
        size_multiplier = 2
        arg_dtype = np.float32
    elif dtype in [cltypes.float3, cltypes.float4]:
        bits = 32
        c_type = "float"
        rng_expr = "(shift + scale * gen)"
        size_multiplier = 4
        arg_dtype = np.float32
    elif dtype == np.int32:
        assert distribution == "uniform"
        bits = 32
        c_type = "int"
        rng_expr = ("(shift "
                    "+ convert_int4((float) scale * gen) "
                    "+ convert_int4(((float) scale / (1<<24)) * gen))")
    elif dtype == np.int64:
        assert distribution == "uniform"
        if self.support_double:
            bits = 64
        else:
            bits = 32
        c_type = "long"
        rng_expr = ("(shift "
                    "+ convert_long4((float) scale * gen) "
                    "+ convert_long4(((float) scale / (1l<<24)) * gen)"
                    "+ convert_long4(((float) scale / (1l<<48)) * gen)"
                    ")")
    else:
        raise TypeError("unsupported RNG data type '%s'" % dtype)

    rl_flavor = "%d%s" % (bits, {
        "uniform": "",
        "normal": "norm"
    }[distribution])

    src = """//CL//
        %(defines)s

        #include <pyopencl-ranluxcl.cl>

        typedef %(output_t)s output_t;
        typedef %(output_t)s4 output_vec_t;
        #define NUM_WORKITEMS %(num_work_items)d
        #define RANLUX_FUNC ranluxcl%(rlflavor)s
        #define GET_RANDOM_NUM(gen) %(rng_expr)s

        kernel void generate(
            global ranluxcl_state_t *ranluxcltab,
            global output_t *output,
            unsigned long out_size,
            output_t scale,
            output_t shift)
        {
            ranluxcl_state_t ranluxclstate;
            ranluxcl_download_seed(&ranluxclstate, ranluxcltab);

            // output bulk
            unsigned long idx = get_global_id(0)*4;
            while (idx + 4 < out_size)
            {
                output_vec_t ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate));
                vstore4(ran, 0, &output[idx]);
                idx += 4*NUM_WORKITEMS;
            }

            // output tail
            output_vec_t tail_ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate));
            if (idx < out_size)
                output[idx] = tail_ran.x;
            if (idx+1 < out_size)
                output[idx+1] = tail_ran.y;
            if (idx+2 < out_size)
                output[idx+2] = tail_ran.z;
            if (idx+3 < out_size)
                output[idx+3] = tail_ran.w;

            ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
        }
        """ % {
            "defines": self.generate_settings_defines(),
            "rlflavor": rl_flavor,
            "output_t": c_type,
            "num_work_items": self.num_work_items,
            "rng_expr": rng_expr
        }

    prg = cl.Program(self.context, src).build()
    knl = prg.generate
    knl.set_scalar_arg_dtypes([None, None, np.uint64, arg_dtype, arg_dtype])

    return knl, size_multiplier
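# A hedged usage sketch (not in the original source): the `generate` kernel
# above is what RanluxGenerator.fill_uniform() enqueues under the hood.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
import pyopencl.clrandom as cl_random

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
gen = cl_random.RanluxGenerator(queue, luxury=1)
samples = cl_array.empty(queue, 10000, dtype=np.float32)
gen.fill_uniform(samples)  # uniform values in [0, 1)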
class GLCharacter:
    '''GLCharacter is a version of GLMeshes that supports blend skinning.
    TODO it hasn't been properly integrated yet.'''

    def __init__(self, names, verts, faces, bones=None, transforms=None,
                 drawStyle='smooth', colour=[0.9, 0.9, 0.9, 1.0],
                 vts=None, fts=None, visible=True):
        self.selectedIndex = -1
        self.numGeos = len(names)
        self.visible = visible
        self.gvs = None
        self.boneIndices = None
        self.pose = None
        assert self.numGeos == len(verts), 'Non-matching parameter lists.'
        assert self.numGeos == len(faces), 'Non-matching parameter lists.'
        if transforms is None: transforms = [None] * self.numGeos
        if bones is None: bones = [None] * self.numGeos
        if vts is None: vts = [None] * self.numGeos
        if fts is None: fts = [None] * self.numGeos
        self.transforms = np.zeros((self.numGeos, 4, 4), dtype=np.float32)
        vs, VTs, es, bs, tris, vtis, vs_mapping, vts_mapping = [], [], [], [], [], [], [], []
        vsplits, esplits, tsplits, bsplits = [0], [0], [0], [0]
        vs_in_total = 0
        for i, (v, f, b, t, vt, ft) in enumerate(
                zip(verts, faces, bones, transforms, vts, fts)):
            #print i,names[i]
            voffset = len(vs)
            vt_indices = v_indices = np.arange(len(v), dtype=np.int32)
            if ft is not None:
                f_flat = [x for y in f for x in y]
                ft_flat = [x for y in ft for x in y]
                s = list(set(zip(f_flat, ft_flat)))
                d = dict(zip(s, range(len(s))))
                v_indices, vt_indices = np.array(zip(*s), dtype=np.int32)
                f = [np.array([d[x] for x in zip(*y)], dtype=np.int32)
                     for y in zip(f, ft)]
            vs_mapping.extend(v_indices + vs_in_total)
            vts_mapping.extend(vt_indices + vs_in_total)
            vs_in_total += len(v)
            v = np.array(v, dtype=np.float32).reshape(-1, 3)[v_indices]
            if vt is None:
                vt = np.zeros((len(vt_indices), 2), dtype=np.float32)  # TODO missing verts
            vt = np.array(vt, dtype=np.float32).reshape(-1, 2)[vt_indices]
            vs.extend(v)  # TODO is this slow? faster to use np.concatenate?
            VTs.extend(vt)
            vtis.extend([i] * len(v))
            if b is not None:
                bs.extend(np.array(b, dtype=np.int32).reshape(-1, 2) + voffset)
            self.transforms[i] = np.eye(4)
            if t is not None:
                self.transforms[i, :, :3] = t.T
            if len(f) == 2 and f[1][0] == 0:
                # assume this is faces and splits
                f0 = np.array(f[0], dtype=np.int32) + voffset
                for c0, c1 in zip(f[1][:-1], f[1][1:]):
                    fc = f0[c0:c1]
                    es.append((fc[-1], fc[0]))
                    es.append((fc[0], fc[1]))
                    for fi in xrange(2, len(fc)):
                        tris.append((fc[0], fc[fi - 1], fc[fi]))
                        es.append((fc[fi - 1], fc[fi]))
            else:
                try:
                    # see if the mesh is regular
                    fr = np.array(f, dtype=np.int32) + voffset  # will fail if not rectangular ints
                    numFaces, faceSize = fr.shape  # will fail if not size 2
                    e = np.zeros((numFaces, faceSize, 2), dtype=np.int32)
                    t = np.zeros((numFaces, faceSize - 2, 3), dtype=np.int32)
                    e[:, 0, 0] = fr[:, -1]
                    e[:, 0, 1] = fr[:, 0]
                    e[:, 1, 0] = fr[:, 0]
                    e[:, 1, 1] = fr[:, 1]
                    t[:, :, 0] = fr[:, 0].reshape(-1, 1)
                    for fi in xrange(2, faceSize):
                        e[:, fi, 0] = fr[:, fi - 1]
                        e[:, fi, 1] = fr[:, fi]
                        t[:, fi - 2, 1] = fr[:, fi - 1]
                        t[:, fi - 2, 2] = fr[:, fi]
                    e = e.reshape(-1, 2)
                    t = t.reshape(-1, 3)
                    es.extend(e)
                    tris.extend(t)
                except Exception, e:
                    for fc in f:
                        fc = np.array(fc, dtype=np.int32) + voffset
                        es.append((fc[-1], fc[0]))
                        es.append((fc[0], fc[1]))
                        for fi in xrange(2, len(fc)):
                            tris.append((fc[0], fc[fi - 1], fc[fi]))
                            es.append((fc[fi - 1], fc[fi]))
            vsplits.append(len(vs))
            esplits.append(len(es))
            bsplits.append(len(bs))
            tsplits.append(len(tris))
        self.vsplits = np.array(vsplits, dtype=np.int32)
        self.esplits = np.array(esplits, dtype=np.int32)
        self.bsplits = np.array(bsplits, dtype=np.int32)
        self.tsplits = np.array(tsplits, dtype=np.int32)
        self.vs_mapping = np.array(vs_mapping, dtype=np.int32)
        self.vts_mapping = np.array(vts_mapping, dtype=np.int32)
        self.names = names
        vs = np.array(vs, dtype=np.float32).reshape(-1, 3)
        tris = np.array(tris, dtype=np.int32).reshape(-1, 3)
        edges = np.array(es, dtype=np.int32).reshape(-1, 2)
        bones = np.array(bs, dtype=np.int32).reshape(-1, 2)
        VTs = np.array(VTs, dtype=np.float32).reshape(-1, 2)
        #print 'lens',len(vs), len(tris), len(edges), (np.min(tris),np.max(tris)) if len(tris) else 'None', (np.min(edges), np.max(edges)) if len(edges) else 'None', (np.min(bones), np.max(bones)) if len(bones) else 'None'
        self.num_in_verts = vs_in_total
        self.vs = vbo.VBO(vs, usage='GL_STATIC_DRAW_ARB')
        self.tris = vbo.VBO(tris, target=GL.GL_ELEMENT_ARRAY_BUFFER, usage='GL_STATIC_DRAW_ARB')
        self.edges = vbo.VBO(edges, target=GL.GL_ELEMENT_ARRAY_BUFFER, usage='GL_STATIC_DRAW_ARB')
        self.bones = vbo.VBO(bones, target=GL.GL_ELEMENT_ARRAY_BUFFER, usage='GL_STATIC_DRAW_ARB')
        self.vtis = vbo.VBO(np.array(vtis, dtype=np.int32), usage='GL_STATIC_DRAW_ARB')
        assert len(vtis) == len(vs)
        self.vts, self.vns = None, None  # TODO, deal with input textures and normals
        if vts is not None:
            self.vts = vbo.VBO(VTs, usage='GL_STATIC_DRAW_ARB')
        #if vns is not None: self.vns = vbo.VBO(np.array(vns,dtype=np.float32), usage='GL_STATIC_DRAW_ARB')
        self.drawStyle = drawStyle  # 'wire','smooth','wire_over_smooth'
        self.colour = colour
        self.image, self.bindImage, self.bindId = None, None, None
        self.GL_is_initialised = False
        global CL_ctx, CL_queue
        if CL_ctx is None:
            CL_ctx = cl.create_some_context(False)
            CL_queue = cl.CommandQueue(CL_ctx)
        self.cl_prg = cl.Program(CL_ctx, '''
        __kernel void compute_normals(__global const float *xs_g,
                                      __global const int *edgeList_g,
                                      __global float *res_g)
        {
            const int gid = get_global_id(0);
            const int g10 = gid*10;
            const int g3 = gid*3;
            float sx=0,sy=0,sz=0;
            const float x=xs_g[g3],y=xs_g[g3+1],z=xs_g[g3+2];
            int e3 = edgeList_g[g10]*3;
            float ex0 = xs_g[e3]-x, ey0 = xs_g[e3+1]-y, ez0 = xs_g[e3+2]-z;
            for (int i = 1; i < 10; ++i) {
                e3 = edgeList_g[g10+i]*3;
                if (xs_g[e3] > 1e10) continue;
                float ex1 = xs_g[e3]-x, ey1 = xs_g[e3+1]-y, ez1 = xs_g[e3+2]-z;
                sx += ey0*ez1-ey1*ez0;
                sy += ez0*ex1-ez1*ex0;
                sz += ex0*ey1-ex1*ey0;
                ex0=ex1; ey0=ey1; ez0=ez1;
            }
            const float sum = sx*sx+sy*sy+sz*sz;
            if (sum < 1e-8) { sx = 0; sy = 0; sz = 0; }
            else { const float sc = rsqrt(sum); sx *= sc; sy *= sc; sz *= sc; }
            res_g[g3] = sx; res_g[g3+1] = sy; res_g[g3+2] = sz;
        }
        ''').build()
        self.edgeList = self.trianglesToEdgeList(tris, len(vs))
        self.edgeList_g = cl.Buffer(CL_ctx,
                                    cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                                    hostbuf=self.edgeList)
        if self.vns is None:
            vns = self.computeNormalsFromEdgeList(vs)
            self.vns = vbo.VBO(np.array(vns, dtype=np.float32), usage='GL_STATIC_DRAW_ARB')
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

prg = cl.Program(ctx, """
    __kernel void sum(__global const float *a,
                      __global const float *b,
                      __global float *c)
    {
        int gid = get_global_id(0);
        float a_temp;
        float b_temp;
        float c_temp;
        a_temp = a[gid];                    // my a element (by global ref)
        b_temp = b[gid];                    // my b element (by global ref)
        c_temp = a_temp + b_temp;           // sum of my elements
        c_temp = c_temp * c_temp;           // square of the sum
        c_temp = c_temp * (a_temp / 2.0f);  // times 1/2 my a
        c[gid] = c_temp;                    // store result in global memory
    }
    """).build()

global_size = (data_points, )
local_size = (workers, )
preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info(
    cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
    device)
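# A hedged completion sketch (not part of the original source): launch the
# kernel with the sizes defined above and read the result back. It assumes
# the host arrays `a` and `b` behind a_buf/b_buf are numpy float32 arrays
# and that `workers` evenly divides `data_points`.
import numpy as np

c = np.empty_like(b)
prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf)
cl.enqueue_copy(queue, c, dest_buf)
# Each element should equal ((a+b)^2) * (a/2), matching the kernel body.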
import pyopencl as cl  # Import the OpenCL GPU computing API
import pyopencl.array as pycl_array  # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object)
import numpy as np  # Import Numpy number tools
from time import time

context = cl.create_some_context()  # Initialize the Context
queue = cl.CommandQueue(context)  # Instantiate a Queue

# The kernel below operates on float, so the host arrays must be float32.
a1 = np.random.rand(50000).astype(np.float32)
b1 = np.random.rand(50000).astype(np.float32)
a = pycl_array.to_device(queue, a1)
b = pycl_array.to_device(queue, b1)  # Create two random pyopencl arrays
c = pycl_array.empty_like(a)  # Create an empty pyopencl destination array

program = cl.Program(context, """
__kernel void sum(__global const float *a, __global const float *b, __global float *c)
{
    int i = get_global_id(0);
    c[i] = a[i] + b[i];
}""").build()  # Create the OpenCL program

time1 = time()
# Enqueue the program for execution and store the result in c
program.sum(queue, a.shape, None, a.data, b.data, c.data)

# Print all three arrays, to show sum() worked
print("a: {}".format(a))
print("b: {}".format(b))
print("c: {}".format(c))

time2 = time()
print("OpenCL: ", time2 - time1, "s")
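# A hedged alternative (not in the original source): time the kernel with
# OpenCL profiling events instead of host wall-clock time, which also
# excludes the cost of the host-side prints above.
queue_prof = cl.CommandQueue(
    context, properties=cl.command_queue_properties.PROFILING_ENABLE)
evt = program.sum(queue_prof, a.shape, None, a.data, b.data, c.data)
evt.wait()
print("kernel time:", (evt.profile.end - evt.profile.start) * 1e-9, "s")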
""" # # OpenCL setup. # kernel_code = pyOpenCLNCS.loadNCSKernel() + kernel_code # Create context and command queue platform = cl.get_platforms()[0] devices = platform.get_devices() context = cl.Context(devices) queue = cl.CommandQueue( context, properties=cl.command_queue_properties.PROFILING_ENABLE) # Open program file and build program = cl.Program(context, kernel_code) try: program.build() except: print("Build log:") print(program.get_build_info(devices[0], cl.program_build_info.LOG)) raise n_pts = 256 def test_veccopy(): v1 = numpy.zeros(n_pts, dtype=numpy.float32) v2 = numpy.random.uniform(low=1.0, high=10.0, size=n_pts).astype(dtype=numpy.float32)
prg = cl.Program(ctx, """ inline uint popcnt(const uint i) { uint n; asm("popc.b32 %0, %1;" : "=r"(n) : "r" (i)); return n; } inline uint ballot(const uint i) { uint n; asm( "{\\n\\t" ".reg .pred %%p<1>;\\n\\t" "setp.ne.u32 %%p1, %1, 0;\\n\\t" "vote.ballot.b32 %0, %%p1;\\n\\t" "}" : "=r"(n) : "r" (i) ); return n; } __kernel void sum(__global float *a_g, __global unsigned int *b_g) { uint res = 0; asm("mov.u32 %0, %%laneid;" : "=r"(res)); unsigned int res2; // uint comp; res = a_g[0]; // res += 23; // res = res > 37 ? 5 : 99; asm( //".reg .pred %%p<2>;" "mov.u32 %0, %1;" //"add.u32 %0, %0, 7;" //"mov.u32 %0, %%laneid;" // "setp.gt.u32 %%p1, %0, 12;" // "@%%p1 mov.u32 %0, 33;" : "=r"(res2) : "r"(res) ); res2 = a_g[get_global_id(0)] > 0 ? 1 : 0; res = ballot(res2) & 0xffffffff; //res2 = popcnt(res2); b_g[get_global_id(0)] = get_global_id(0) == 31 ? res : res2; // if(get_global_id(0) == 0) { // a_g[31] = res; //} } """).build()
print '---------------------------'

# Create a context with all the devices
devices = platforms[0].get_devices()
context = cl.Context(devices)
print 'This context is associated with ', len(context.devices), 'devices'

# Create a queue for transferring data and launching computations.
# Turn on profiling to allow us to check event times.
queue = cl.CommandQueue(
    context, context.devices[0],
    properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

program = cl.Program(context, open('bilateral.cl').read()).build(options='')

input_image = np.load('image.npz')['image'].astype(np.float32)
#input_image = im.imread('img/cat.png').astype(np.float32)
print "Input image size:", input_image.shape

# use this input to check correctness of index trick
'''
input_image = np.array([[1,1,1,1,1,1,1,1],
                        [2,2,2,2,2,2,2,2],
                        [3,3,3,3,3,3,3,3],
                        [4,4,4,4,4,4,4,4],
                        [5,5,5,5,5,5,5,5],
                        [6,6,6,6,6,6,6,6],
                        [7,7,7,7,7,7,7,7],
                        [8,8,8,8,8,8,8,8],
####################################
ctx = cl.create_some_context()
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
mf = cl.mem_flags

######################
#CREATING I/O BUFFERS#
######################
inp_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_img)
out_buf = cl.Buffer(ctx, mf.WRITE_ONLY, out_cl.nbytes)

##################
#BUILDING PROGRAM#
##################
prg = cl.Program(ctx, kernel).build()

######################
#CALLING THE FUNCTION#
######################
prg.makeCodeBlocks(queue, out_cl.shape, None, inp_buf, out_buf, input_size)

#########################################
#RETRIEVING THE CODEBOOK FROM THE DEVICE#
#########################################
cl.enqueue_copy(queue, out_cl, out_buf)

##################################
#INITIALIZING OUTPUT FOR ENCODING#
##################################
final_scales = np.zeros((input_size // 4, input_size // 4), dtype=np.float32)
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and sizes
    match.)

    .. versionadded: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print c_decl
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print dtype
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    fields = sorted(six.iteritems(dtype.fields),
                    key=lambda name_dtype_offset: name_dtype_offset[1][1])

    c_fields = []
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {\n%s\n} %s;\n\n" % ("\n".join(c_fields), name)

    cdl = _CDeclList(device)
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join(
        "result[%d] = pycl_offsetof(%s, %s);" % (i + 1, name, field_name)
        for i, (field_name, _) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((uint) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global uint *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(pre_decls=pre_decls,
               my_decl=c_decl,
               my_type=name,
               offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1 + len(fields), np.uint32)
    knl(queue, (1, ), (1, ), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    from pytools import any
    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [dtype_and_offset[1]
                       for field_name, dtype_and_offset in fields]
        else:
            raise RuntimeError(
                "OpenCL compiler reported offsetof() past sizeof() "
                "for struct layout on '%s'. "
                "This makes no sense, and it usually indicates a "
                "compiler bug. "
                "Refusing to discover struct layout." % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    try:
        dtype_arg_dict = {
            'names': [field_name
                      for field_name, (field_dtype, offset) in fields],
            'formats': [field_dtype
                        for field_name, (field_dtype, offset) in fields],
            'offsets': [int(x) for x in offsets],
            'itemsize': int(size_and_offsets[0]),
        }
        dtype = np.dtype(dtype_arg_dict)
        if dtype.itemsize != size_and_offsets[0]:
            # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
            dtype_arg_dict["names"].append("_pycl_size_fixer")
            dtype_arg_dict["formats"].append(np.uint8)
            dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1)
            dtype = np.dtype(dtype_arg_dict)

    except NotImplementedError:
        def calc_field_type():
            total_size = 0
            padding_count = 0
            for offset, (field_name, (field_dtype, _)) in zip(offsets, fields):
                if offset > total_size:
                    padding_count += 1
                    yield ('__pycl_padding%d' % padding_count,
                           'V%d' % (offset - total_size))

                yield field_name, field_dtype
                total_size = field_dtype.itemsize + offset

        dtype = np.dtype(list(calc_field_type()))

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
def clbuild(cl_ctx, prg):
    return cl.Program(cl_ctx, prg).build()
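# A minimal usage sketch for clbuild() (the kernel source here is
# illustrative, not from the original project):
import pyopencl as cl

ctx = cl.create_some_context()
prg = clbuild(ctx, "__kernel void noop(__global float *x) { }")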
def initialize(cls):
    ''' Compile kernels '''
    cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()

    cls.longitudinal_sort_kernel = RadixSort(
        cl_ctx,
        [VectorArg(cl_ftype, "x"), VectorArg(cl_ftype, "px"),
         VectorArg(cl_ftype, "y"), VectorArg(cl_ftype, "py"),
         VectorArg(cl_ftype, "theta"), VectorArg(cl_ftype, "gamma"),
         ScalarArg(cl_ftype, "inv_slice_len")],
        key_expr="(int) floor(theta[i]*inv_slice_len)",
        sort_arg_names=["x", "px", "y", "py", "theta", "gamma"],
        key_dtype=np.int32)

    class LongitudinalTraverseScanKernel(GenericScanKernel):
        ''' Adds a preamble method for the longitudinal traverse sort '''

        def __init__(self, *argl, **argd):
            ''' Patch argd['preamble'] '''
            sort_fun = '''
            int sort_fun(FLOAT_TYPE x, FLOAT_TYPE y, FLOAT_TYPE theta,
                         FLOAT_TYPE inv_slice_len, FLOAT_TYPE inv_traverse_len,
                         int bins)
            {
                FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);
                FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);
                int xbin = (int) floor(xnorm * inv_traverse_len);
                int ybin = (int) floor(ynorm * inv_traverse_len);
                int zbin = (int) floor(theta*inv_slice_len);
                if ((xbin < 0) || (xbin >= bins) || (ybin < 0) || (ybin >= bins)) {
                    xbin = 0;
                    ybin = 0;
                }
                return xbin+bins*(ybin+bins*zbin);
            }
            '''
            new_argd = dict(argd)
            new_argd['preamble'] = F(sort_fun + new_argd['preamble'])
            super().__init__(*argl, **new_argd)

    cls.longitudinal_traverse_sort_kernel = RadixSort(
        cl_ctx,
        [VectorArg(cl_ftype, "x"), VectorArg(cl_ftype, "px"),
         VectorArg(cl_ftype, "y"), VectorArg(cl_ftype, "py"),
         VectorArg(cl_ftype, "theta"), VectorArg(cl_ftype, "gamma"),
         ScalarArg(cl_ftype, "inv_slice_len"),
         ScalarArg(cl_ftype, "inv_traverse_len"),
         ScalarArg(np.int32, "bins")],
        key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
        sort_arg_names=["x", "px", "y", "py", "theta", "gamma"],
        scan_kernel=LongitudinalTraverseScanKernel,
        key_dtype=np.int32)
def mercatorToEquirectangular(src, dest, north, south):
    sh, sw = src.shape
    dh, dw = dest.shape
    src = src.reshape(-1)
    dest = dest.reshape(-1)
    northY = math.log(math.tan(math.pi / 4.0 + math.radians(north) / 2.0))
    southY = math.log(math.tan(math.pi / 4.0 + math.radians(south) / 2.0))

    # the kernel function
    srcCode = """
    static float lerp(float a, float b, float mu) {
        return (b - a) * mu + a;
    }

    static float norm(float value, float a, float b) {
        float n = (value - a) / (b - a);
        if (n > 1.0) { n = 1.0; }
        if (n < 0.0) { n = 0.0; }
        return n;
    }

    __kernel void doProjection(__global uchar *source, __global uchar *dest){
        int sw = %d;
        int sh = %d;
        int dw = %d;
        int dh = %d;
        float north = %f;
        float south = %f;
        float northY = %f;
        float southY = %f;
        float piq = %f;

        // get dest position
        int x = get_global_id(1);
        int y = get_global_id(0);
        int i = y * dw + x;

        // get normalized position
        float nx = (float) x / (float) (dw-1);
        float ny = (float) y / (float) (dh-1);

        // get lat
        float lat = lerp(north, south, ny);

        // convert lon lat from mercator to equirectangular
        float nmy = ny;
        float my = (float) tan(piq + (float) radians(lat) / (float) 2.0);
        if (my > 0) {
            my = log(my);
            nmy = norm(my, northY, southY);
        }

        // get source position
        int sx = (int) round(nx * (float) (sw-1));
        int sy = (int) round(nmy * (float) (sh-1));
        int j = sy * sw + sx;

        // assign pixel
        dest[i] = source[j];
    }
    """ % (sw, sh, dw, dh, north, south, northY, southY, math.pi / 4.0)

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs and len(GPUs) > 0:
        ctx = cl.Context(devices=GPUs)
    else:
        print "Warning: using CPU instead of GPU"
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, srcCode).build()

    bufIn = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=src)
    bufOut = cl.Buffer(ctx, mf.WRITE_ONLY, dest.nbytes)
    prg.doProjection(queue, [dh, dw], None, bufIn, bufOut)

    # Copy result
    cl.enqueue_copy(queue, dest, bufOut)
    dest = dest.reshape(dh, dw)
    return dest
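# A hedged usage sketch (not part of the original source): reproject a
# placeholder 8-bit grayscale Mercator image spanning 85N..85S; shapes and
# latitude bounds are assumptions for illustration only.
import numpy as np

src_img = np.zeros((512, 1024), dtype=np.uint8)
dst_img = np.zeros((512, 1024), dtype=np.uint8)
dst_img = mercatorToEquirectangular(src_img, dst_img, 85.0, -85.0)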
nSmp = 1000
M = np.tile(M, (1, nSmp, 1, 1))
v = np.ones((nSmp, m*3))
y = np.zeros((nSmp, m*3))

# Setup the OpenCL environment.
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])

# Start with the most original one without any optimization.
kernelsource = open("spMV0.cl").read()
program = cl.Program(context, kernelsource).build()
# mmul = program.mmul
# mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None, None, None])

queue = cl.CommandQueue(context)

# localWorkSize = 256
localWorkSize = 64
num_compute_units = device.max_compute_units
globalWorkSize = 8 * num_compute_units * localWorkSize
print('num of compute units {}'.format(num_compute_units))

mem_flags = cl.mem_flags
indptr_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                       hostbuf=indptr)
indices_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                        hostbuf=indices)
matrix_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                       hostbuf=M)
mf = cl.mem_flags
d_pos = cl.array.to_device(queue, pos)
d_preresult = cl.array.empty(queue, (4 * workgroup_size, ), dtype=numpy.float32)
d_minmax = cl.array.empty(queue, (4, ), dtype=numpy.float32)

with open("../openCL/ocl_lut_pixelsplit.cl", "r") as kernelFile:
    kernel_src = kernelFile.read()

compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%e" % \
    (bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)

print(compile_options)

program = cl.Program(ctx, kernel_src).build(options=compile_options)

program.reduce1(queue, (workgroup_size * workgroup_size, ), (workgroup_size, ),
                d_pos.data, numpy.uint32(pos_size), d_preresult.data)

program.reduce2(queue, (workgroup_size, ), (workgroup_size, ),
                d_preresult.data, d_minmax.data)

min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax = (min0, max0, min1, max1)

print(minmax)
print(d_minmax)
    /* Set float data */
    float f = global_id_0 * 10.0f + global_id_1 * 1.0f;
    f += local_id_0 * 0.1f + local_id_1 * 0.01f;

    output[index] = f;
}
'''

# Get device and context, create command queue and program
dev = utility.get_default_device()
context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)

# Build program in the specified context using the kernel source code
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=['-Werror'], devices=[dev])
except:
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Create output buffer
out = np.zeros(shape=(4, 6), dtype=np.float32)
buffer_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=out.nbytes)

# Enqueue kernel (with argument specified directly)
global_offset = (3, 5)
global_size = (6, 4)
local_size = (3, 2)
def calc_errs(data, mask, W, O, pixel_map, n0, m0, dij_n, ss, fs):
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)
    assert (ss.dtype == np.int)
    assert (fs.dtype == np.int)

    import os
    import pyopencl as cl

    ## Step #1. Obtain an OpenCL platform.
    # with a cpu device
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            platform = p
            device = devices[0]
            break

    ## Step #3. Create a context for the selected device.
    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    kernelsource = os.path.join(here, 'update_pixel_map.cl')
    kernelsource = open(kernelsource).read()
    program = cl.Program(context, kernelsource).build()

    translations_err_cl = program.translations_err
    translations_err_cl.set_scalar_arg_dtypes(
        8 * [None] + 2 * [np.float32] + 6 * [np.int32])

    # Get the max work group size for the kernel test on our device
    max_comp = device.max_compute_units
    max_size = translations_err_cl.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    #print('maximum workgroup size:', max_size)
    #print('maximum compute units :', max_comp)

    # allocate local memory and dtype conversion
    ############################################
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)
    ns = np.arange(data.shape[0]).astype(np.int32)

    # outputs:
    dij_nout = dij_n.copy()
    errs = np.empty((len(ss), data.shape[0]), dtype=np.float32)
    out = np.zeros(data.shape[0]).astype(np.float32)

    step = max_comp
    for i in range(len(ss)):
        #for n in tqdm.tqdm(np.arange(ns.shape[0])[::step], desc='updating sample translations'):
        for n in np.arange(ns.shape[0])[::step]:
            nsi = ns[n:n + step:]
            translations_err_cl(queue, (nsi.shape[0], 1), (1, 1),
                                cl.SVM(Win), cl.SVM(data), cl.SVM(Oin),
                                cl.SVM(pixel_mapin), cl.SVM(dij_nin),
                                cl.SVM(maskin), cl.SVM(nsi), cl.SVM(out),
                                n0, m0,
                                data.shape[1], data.shape[2],
                                O.shape[0], O.shape[1],
                                ss[i], fs[i])
            queue.finish()
        errs[i] = out
    return errs
def PolHealpixMapper(dx, nside, ext, obspos, nH, Snu, Bx, By, Bz,
                     GPU=0, y_shear=0.0, maxlos=1e30, minlos=0.,
                     p0=0.2, polred=0):
    """
    Usage:
        I, Q, U, NH = PolHealpixMapper(dx, nside, ext, obspos, nH, Snu, Bx, By, Bz)
    Input:
        dx      = cell size [pc]
        nside   = parameter of the resulting Healpix map (with 12*nside*nside pixels)
        ext     = dust extinction [1/pc/H]
        obspos  = position of the observer [x,y,z], relative to the centre of the model [pc]
        nH      = density values [H], grid of [Nx, Ny, Nz] values
        Snu     = emission/emissivity [MJy/sr/H/pc]
        Bx ...  = magnetic field values [arbitrary units], [Nx, Ny, Nz] values each
        GPU     = if ==1, try to use a GPU instead of a CPU (default=0)
        y_shear = shear in y direction [cells]
        maxlos  = maximum integration length along the LOS [pc]
        p0      = maximum polarisation fraction, default value 0.2
        polred  = (int) if >0, interpret |B| as polarisation fraction;
                  default=0 => (Q,U) calculated for p=100%
    Return:
        I, Q, U, NH = vectors of Healpix pixel values, for the requested
        nside, in RING order.
    Note:
        If y_shear==0.0, integration extends to the distance maxlos or to
        the model boundary, whichever is smaller. If y_shear!=0, integration
        does not stop at X and Y boundaries but only when either MAXLOS or
        +/- Z boundary is reached.
    """
    NZ, NY, NX = nH.shape
    NPIX = 12 * nside * nside
    platform, device, context, queue, mf = InitCL(GPU)
    LOCAL = [8, 32][GPU > 0]
    GLOBAL = NPIX
    if (GLOBAL % LOCAL != 0):
        GLOBAL = ((GLOBAL // 32) + 1) * 32
    source = open("kernel_HP_map.c").read()
    OPT = \
        " -D NZ=%d -D NY=%d -D NX=%d -D NSIDE=%d -D DX=%.5ef -D MAXLOS=%.4ef -D MINLOS=%.4ef -D POLRED=%d -D p0=%.4ef" % \
        (NZ, NY, NX, nside, dx, maxlos/dx, minlos/dx, polred, p0)
    # note -- in kernel [maxlos]=GL, not pc
    program_map = cl.Program(context, source).build(OPT)
    kernel_map = program_map.PolHealpixMapping
    kernel_map.set_scalar_arg_dtypes([
        np.float32, clarray.cltypes.float3,
        None, None, None, None, None, None,
        np.float32
    ])
    DENS_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    EMIT_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    Bx_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    By_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    Bz_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    MAP_buf = cl.Buffer(context, mf.WRITE_ONLY, 4 * 4 * NPIX)  # space for (I, Q, U, NH)
    #
    cl.enqueue_copy(queue, DENS_buf, np.asarray(nH, np.float32))
    cl.enqueue_copy(queue, EMIT_buf, np.asarray(Snu, np.float32))
    cl.enqueue_copy(queue, Bx_buf, np.asarray(Bx, np.float32))
    cl.enqueue_copy(queue, By_buf, np.asarray(By, np.float32))
    cl.enqueue_copy(queue, Bz_buf, np.asarray(Bz, np.float32))
    opos = clarray.vec.make_float3(obspos[0], obspos[1], obspos[2])
    extGL = ext * dx  # extinction per grid unit instead of per pc
    kernel_map(queue, [GLOBAL, ], [LOCAL, ], extGL, opos,
               DENS_buf, EMIT_buf, Bx_buf, By_buf, Bz_buf, MAP_buf, y_shear)
    MAP = np.zeros(4 * NPIX, np.float32)
    cl.enqueue_copy(queue, MAP, MAP_buf)
    MAP.shape = (NPIX, 4)
    return MAP[:, 0], MAP[:, 1], MAP[:, 2], MAP[:, 3]  # return I, Q, U, NH