def pytest_generate_tests_for_pyopencl(metafunc):
    """Generate one test call per OpenCL platform/device the test asks for.

    Inspects ``metafunc.funcargnames`` for the argument names ``"platform"``,
    ``"device"`` and ``"ctx_getter"``:

    * if ``"device"`` or ``"ctx_getter"`` is requested, one call is added per
      device of every platform (``"platform"`` is filled in as well when
      requested);
    * otherwise, if only ``"platform"`` is requested, one call is added per
      platform;
    * otherwise nothing is added.

    :param metafunc: object providing ``funcargnames`` and ``addcall``.
    """
    class ContextGetter:
        """Callable that creates a context for one device when invoked."""

        def __init__(self, device):
            self.device = device

        def __call__(self):
            # Use the stored device. The enclosing loop variable ``device``
            # has moved on by the time a test calls the getter, so relying on
            # the closure would build a context for the wrong device.
            return cl.Context([self.device])

        def __str__(self):
            return "<context getter for %s>" % self.device

    requested = metafunc.funcargnames

    if "device" in requested or "ctx_getter" in requested:
        arg_dict = {}

        for platform in cl.get_platforms():
            if "platform" in requested:
                arg_dict["platform"] = platform

            for device in platform.get_devices():
                if "device" in requested:
                    arg_dict["device"] = device

                if "ctx_getter" in requested:
                    arg_dict["ctx_getter"] = ContextGetter(device)

                # ``items()`` exists on both Python 2 and 3 (``iteritems``
                # does not exist on Python 3).
                metafunc.addcall(
                    funcargs=arg_dict.copy(),
                    id=", ".join("%s=%s" % (arg, value)
                                 for arg, value in arg_dict.items()))

    elif "platform" in requested:
        for platform in cl.get_platforms():
            metafunc.addcall(
                funcargs=dict(platform=platform),
                id=str(platform))
def __init__(self, cl_mode=True, cl_device=None):
    """Initialize the class.

    :param cl_mode: when true, import pyopencl and set up a context, a
        command queue and cached device information on ``self``.
    :param cl_device: ``'gpu'`` to build the context from every GPU device
        found, ``'cpu'`` to use the first CPU device found; any other value
        defers to ``cl.create_some_context()``.
    :raises RuntimeError: if ``'gpu'``/``'cpu'`` is requested but no such
        device is found.
    """
    if cl_mode:
        import pyopencl as cl
        import pyopencl.array

        def find_devices(device_type):
            """Collect devices of *device_type* across all platforms."""
            found = []
            for platform in cl.get_platforms():
                try:
                    found += platform.get_devices(device_type=device_type)
                except cl.Error:
                    # This platform has no device of the requested type.
                    continue
            return found

        if cl_device == 'gpu':
            gpu_devices = find_devices(cl.device_type.GPU)
            if not gpu_devices:
                raise RuntimeError("no OpenCL GPU device found")
            self.ctx = cl.Context(gpu_devices)
        elif cl_device == 'cpu':
            cpu_devices = find_devices(cl.device_type.CPU)
            if not cpu_devices:
                raise RuntimeError("no OpenCL CPU device found")
            self.ctx = cl.Context([cpu_devices[0]])
        else:
            self.ctx = cl.create_some_context()

        self.queue = cl.CommandQueue(self.ctx)
        self.mf = cl.mem_flags
        # Device properties are taken from the first device of the context.
        self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
        self.device_type = self.device.type
        self.device_compute_units = self.device.max_compute_units

    self.cl_mode = cl_mode
    self.obs = []
    self.samples = {}
def __init__(self, cl_mode=False, cl_device=None, sample_size=1000, cutoff=None,
             output_to_stdout=False, search=False, search_tolerance=100,
             search_data_fit_only=False, annealing=False, debug_mumble=False):
    """Initialize the class.

    :param cl_mode: when true, import pyopencl and set up a context, a
        command queue and cached device information on ``self``.
    :param cl_device: ``'gpu'`` to build the context from every GPU device
        found, ``'cpu'`` to use the first CPU device found; any other value
        defers to ``cl.create_some_context()``.
    :param debug_mumble: when true, configure logging at INFO level.
    :raises RuntimeError: if ``'gpu'``/``'cpu'`` is requested but no such
        device is found.

    The remaining parameters are stored on ``self`` unchanged.
    """
    if debug_mumble:
        logging.basicConfig(level=logging.INFO)

    if cl_mode:
        import pyopencl as cl
        import pyopencl.array
        import pyopencl.tools
        import pyopencl.clrandom

        def find_devices(device_type):
            """Collect devices of *device_type* across all platforms."""
            found = []
            for platform in cl.get_platforms():
                try:
                    found += platform.get_devices(device_type=device_type)
                except cl.Error:
                    # This platform has no device of the requested type.
                    continue
            return found

        if cl_device == 'gpu':
            gpu_devices = find_devices(cl.device_type.GPU)
            if not gpu_devices:
                raise RuntimeError("no OpenCL GPU device found")
            self.ctx = cl.Context(gpu_devices)
        elif cl_device == 'cpu':
            cpu_devices = find_devices(cl.device_type.CPU)
            if not cpu_devices:
                raise RuntimeError("no OpenCL CPU device found")
            self.ctx = cl.Context([cpu_devices[0]])
        else:
            self.ctx = cl.create_some_context()

        self.queue = cl.CommandQueue(self.ctx)
        self.mf = cl.mem_flags
        # Device properties are taken from the first device of the context.
        self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
        self.device_type = self.device.type
        self.device_compute_units = self.device.max_compute_units

    self.cl_mode = cl_mode
    self.cutoff = cutoff
    self.data = []
    self.N = 0  # number of data points

    # sampling parameters
    self.sample_size = sample_size
    self.output_to_stdout = output_to_stdout
    self.iteration = 0
    self.thining = 1
    self.burnin = 0
    self.gpu_time = 0
    self.total_time = 0

    # stochastic search parameters
    # (sample, logprobability of model, loglikelihood of data)
    self.best_sample = (None, None, None)
    self.search = search
    self.search_data_fit_only = search_data_fit_only
    self.best_diff = []
    self.no_improv = 0
    self.search_tolerance = search_tolerance

    # annealing parameters, if used
    self.annealing = annealing
    self.annealing_temp = 1

    self.debug_mumble = debug_mumble
def __init__(self, gpuOnly=True, sharedGlContext=False, hidePlatformDetails=False):
    """Create an OpenCL context on the first platform.

    :param gpuOnly: when true and the platform exposes more than one device,
        restrict the context to a single device (the first GPU found).
    :param sharedGlContext: when true, create the context with GL sharing
        properties.
    :param hidePlatformDetails: when false, print details of every platform
        and device.
    :raises RuntimeError: if GL sharing is requested but ``cl.have_gl()`` is
        false.
    """
    super(BaseCalculator, self).__init__()
    self.platform = cl.get_platforms()[0]
    self.devices = self.platform.get_devices()

    if not hidePlatformDetails:
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                print("===============================================================")
                print("Platform name:", platform.name)
                print("Platform profile:", platform.profile)
                print("Platform vendor:", platform.vendor)
                print("Platform version:", platform.version)
                print("---------------------------------------------------------------")
                print("Device name:", device.name)
                print("Device type:", cl.device_type.to_string(device.type))
                print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
                print("Device max clock speed:", device.max_clock_frequency, 'MHz')
                print("Device compute units:", device.max_compute_units)
                print("Device max work group size:", device.max_work_group_size)
                print("Device max work item sizes:", device.max_work_item_sizes)

    properties = None
    if sharedGlContext:
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not cl.have_gl():
            raise RuntimeError("pyopencl was built without GL interoperability")
        properties = get_gl_sharing_context_properties()

    devices = self.devices
    if gpuOnly and len(self.devices) > 1:
        # Select by device type rather than assuming the GPU sits at index 1.
        gpu_devices = [dev for dev in self.devices
                       if dev.type & cl.device_type.GPU]
        if gpu_devices:
            devices = [gpu_devices[0]]
        else:
            # No GPU reported: keep the historical choice of the second device.
            devices = [self.devices[1]]

    self.context = cl.Context(properties=properties, devices=devices)
    self.queue = None
def __getOpenClDevice(self, platformId, deviceId):
    """Return the OpenCL device at the given platform/device indices.

    :param platformId: index into ``pyopencl.get_platforms()``.
    :param deviceId: index into that platform's ``get_devices()``.
    :returns: the device, or ``None`` when pyopencl is unavailable or either
        index is out of range (negative indices are rejected).
    """
    if pyopencl is None:
        return None

    # Query the platform and device lists once each instead of twice.
    platforms = pyopencl.get_platforms()
    if not (0 <= platformId < len(platforms)):
        return None

    devices = platforms[platformId].get_devices()
    if not (0 <= deviceId < len(devices)):
        return None

    return devices[deviceId]
def __init__(self,threads=0,platform_directory_string="Platforms/OpenCLGPU/opencl_code/",root_directory_string="../../..",platform_name="",device_type=pyopencl.device_type.GPU):
    """Select an OpenCL platform/device and create a context for it.

    Selection order:

    1. the first platform whose description contains *platform_name*
       (case-insensitive) and that has a device of *device_type*;
    2. the first platform with a device of *device_type*;
    3. the first platform with a CPU device (``self.device_type`` is then
       set to CPU).

    After selection ``self.platform_name`` is overwritten with the platform
    vendor string.

    :raises RuntimeError: if no platform matches any of the three rules.
    """
    self.threads = threads
    self.platform_directory_string = platform_directory_string
    self.root_directory_string = root_directory_string
    self.platform_name = platform_name
    self.platform = None

    def first_platform_with(wanted_type, name_filter=""):
        """Return the first platform matching the name filter and device type."""
        for p in pyopencl.get_platforms():
            if name_filter not in str(p).lower():
                continue
            for d in p.get_devices():
                if d.get_info(pyopencl.device_info.TYPE) == wanted_type:
                    return p
        return None

    # The platform description is lowercased for the comparison, so the
    # requested name has to be lowercased too or mixed-case names never match.
    candidates = (
        (device_type, self.platform_name.lower()),
        (device_type, ""),
        (pyopencl.device_type.CPU, ""),
    )
    for wanted_type, name_filter in candidates:
        found = first_platform_with(wanted_type, name_filter)
        if found is not None:
            self.platform = found
            self.device_type = wanted_type
            break

    if self.platform is None:
        raise RuntimeError("no OpenCL platform with a usable device was found")

    self.platform_name = self.platform.get_info(pyopencl.platform_info.VENDOR)
    # NOTE(review): the original source had a commented-out line that replaced
    # the vendor string with platform_info.NAME for AMD -- confirm whether the
    # "AMD" test below is expected to match the vendor string.

    # Takes the first device available for the specified platform and type
    self.device = self.platform.get_devices(self.device_type)[0]
    self.context = pyopencl.Context(devices=[self.device])

    self.amd_gpu_flag = False
    if ("AMD" in self.platform_name) and (self.device_type == pyopencl.device_type.GPU):
        # Taking the first CPU available, needed for AMD GPUs
        self.cpu_device = self.platform.get_devices(pyopencl.device_type.CPU)[0]
        self.cpu_context = pyopencl.Context(devices=[self.cpu_device])
        self.amd_gpu_flag = True
def compute(trans_matrix, config_vector, validapps, num_valid_apps, platform_index=1):
    """Run the ``compute`` OpenCL kernel and return the resulting vectors.

    The kernel is compiled with ``COL_SIZE``/``ROW_SIZE`` taken from the
    module-level names ``row`` and ``col`` and is launched with global size
    ``validapps_np.shape`` and local size ``(col,)``.

    :param trans_matrix: values flattened and uploaded as ``trans_matrix``.
    :param config_vector: values uploaded as ``config_vector``.
    :param validapps: values uploaded as ``validapps``.
    :param num_valid_apps: the result holds ``num_valid_apps * row`` entries.
    :param platform_index: index of the OpenCL platform whose first device is
        used (defaults to 1, the previously hard-coded value).
    :returns: a 1-D ``int32`` numpy array read back from the device.
    """
    platform = cl.get_platforms()[platform_index]
    device = platform.get_devices()[0]
    ctx = cl.Context([device])
    queue = cl.CommandQueue(ctx)

    # The kernel arguments are ``int*`` (32-bit in OpenCL C), so the host
    # arrays must be int32. The abstract ``np.integer`` is not a concrete
    # dtype and would not be guaranteed to match that width.
    trans_np = np.array(trans_matrix, dtype=np.int32).flatten()
    config_vector_np = np.array(config_vector, dtype=np.int32)
    validapps_np = np.array(validapps, dtype=np.int32)
    result_config_vectors_np = np.empty(num_valid_apps * row, dtype=np.int32)

    kernel = """
    __kernel void compute(__global int* trans_matrix,
                          __global int* config_vector,
                          __global int* validapps,
                          __global int* result_config_vectors){
        int dot_result[COL_SIZE];
        int grpid = get_group_id(0);
        if(get_local_id(0) == 0) {
            for(int j = 0; j < COL_SIZE; j ++) {
                int sum = 0;
                for(int i = 0; i < ROW_SIZE; i ++)
                    sum += validapps[grpid * ROW_SIZE + i] * trans_matrix[i * COL_SIZE + j];
                dot_result[j] = sum;
            }
            for(int i = 0; i < COL_SIZE; i ++)
                result_config_vectors[grpid * COL_SIZE + i] = config_vector[i] + dot_result[i];
        }
    }
    """
    mat_size = "#define MAT_SIZE " + str(len(trans_np)) + '\n'
    column_size = "#define COL_SIZE " + str(row) + '\n'
    row_size = "#define ROW_SIZE " + str(col) + '\n'
    kernel = mat_size + column_size + row_size + kernel

    program = cl.Program(ctx, kernel).build()

    # create memory buffers
    mf = cl.mem_flags
    trans_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=trans_np)
    config_vector_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=config_vector_np)
    validapps_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=validapps_np)
    result_config_vectors_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result_config_vectors_np.nbytes)

    # execute the kernel
    program.compute(queue, validapps_np.shape, (col, ), trans_buf,
                    config_vector_buf, validapps_buf, result_config_vectors_buf)
    cl.enqueue_copy(queue, result_config_vectors_np, result_config_vectors_buf)
    return result_config_vectors_np
def __init__(self,threads=0,platform_directory_string="Platforms/OpenCLGPU/opencl_code",root_directory_string=None,platform_name="",device_type=pyopencl.device_type.GPU,ssh_alias="",remote=False,hostname=None):
    """Constructor

    Parameters
        platform_directory_string, root_directory_String, ssh_alias, remote, hostname - same as Platform class
        platform_name - (string) name of OpenCL SDK to use (matched case-insensitively)
        device_type - (pyopencl.device_type) OpenCL device type to use

    Selection order: the first platform whose description contains
    *platform_name* and has a device of *device_type*; failing that, the
    first platform with a device of *device_type*; failing that, the first
    platform with a CPU device. Raises RuntimeError when none is found.
    """
    self.threads = threads
    Platform.Platform.__init__(self,platform_directory_string,root_directory_string,ssh_alias,remote,hostname)

    self.platform_name = platform_name
    self.platform = None

    def first_platform_with(wanted_type, name_filter=""):
        """Return the first platform matching the name filter and device type."""
        for p in pyopencl.get_platforms():
            if name_filter not in str(p).lower():
                continue
            for d in p.get_devices():
                if d.get_info(pyopencl.device_info.TYPE) == wanted_type:
                    return p
        return None

    # Selecting the specified platform and device. The platform description
    # is lowercased for the comparison, so the requested name is lowercased too.
    candidates = (
        (device_type, self.platform_name.lower()),
        (device_type, ""),
        (pyopencl.device_type.CPU, ""),
    )
    for wanted_type, name_filter in candidates:
        found = first_platform_with(wanted_type, name_filter)
        if found is not None:
            self.platform = found
            self.device_type = wanted_type
            break

    if self.platform is None:
        raise RuntimeError("no OpenCL platform with a usable device was found")

    self.platform_name = self.platform.get_info(pyopencl.platform_info.VENDOR)
    # Takes the first device available for the specified platform and type
    self.device = self.platform.get_devices(self.device_type)[0]
    self.context = pyopencl.Context(devices=[self.device])
def get_devices():
    """Return the devices of the preferred platform keyed by device type.

    With several platforms available, the one named ``'NVIDIA CUDA'`` is
    preferred; otherwise the first platform is used.

    :returns: dict mapping ``cl.device_type.to_string(device.type)`` to a
        device. Devices sharing a type overwrite each other, so the last one
        of each type wins.
    """
    platforms = cl.get_platforms()
    # Default to the first platform so the name is always bound, even when
    # several platforms exist and none of them is NVIDIA CUDA.
    my_platform = platforms[0]
    if len(platforms) > 1:
        for found_platform in platforms:
            if found_platform.name == 'NVIDIA CUDA':
                my_platform = found_platform
                print("Selected platform:", my_platform.name)

    devices = {}
    for device in my_platform.get_devices():
        devices[cl.device_type.to_string(device.type)] = device
    return devices
def get_test_platforms_and_devices(plat_dev_string=None):
    """Parse a string of the form 'PYOPENCL_TEST=0:0,1;intel:i5'.

    Entries are separated by ``;``. Each entry is ``platform`` or
    ``platform:device,device,...``, where every identifier is either an
    integer index or a case-insensitive substring of "name vendor".

    :arg plat_dev_string: the string to parse; when *None*, the
        ``PYOPENCL_TEST`` environment variable is consulted. An empty or
        missing value selects every device of every platform.
    :return: list of tuples (platform, [device, device, ...])
    :raises RuntimeError: on malformed entries or unknown identifiers.
    """
    if plat_dev_string is None:
        import os
        plat_dev_string = os.environ.get("PYOPENCL_TEST", None)

    def find_cl_obj(objs, identifier):
        """Look *identifier* up in *objs* by index or by name/vendor substring."""
        try:
            num = int(identifier)
        except ValueError:
            # Not a number: fall through to the substring search.
            pass
        else:
            return objs[num]

        wanted = identifier.lower()
        for obj in objs:
            if wanted in (obj.name + ' ' + obj.vendor).lower():
                return obj
        raise RuntimeError("object '%s' not found" % identifier)

    if not plat_dev_string:
        return [
            (platform, platform.get_devices())
            for platform in cl.get_platforms()]

    result = []
    for entry in plat_dev_string.split(";"):
        lhsrhs = entry.split(":")

        if len(lhsrhs) == 1:
            platform = find_cl_obj(cl.get_platforms(), lhsrhs[0])
            result.append((platform, platform.get_devices()))
        elif len(lhsrhs) != 2:
            raise RuntimeError("invalid syntax of PYOPENCL_TEST")
        else:
            plat_str, dev_strs = lhsrhs
            platform = find_cl_obj(cl.get_platforms(), plat_str)
            devs = platform.get_devices()
            result.append(
                (platform,
                 [find_cl_obj(devs, dev_id) for dev_id in dev_strs.split(",")]))

    return result
def __init__(self, nBands, cType, isFloat):
    """Start one processing unit per OpenCL device found.

    For every device of every platform a dedicated context, built program
    and command queue are created and handed to a ``ProcessingUnit``
    together with the shared input/output queues; each unit is then started.

    :param nBands: first value substituted into the kernel source template.
    :param cType: second value substituted into the kernel source template.
    :param isFloat: third value, substituted as ``int(isFloat)``.
    """
    # Get opencl devices and count
    devices = [device
               for platform in cl.get_platforms()
               for device in platform.get_devices()]
    self.nDevices = len(devices)

    # Both buffers are bounded by the number of devices.
    self.inBuffer = queue.Queue(self.nDevices)
    self.outBuffer = queue.Queue(self.nDevices)

    # Create context for each device
    contexts = [cl.Context([device]) for device in devices]

    # Compile the program for each context
    cSrcCode = cSrc.format(nBands, cType, int(isFloat))
    programs = [cl.Program(context, cSrcCode) for context in contexts]
    for program in programs:
        program.build()

    # Queues for contexts
    queues = [cl.CommandQueue(context) for context in contexts]

    # Create a processingUnit for each program/context/queue
    self.workerExec = [
        ProcessingUnit(program, context, cmd_queue, self.inBuffer, self.outBuffer)
        for program, context, cmd_queue in zip(programs, contexts, queues)
    ]

    for worker in self.workerExec:
        worker.start()
def __init__(self):
    """Create a GL-sharing OpenCL context and the elementwise kernels.

    The context properties reference the first OpenCL platform and the
    current GL context, plus the GLX display (Linux) or WGL device context
    (Windows). Two ``ElementwiseKernel`` objects are built and
    ``place_hyperspheres()`` is called at the end.
    """
    plats = cl.get_platforms()
    ctx_props = cl.context_properties

    self.props = [(ctx_props.PLATFORM, plats[0]),
                  (ctx_props.GL_CONTEXT_KHR, platform.GetCurrentContext())]

    # ``sys.platform`` is "linux2" on Python 2 but "linux" on Python 3.3+,
    # so match on the prefix to cover both.
    if sys.platform.startswith("linux"):
        self.props.append((ctx_props.GLX_DISPLAY_KHR,
                           GLX.glXGetCurrentDisplay()))
    elif sys.platform == "win32":
        self.props.append((ctx_props.WGL_HDC_KHR,
                           WGL.wglGetCurrentDC()))

    self.ctx = cl.Context(properties=self.props)

    self.cross4 = ElementwiseKernel(
        self.ctx,
        "__global const float4 *u, "
        "__global const float4 *v, "
        "__global const float4 *w, "
        "__global float4 *r",
        "r[i] = cross4(u[i],v[i],w[i])",
        "cross4_final",
        preamble=cross4_preamble)

    self.distance2 = ElementwiseKernel(
        self.ctx,
        "__global const float4 *a, "
        "__global const float4 *b, "
        "__global float4 *d",
        "d[i] = distance2(a[i],b[i])",
        "distance_final",
        preamble=distance_preamble)

    self.place_hyperspheres()
def __init__(self, device_index, options):
    """Set up miner state for one device of the configured OpenCL platform.

    The platform is ``cl.get_platforms()[options.platform]`` and the device
    is that platform's entry at *device_index*. When ADL is available and
    the device is an AMD GPU, the ADL adapter index is looked up as well.
    """
    super(OpenCLMiner, self).__init__(device_index, options)

    self.output_size = 0x100

    self.platform = cl.get_platforms()[options.platform]
    # Apple's platform gets a single space here; every other platform an
    # empty string.
    self.defspace = ' ' if self.platform.name == 'Apple' else ''

    self.device = self.platform.get_devices()[device_index]
    self.device_name = self.device.name.strip('\r\n \x00\t')

    is_amd_gpu = (self.device.type == cl.device_type.GPU
                  and is_amd(self.device.platform))
    self.gpu_amd = 1 if is_amd_gpu else 0

    self.frames = 30
    self.estimated_rate = 0
    self.rate = 0
    self.frameSleep = 0
    self.worksize = 0

    self.adapterIndex = None
    if ADL and is_amd(self.device.platform) and self.device.type == cl.device_type.GPU:
        with adl_lock:
            adapters = self.get_adapter_info()
            self.adapterIndex = adapters
            if adapters:
                self.adapterIndex = adapters[self.device_index].iAdapterIndex

    self.temperature = 0
    self.target6 = 0
    self.target7 = 0
def InitCL(self, DEVICE="CPU"):
    """Create a context and command queue on a device of the given type.

    :param DEVICE: device type name as produced by
        ``cl.device_type.to_string`` (e.g. ``"CPU"``). The last device
        whose type matches is used. When none matches, the first device of
        the first platform is used instead.

    Sets ``self.ctx``, ``self.queue`` and ``self.mf``.
    """
    my_device = None
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if cl.device_type.to_string(device.type) == DEVICE:
                my_device = device

    if my_device is None:
        # Fall back to a single device. ``get_devices()`` returns a list, so
        # its first element is taken.
        my_device = cl.get_platforms()[0].get_devices()[0]

    # Single-argument form prints the same text on Python 2 and 3.
    print("%s   %s" % (my_device.name, cl.device_type.to_string(my_device.type)))

    self.ctx = cl.Context([my_device])
    self.queue = cl.CommandQueue(self.ctx)
    self.mf = cl.mem_flags
def test_opencl_0(zz, a, b, c_result, device_index=1):
    """Run a small speed/consistency test on one device of every platform.

    For each platform the device at *device_index* is described on stdout,
    a ``sum`` kernel is timed via event profiling, and the first ``zz``
    results are compared with *c_result*.

    :param zz: loop count compiled into the kernel; also the number of
        leading result elements compared against *c_result*.
    :param a: first input array (its shape is the global work size).
    :param b: second input array (its byte size sizes the output buffer).
    :param c_result: expected results.
    :param device_index: index of the device to test on each platform
        (defaults to 1, the previously hard-coded value).
    """
    for platform in cl.get_platforms():
        device = platform.get_devices()[device_index]
        print("===============================================================")
        print("Platform name:", platform.name)
        print("Platform profile:", platform.profile)
        print("Platform vendor:", platform.vendor)
        print("Platform version:", platform.version)
        print("---------------------------------------------------------------")
        print("Device name:", device.name)
        print("Device type:", cl.device_type.to_string(device.type))
        print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
        print("Device max clock speed:", device.max_clock_frequency, 'MHz')
        print("Device compute units:", device.max_compute_units)

        # Simple speed test
        ctx = cl.Context([device])
        queue = cl.CommandQueue(
            ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        mf = cl.mem_flags
        a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
        b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
        dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

        prg = cl.Program(ctx, """
            __kernel void sum(__global const double *a,
                              __global const double *b,
                              __global double *c)
            {
                int loop;
                int gid = get_global_id(0);
                for(loop=0; loop<%s;loop++)
                {
                    c[gid] = a[gid] + b[gid];
                    c[gid] = c[gid] * (a[gid] + b[gid]);
                    c[gid] = c[gid] * (a[gid] / 2);
                    c[gid] = log(exp(c[gid]));
                }
            }
            """ % (zz)).build()

        exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
        exec_evt.wait()
        elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

        print("Execution time of test: %g s" % elapsed)

        c = numpy.empty_like(a)
        # ``enqueue_read_buffer`` is no longer available in current pyopencl;
        # ``enqueue_copy`` is the supported device-to-host transfer.
        cl.enqueue_copy(queue, c, dest_buf).wait()

        error = 0
        for i in range(zz):
            if c[i] != c_result[i]:
                print("c_i: ", c[i], " c_results_i: ", c_result[i])
                print("diff: ", numpy.abs(c[i] - c_result[i]))
                error = 1
        if error:
            print("Results doesn't match!!")
        else:
            print("Results OK")
def get_cl_context(gl_context):
    """Creates a CL context, with or without given GL context.

    :param gl_context: a context manager for the GL context to share with,
        or ``None`` for a stand-alone CL context.
    :returns: a ``cl.Context`` on the last device of the first platform.
    :raises RuntimeError: if GL sharing is requested but ``cl.have_gl()`` is
        false.
    """
    def build_context(extra_properties):
        """Build the context; both modes differ only in the extra properties."""
        cl_platform = cl.get_platforms()[0]
        cl_properties = ([(cl.context_properties.PLATFORM, cl_platform)]
                         + extra_properties)
        cl_devices = [cl_platform.get_devices()[-1]]  # Only one is allowed!
        return cl.Context(properties=cl_properties, devices=cl_devices)

    if gl_context is None:
        # Stand-alone mode, CL context without GL.
        return build_context([])

    # With OpenGL interop.
    with gl_context:
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not cl.have_gl():
            raise RuntimeError("GL interoperability not enabled.")
        from pyopencl.tools import get_gl_sharing_context_properties
        cl_context = build_context(get_gl_sharing_context_properties())
    return cl_context
def run(self, args):
    """Simulate one parameter set on every world and print the fitness.

    :param args: parsed options. Read here: ``device_type``, ``num_worlds``,
        ``num_robots``, ``ta``, ``tb``, ``random_targets``, ``params``
        (hex string or ``None``), ``save``, ``targets_distance`` and
        ``targets_angle``.

    When ``args.params`` is ``None`` a random parameter string of
    ``physics.ANN_PARAMS_SIZE`` bytes is generated. Each byte is scaled to
    ``[0, 1]`` by dividing by 255.
    """
    import binascii

    device_type = cl.device_type.ALL
    if args.device_type == 'cpu':
        device_type = cl.device_type.CPU
    elif args.device_type == 'gpu':
        device_type = cl.device_type.GPU

    platform = cl.get_platforms()[0]
    devices = platform.get_devices(device_type=device_type)
    context = cl.Context(devices=devices)
    queue = cl.CommandQueue(context)

    simulator = physics.Simulator(
        context, queue,
        num_worlds=args.num_worlds, num_robots=args.num_robots,
        ta=args.ta, tb=args.tb, test=False,
        random_targets=args.random_targets)

    # Work on raw byte values. This behaves the same on Python 2 and 3 and
    # avoids building the random string one character at a time.
    if args.params is not None:
        raw = bytearray(binascii.unhexlify(args.params))
    else:
        raw = bytearray(random.randint(0, 255)
                        for _ in range(physics.ANN_PARAMS_SIZE))

    decoded = np.array(list(raw), dtype=np.float64) / 255

    # Every world receives the same parameter vector.
    parameters = [decoded for _ in range(args.num_worlds)]

    if args.save is None:
        fitness = simulator.simulate(
            parameters,
            targets_distance=args.targets_distance,
            targets_angle=args.targets_angle)
    else:
        fitness = simulator.simulate_and_save(
            args.save, parameters,
            targets_distance=args.targets_distance,
            targets_angle=args.targets_angle)

    # Same text as the former ``print 'fitness = ', fitness`` statement.
    print('fitness =  %s' % (fitness,))
def __init__(self, filename, *args, **kwargs):
    """Create a GL-sharing context and queue, then load *filename*.

    On macOS the context is created from the GL sharing properties alone
    with an empty device list; elsewhere the first OpenCL platform is added
    to the properties and ``devices`` is ``None``. The kernel name is the
    part of *filename* before its first ``"."``.
    """
    plats = cl.get_platforms()
    from pyopencl.tools import get_gl_sharing_context_properties
    import sys

    on_mac = sys.platform == "darwin"
    if not on_mac:
        platform_property = [(cl.context_properties.PLATFORM, plats[0])]
        self.ctx = cl.Context(
            properties=platform_property + get_gl_sharing_context_properties(),
            devices=None,
        )
    else:
        self.ctx = cl.Context(
            properties=get_gl_sharing_context_properties(),
            devices=[],
        )

    self.queue = cl.CommandQueue(self.ctx)
    self.loadProgram(filename)

    self.gl_objects = []

    # TODO get these from kwargs
    self.kernelargs = None
    self.global_size = (0,)
    self.local_size = None
    self.PreExecute = None
    self.PostExecute = None

    base_name = filename.split(".")
    self.kernelname = base_name[0]
def __init__(self, parent=None):
    """Build the window, wire up its widgets and create the NLMeans filter.

    The user is asked to choose an OpenCL platform and then one of its
    devices; a context on that device is passed to ``NLMeans``. Finally
    the mask is shown, ``"lena.jpg"`` is loaded and the parameters are reset.
    """
    super().__init__(parent)
    self.setupUi(self)

    # Slider wiring: the a/h sliders deliver their value divided by 1000.
    self.axSlider.valueChanged.connect(self.setAx)
    self.sxSlider.valueChanged.connect(self.setSx)
    self.aSlider.valueChanged.connect(lambda value: self.setA(value/1000))
    self.hSlider.valueChanged.connect(lambda value: self.setH(value/1000))

    self.showOriginal = False

    def ask_and_load():
        return self.loadImage(QFileDialog.getOpenFileName(self, "Open Image"))

    def ask_and_save():
        return self.saveImage(
            QFileDialog.getSaveFileName(self, "Save Image", filter="*.png"))

    self.loadButton.clicked.connect(lambda: ask_and_load())
    self.saveButton.clicked.connect(lambda: ask_and_save())
    self.toggleOriginalButton.clicked.connect(self.toggleImage)

    # Interactive OpenCL platform/device selection.
    platform = choose(self, cl.get_platforms(),
                      "OpenCL Platform", "Please choose OpenCL Platform")
    print(platform)
    device = choose(self, platform.get_devices(),
                    "OpenCL Device", "Please choose OpenCL Device")
    print(device)
    self.nlmeans = NLMeans(cl.Context([device]))

    self.imageLabel = QLabel()
    self.imageScrollarea.setWidget(self.imageLabel)

    self.maskScrollarea.setAlignment(Qt.AlignCenter)
    self.maskLabel = QLabel()
    self.maskScrollarea.setWidget(self.maskLabel)

    self.showMask()
    self.loadImage("lena.jpg")

    self.resetButton.clicked.connect(self.resetParameters)
    self.resetParameters()
def __init__(self, max_elements, cta_size, dtype):
    """Allocate the device buffers for sorting up to *max_elements* items.

    A context and queue are created on the first device of the first
    platform and the program is loaded via ``self.loadProgram()``.

    :param max_elements: maximum number of elements (integer).
    :param cta_size: work-group size used to derive block counts (integer).
    :param dtype: element dtype; only its ``itemsize`` is used for sizing.
    """
    self.WARP_SIZE = 32
    self.SCAN_WG_SIZE = 256
    self.MIN_LARGE_ARRAY_SIZE = 4 * self.SCAN_WG_SIZE
    self.bit_step = 4
    self.cta_size = cta_size
    self.uintsz = dtype.itemsize

    plat = cl.get_platforms()[0]
    device = plat.get_devices()[0]
    self.ctx = cl.Context(devices=[device])
    self.queue = cl.CommandQueue(self.ctx, device)

    self.loadProgram()

    # Floor division keeps sizes integral on Python 3 as well; plain ``/``
    # would produce floats there, which are not valid buffer sizes.
    elements_per_block = cta_size * 4
    num_blocks = max_elements // elements_per_block
    if max_elements % elements_per_block != 0:
        num_blocks += 1

    self.d_tempKeys = cl.Buffer(self.ctx, mf.READ_WRITE,
                                size=self.uintsz * max_elements)
    self.d_tempValues = cl.Buffer(self.ctx, mf.READ_WRITE,
                                  size=self.uintsz * max_elements)

    counter_bytes = self.uintsz * self.WARP_SIZE * num_blocks
    self.mCounters = cl.Buffer(self.ctx, mf.READ_WRITE, size=counter_bytes)
    self.mCountersSum = cl.Buffer(self.ctx, mf.READ_WRITE, size=counter_bytes)
    self.mBlockOffsets = cl.Buffer(self.ctx, mf.READ_WRITE, size=counter_bytes)

    numscan = max_elements // 2 // cta_size * 16
    if numscan >= self.MIN_LARGE_ARRAY_SIZE:
        # MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE 1024
        self.scan_buffer = cl.Buffer(self.ctx, mf.READ_WRITE,
                                     size=self.uintsz * numscan // 1024)
def init_cl(self, platnum, devnum):
    """Create ``self.CLContext``/``self.CLQueue`` on the chosen device.

    :param platnum: index into ``cl.get_platforms()``.
    :param devnum: index into that platform's ``get_devices()``.
    :returns: ``True`` on success. ``False`` if either index is too large,
        after printing the available options.
    """
    # Check that specified platform exists
    platforms = cl.get_platforms()
    if len(platforms) <= platnum:
        # The platform number is now interpolated; the format argument was
        # missing before, so a literal "%d" was printed.
        print("Specified OpenCL platform number (%d) does not exist." % platnum)
        print("Options are:")
        for p, candidate in enumerate(platforms):
            print("%d: %s" % (p, str(candidate)))
        return False
    platform = platforms[platnum]

    # Check that specified device exists on that platform
    devices = platform.get_devices()
    if len(devices) <= devnum:
        print("Specified OpenCL device number (%d) does not exist on platform %s." % (devnum, platform))
        print("Options are:")
        for d, candidate in enumerate(devices):
            print("%d: %s" % (d, str(candidate)))
        return False
    device = devices[devnum]

    # Create a context and queue
    self.CLContext = cl.Context(
        properties=[(cl.context_properties.PLATFORM, platform)],
        devices=[device])
    self.CLQueue = cl.CommandQueue(self.CLContext)

    print("Set up OpenCL context:")
    print(" Platform: %s" % (str(platform.name)))
    print(" Device: %s" % (str(device.name)))
    return True
def _enumerate_cl_devices_for_ref_test():
    """Yield every OpenCL device, CPU devices first.

    Raises ``LoopyError`` when no device exists at all and emits a warning
    when there is no CPU device. Because this is a generator, both happen
    on first iteration.
    """
    import pyopencl as cl

    all_devices = [dev
                   for pf in cl.get_platforms()
                   for dev in pf.get_devices()]

    cpu_devs = [dev for dev in all_devices if dev.type & cl.device_type.CPU]
    noncpu_devs = [dev for dev in all_devices
                   if not (dev.type & cl.device_type.CPU)]

    if not cpu_devs and not noncpu_devs:
        raise LoopyError("no CL device found for test")

    if not cpu_devs:
        warn("No CPU device found for running reference kernel. The reference "
                "computation will either fail because of a timeout "
                "or take a *very* long time.")

    for dev in cpu_devs + noncpu_devs:
        yield dev
def get_OCL_context():
    """
    Retrieves the OpenCL context

    On macOS the context is created directly from the GL sharing
    properties. Elsewhere each platform is tried in turn, first without an
    explicit device list and then once per device, and the first context
    that can be created is returned.

    :returns: a ``pyopencl.Context``, or ``None`` if none could be created.
    :raises RuntimeError: if ``pyopencl`` is not available.
    """
    if not pyopencl:
        raise RuntimeError("OpenCL unuseable")

    if sys.platform == "darwin":
        return pyopencl.Context(properties=get_gl_sharing_context_properties(),
                                devices=[])

    def try_context(platform, **kwargs):
        """Return a GL-sharing context for *platform*, or None on failure."""
        try:
            return pyopencl.Context(
                properties=[(pyopencl.context_properties.PLATFORM, platform)]
                + get_gl_sharing_context_properties(),
                **kwargs)
        except Exception:
            # Best-effort probing: any failure means "try the next option".
            # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
            # still propagate.
            return None

    ctx = None
    # Some OSs prefer clCreateContextFromType, some prefer
    # clCreateContext. Try both and loop.
    for platform in pyopencl.get_platforms():
        ctx = try_context(platform)
        if ctx is None:
            for device in platform.get_devices():
                ctx = try_context(platform, devices=[device])
                if ctx is not None:
                    break
        if ctx:
            break
    return ctx
def init_opencl(self):
    """List the OpenCL platforms/devices and set up context, queue, kernels.

    The context spans every device of the first platform and the command
    queue (with profiling enabled) is bound to the context's first device.
    The kernels are built from ``D2Q9.cl`` located in ``file_dir``.
    """
    platforms = cl.get_platforms()
    print('The platforms detected are:')
    print('---------------------------')
    for platform in platforms:
        print('%s %s version: %s' % (platform.name, platform.vendor, platform.version))

    # List devices in each platform
    for platform in platforms:
        print('The devices detected on platform %s are:' % (platform.name,))
        print('---------------------------')
        for device in platform.get_devices():
            print('%s [Type: %s ]' % (device.name, cl.device_type.to_string(device.type)))
            print('Maximum clock Frequency: %s MHz' % (device.max_clock_frequency,))
            print('Maximum allocable memory size: %s MB' % (int(device.max_mem_alloc_size / 1e6),))
            print('Maximum work group size %s' % (device.max_work_group_size,))
            print('Maximum work item dimensions %s' % (device.max_work_item_dimensions,))
            print('Maximum work item size %s' % (device.max_work_item_sizes,))
            print('---------------------------')

    # Create a context with all the devices
    devices = platforms[0].get_devices()
    self.context = cl.Context(devices)
    print('This context is associated with  %s devices' % (len(self.context.devices),))

    self.queue = cl.CommandQueue(
        self.context, self.context.devices[0],
        properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Read the kernel source through a context manager so the file handle
    # is closed deterministically.
    with open(file_dir + '/D2Q9.cl') as kernel_file:
        kernel_source = kernel_file.read()
    self.kernels = cl.Program(self.context, kernel_source).build(options='')
def __init__(self, coords, values, wantCL=True, platform_num=None):
    """ Take the coordinates and values and build a KD tree.

    Keyword arguments:
    coords -- input coordinates (x, y)
    values -- input values
    wantCL -- try to set up OpenCL (only attempted when ``hasCL`` is true)
    platform_num -- index of the OpenCL platform to use; with ``None``
        ``cl.create_some_context()`` chooses (and may prompt the user)

    Raises AssertionError when coords and values differ in length, and
    IndexError when platform_num is out of range.
    """
    self.coords = np.asarray(coords, dtype=np.float32)
    self.values = np.asarray(values, dtype=np.int32)

    if self.coords.shape[0] != self.values.shape[0]:
        raise AssertionError('lencoords does not equal lenvalues')

    self.wantCL = wantCL
    self.canCL = False

    if hasCL and self.wantCL:
        try:
            platforms = cl.get_platforms()
            try:
                platform = platforms[platform_num]
            except TypeError:
                # platform_num is not an index (e.g. None).
                # The user may be asked to select a platform.
                self.context = cl.create_some_context()
                self.devices = self.context.devices
            else:
                # Use the platform just looked up; ``self.platform`` was
                # never assigned, so reading it raised AttributeError.
                self.devices = platform.get_devices()
                self.context = cl.Context(self.devices)

            self.queue = cl.CommandQueue(self.context)

            with open('idt.cl', 'r') as kernel_file:
                filestr = kernel_file.read()
            self.program = cl.Program(self.context, filestr).build(devices=self.devices)
            for device in self.devices:
                buildlog = self.program.get_build_info(device, cl.program_build_info.LOG)
                if (len(buildlog) > 1):
                    print('Build log for device %s :\n%s' % (device, buildlog))

            # Only the first kernel is used.
            self.kernel = self.program.all_kernels()[0]

            # Local and global sizes are device-dependent.
            self.local_size = {}
            self.global_size = {}
            # Groups should be overcommitted.
            # For now, use 3 (48 cores / 16 cores per halfwarp) * 2
            for device in self.devices:
                work_group_size = self.kernel.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
                num_groups_for_1d = device.max_compute_units * 3 * 2
                self.local_size[device] = (work_group_size,)
                self.global_size[device] = (num_groups_for_1d * work_group_size,)
            self.canCL = True
        except cl.RuntimeError:
            print('warning: unable to use pyopencl, defaulting to cKDTree')

    if self.canCL:
        self.tree = build_tree(coords)
    else:
        self.tree = KDTree(coords)
def main():
    """Entry point: pick a model file and an OpenCL device, then simulate.

    Command line: ``<model.py> [<platform number> <device number>]``.
    When the platform and device numbers are not both given, the available
    platforms and devices are listed and the user is prompted for them.
    """
    # Get module name to load
    if len(sys.argv) < 2:
        print("Please specify a model (.py) file")
        sys.exit(0)
    moduleName = sys.argv[1]

    # Get OpenCL platform/device numbers. Both are needed, so prompt unless
    # both were supplied. Checking for "< 3" crashed on ``sys.argv[3]`` when
    # only the platform number was given.
    if len(sys.argv) < 4:
        # User input of OpenCL setup
        import pyopencl as cl

        # Python 2's ``input`` evaluates what the user types; always read
        # the raw text instead.
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        # Platform
        platforms = cl.get_platforms()
        print("Select OpenCL platform:")
        for i, candidate in enumerate(platforms):
            print('press ' + str(i) + ' for ' + str(candidate))
        platnum = int(read_line('Platform Number: '))

        # Device
        devices = platforms[platnum].get_devices()
        print("Select OpenCL device:")
        for i, candidate in enumerate(devices):
            print('press ' + str(i) + ' for ' + str(candidate))
        devnum = int(read_line('Device Number: '))
    else:
        platnum = int(sys.argv[2])
        devnum = int(sys.argv[3])

    # Set up complete, now run the simulation
    simulate(moduleName, platnum, devnum)
def save_device_fetch(type):
    """Return a one-element list with the first device of the given type.

    Devices are gathered from every platform in enumeration order;
    *type* is passed as ``device_type`` to ``get_devices``. Indexing the
    first element raises ``IndexError`` when nothing was found.
    """
    found = []
    for platform in cl.get_platforms():
        found.extend(platform.get_devices(device_type=type))

    # Just use the first GPU device, they are all good
    first = found[0]
    return [first]
def __init__(self, locCard, plots, outPlotQueues, alarmQueue, idLoc):
    """Store the run configuration; the OpenCL setup is currently disabled.

    ``self.status`` is -1 until the backend check has been passed and 0
    afterwards. For backends whose name starts with ``'shadow'`` a
    per-instance run directory name is derived from ``locCard.cwd``.
    """
    self.status = -1
    if locCard.backend.startswith('shadow'):
        self.runDir = locCard.cwd + os.sep + 'tmp' + str(idLoc)
    # (the original kept some commented-out debug code here)
    self.idN = idLoc
    self.status = 0

    self.plots = plots
    self.outPlotQueues = outPlotQueues
    self.alarmQueue = alarmQueue
    self.card = locCard

    # OpenCL is switched off by this hard-coded flag, so the branch below
    # never runs as written.
    isOpenCL = False
    self.cl_ctx = None
    if not isOpenCL:
        return

    # First device whose ``type`` equals 2, searching platform by platform.
    iDevice = next(
        (device
         for platform in cl.get_platforms()
         for device in platform.get_devices()
         if device.type == 2),
        None)
    if iDevice is None:
        return

    self.cl_ctx = cl.Context(devices=[iDevice])
    self.cl_queue = cl.CommandQueue(self.cl_ctx)
    cl_file = os.path.join(__dir__, r'hist.cl')
    with open(cl_file, 'r') as f:
        kernelsource = f.read()
    self.cl_program = cl.Program(self.cl_ctx, kernelsource).build()
    self.cl_mf = cl.mem_flags
def init(platform_name=None, device_index=None, profiling=False,
         profiling_file='profile.dat', loglevel=logging.INFO, logfile=None,
         double_precision=False):
    """Initialize syris with *device_index*.

    Logging, precision and the OpenCL configuration object are set up
    first. If OpenCL platforms can be enumerated, the OpenCL defaults are
    created and the programs initialized; when there are none a warning is
    logged, and an enumeration failure is logged as an exception. With
    *profiling* enabled the profiler is started and its shutdown is
    registered to run at interpreter exit.
    """
    cfg.init_logging(level=loglevel, logger_file=logfile)
    cfg.PRECISION = cfg.Precision(double_precision)
    cfg.OPENCL = cfg.OpenCL()

    platforms = []
    try:
        platforms = cl.get_platforms()
    except Exception as e:
        LOG.exception(str(e))
    else:
        if platforms:
            make_opencl_defaults(platform_name=platform_name,
                                 device_index=device_index,
                                 profiling=profiling)
        else:
            LOG.warning('No OpenCL platforms found, GPU computing will not be available')

    # NOTE(review): the nesting level of this profiling block could not be
    # recovered from the flattened source; confirm that it belongs at
    # function level rather than under the "platforms found" branch.
    if profiling:
        _wrap_opencl()
        prf.PROFILER = prf.Profiler(cfg.OPENCL.queues, profiling_file)
        prf.PROFILER.start()

        @atexit.register
        def exit_handler():
            """Shutdown the profiler on exit."""
            prf.PROFILER.shutdown()

    if platforms:
        init_programs()
def cl_init(type = 'GPU'):
    """Create an OpenCL context and print details of its first device.

    :param type: ``'GPU'`` or ``'CPU'``. A context over all devices of that
        type on the first platform is attempted. For any other value, or
        when that attempt fails, ``cl.create_some_context(interactive=True)``
        is used instead.
    :returns: the created context.
    """
    if type == 'GPU':
        my_type = cl.device_type.GPU
    elif type == 'CPU':
        my_type = cl.device_type.CPU
    else:
        my_type = None

    ctx = None
    if my_type is not None:
        try:
            first_platform = cl.get_platforms()[0]
            devices = first_platform.get_devices(device_type=my_type)
            ctx = cl.Context(devices=devices)
        except Exception:
            # Best-effort: any failure falls back to interactive selection.
            ctx = None

    if ctx is None:
        ctx = cl.create_some_context(interactive=True)

    # Read device and platform from the context itself, so they are defined
    # on the fallback path too (they used to be unbound there).
    device = ctx.devices[0]
    platform = device.platform

    print("===============================================================")
    print("Platform name: " + platform.name)
    print("Platform vendor: " + platform.vendor)
    print("Platform version: " + platform.version)
    print("---------------------------------------------------------------")
    print("Device name: " + device.name)
    print("Device type: " + cl.device_type.to_string(device.type))
    print("Local memory: " + str(device.local_mem_size//1024) + ' KB')
    print("Device memory: " + str(device.global_mem_size//1024//1024) + ' MB')
    print("Device max clock speed:" + str(device.max_clock_frequency) + ' MHz')
    print("Device compute units:" + str(device.max_compute_units))

    return ctx
def offsetData(data, offset=0):
    """Shift the columns of a 3-channel grid by *offset* using OpenCL.

    Args:
        data: nested sequence / array indexed as ``data[row][col]`` whose
            cells hold three values each.
        offset: horizontal shift in columns, formatted into the kernel as an
            integer. Columns below ``offset`` read from ``col + offset``, all
            others from ``col - offset``.
            NOTE(review): the kernel performs no bounds check, so a negative
            offset or one larger than half the width appears to read outside
            the input buffer -- confirm the callers' value range.

    Returns:
        ``np.float32`` array of shape ``(rows, cols, 3)``.
    """
    shape = (len(data), len(data[0]), 3)
    h, w, dim = shape
    result = np.empty(h * w * dim, dtype=np.float32)

    # read data as floats and flatten to 1 dimension for the device buffer
    data = np.array(data).astype(np.float32).reshape(-1)

    # the kernel function
    src = """
    __kernel void offsetData(__global float *dataIn, __global float *result){
        int w = %d;
        int dim = %d;
        int offsetX = %d;

        // get current position
        int posx = get_global_id(1);
        int posy = get_global_id(0);

        // convert position from 0,360 to -180,180
        int posxOffset = posx;
        if (offsetX > 0 || offsetX < 0) {
            if (posx < offsetX) {
                posxOffset = posxOffset + offsetX;
            } else {
                posxOffset = posxOffset - offsetX;
            }
        }

        // get indices
        int i = posy * w * dim + posxOffset * dim;
        int j = posy * w * dim + posx * dim;

        // set result
        result[j] = dataIn[i];
        result[j+1] = dataIn[i+1];
        result[j+2] = dataIn[i+2];
    }
    """ % (w, dim, offset)

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        # Parenthesised form is valid on both Python 2 and Python 3.
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    dataIn = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    # One work item per (row, column) cell.
    prg.offsetData(queue, [h, w], None, dataIn, outResult)

    # Copy result
    cl.enqueue_copy(queue, result, outResult)

    return result.reshape(shape)
def get_platforms():
    """Return whatever ``cl.get_platforms()`` reports."""
    available = cl.get_platforms()
    return available
from __future__ import print_function import pyopencl as cl from pyopencl import array try: from pyopencl import cltypes except ImportError: from ..utils import cltypes import numpy as np print(cl.get_platforms()) kernel_src = """ /** * Updates the table for every bit in every step * **/ __kernel void learn( __global const uint* activeBitIdx, // work size is the number of activations, array of active bits indices in the input __global float* averages, __global uint* count, float const alpha, // moving average alpha float const actualValue, // actual input value from the PF uint const bucketIdx, // bucket that actualValue falls into uint const bucketCount // number of buckets ) { const int gid = get_global_id(0); const int n = activeBitIdx[gid]; // each job updates the table for a single active bit of the input const int nbI = n*bucketCount + bucketIdx; // increment the active count for this bit's bucket averages[nbI] = ((1-alpha)*averages[nbI]) + alpha * actualValue;
import pyopencl as cl import pyopencl.array as cl_array import pyopencl.cltypes as cltypes import numpy import pytest import rk_pd_4d platform = next(platform for platform in cl.get_platforms() if platform.name == 'Intel(R) OpenCL') device = platform.get_devices() context = cl.Context(device) # Initialize the Context queue = cl.CommandQueue(context) # Instantiate a Queue @pytest.mark.parametrize( 'initials, t0, t1, derived_function, expected, delta_absolute_error, absolute_error, relative_error, expected_error_runge_kutta', [ (numpy.array([cltypes.make_double4(0.0, 0.0, 0.0, 0.0) ]), 0.0, 1.0, '1.0, 1.0, 1.0, 1.0', numpy.array([cltypes.make_double4(1.0, 1.0, 1.0, 1.0)]), 1e-18, 1e-18, 1e-18, numpy.array([numpy.double(0.0)])), (numpy.array([cltypes.make_double4(0.0, 0.0, 0.0, 0.0) ]), 0.0, 1.0, '1.0, 1.0, 2.0 * Y->x, - 2.0 * Y->y', numpy.array([cltypes.make_double4(1.0, 1.0, 1.0, -1.0)]), 2.3e-16, 2.3e-16, 1e-18, numpy.array([numpy.double(0.0)])), (numpy.array([cltypes.make_double4(0.0, 0.0, 0.0, 0.0)]), 0.0, 1.0, '1.0, 1.0, 3.0 * Y->x * Y->x, - 3.0 * Y->y * Y->y', numpy.array([cltypes.make_double4(1.0, 1.0, 1.0, -1.0)]), 3.2e-16,
def getTemperatureImage(data, p):
    """Render a temperature grid as an RGB image through a colour gradient.

    Args:
        data: NumPy array of shape ``(h, w, dim)``; the first value of each
            cell is used as the temperature. The result is built as an RGB
            image, so ``dim`` is expected to be 3.
        p: parameter dict. ``p["temperature_range"]`` is the ``(min, max)``
            pair mapped onto the gradient, ``p["gradient"]`` a sequence of
            3-component colours with components scaled by 255 in the kernel.

    Returns:
        The image created by ``Image.fromarray(..., mode="RGB")``.
    """
    tRange = p["temperature_range"]
    gradient = p["gradient"]
    dataG = np.array(gradient).astype(np.float32).reshape(-1)

    shape = data.shape
    h, w, dim = shape

    # The kernel reads the input as 32-bit floats, so make sure the uploaded
    # bytes really are float32 (a no-op conversion if they already are).
    data = np.ascontiguousarray(data, dtype=np.float32).reshape(-1)

    # the kernel function
    src = """
    __kernel void lerpImage(__global float *d, __global float *grad, __global uchar *result){
        int w = %d;
        int dim = %d;
        int gradLen = %d;
        float minValue = %f;
        float maxValue = %f;

        // get current position
        int posx = get_global_id(1);
        int posy = get_global_id(0);

        // get index
        int i = posy * w * dim + posx * dim;
        float temperature = d[i];
        int r = 45;
        int g = 50;
        int b = 55;

        // assume large values are invalid
        if (temperature > -99.0 && temperature < 99.0) {
            // normalize the temperature
            float norm = (temperature - minValue) / (maxValue - minValue);

            // clamp
            if (norm > 1.0) {
                norm = 1.0;
            }
            if (norm < 0.0) {
                norm = 0.0;
            }

            // get color from gradient
            int gradientIndex = (int) round(norm * (gradLen-1));
            gradientIndex = gradientIndex * 3;
            r = (int) round(grad[gradientIndex] * 255);
            g = (int) round(grad[gradientIndex+1] * 255);
            b = (int) round(grad[gradientIndex+2] * 255);
        }

        // set the color
        result[i] = r;
        result[i+1] = g;
        result[i+2] = b;
    }
    """ % (w, dim, len(gradient), tRange[0], tRange[1])

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        # Parenthesised form is valid on both Python 2 and Python 3.
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    # One output byte per input value.
    result = np.empty(data.shape, dtype=np.uint8)

    inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    inG = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataG)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    prg.lerpImage(queue, [h, w], None, inData, inG, outResult)

    # Copy result
    cl.enqueue_copy(queue, result, outResult)
    result = result.reshape(shape)

    imOut = Image.fromarray(result, mode="RGB")
    return imOut
class MergeSort:
    """Merge sort on the CPU (serial) and on an OpenCL device (two kernels).

    The OpenCL context, profiling-enabled queue and both compiled programs
    are created once, while the class body executes, and are shared by all
    instances.
    """

    NAME = 'NVIDIA CUDA'
    platforms = cl.get_platforms()

    # Devices of the platform called NAME; stays None if it is not installed.
    devs = None
    for platform in platforms:
        if platform.name == NAME:
            devs = platform.get_devices()

    ctx = cl.Context(devs)
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy, so plain integers are used.
    tile_x = 32
    tile_y = 1

    def merge_sort_serial(self, a_cpu):
        """Return *a_cpu* sorted in ascending order (recursive, on the CPU).

        Inputs with at most one element are returned unchanged.
        """
        a_length = len(a_cpu)

        # Base case
        if a_length <= 1:
            return a_cpu

        # Recursive case: sort both halves, then merge them.
        a_mid = a_length // 2
        left = self.merge_sort_serial(np.array(a_cpu[0:a_mid]))
        right = self.merge_sort_serial(np.array(a_cpu[a_mid:a_length]))
        return self.merge_serial(left, right)

    def merge_serial(self, left_cpu, right_cpu):
        """Merge two ascending sequences into one ascending NumPy array.

        On ties the element of *left_cpu* is taken first.
        """
        result = []
        i = 0
        j = 0
        n_left = len(left_cpu)
        n_right = len(right_cpu)

        # Walk both inputs with indices instead of re-slicing them on every
        # step, which made the merge quadratic.
        while i < n_left and j < n_right:
            if left_cpu[i] <= right_cpu[j]:
                result.append(left_cpu[i])
                i += 1
            else:
                result.append(right_cpu[j])
                j += 1

        # Exactly one side can still hold elements; append that tail.
        if i == n_left:
            return np.concatenate((result, right_cpu[j:]))
        return np.concatenate((result, left_cpu[i:]))

    #%%
    # The debugging loads of a[beginning] / a[end] were removed from both
    # kernels: they were unused, and a[end] is one past the range when
    # end == a_len.
    merge_sort_naive_kernel_code = """
    __kernel void Merge_sort_naive(__global float* a, __global float* a_temp, __global float* c, const unsigned int a_length) {
        //-----initialize-----
        int tx = get_local_id(0);
        int bx = get_group_id(0);
        int col = bx * get_local_size(0) + tx;
        const int a_len = a_length;
        const int block_size = 32;

        //-----iterate stride and tile_shift-----
        for (int stride =1; stride<a_len; stride*=2){
            int shift_count = (a_len-1)/(block_size*stride*2)+1;
            for (int tile_shift= 0; tile_shift < shift_count; tile_shift++){
                int beginning = col * stride *2 + tile_shift * stride * 2 * block_size;
                int middle = beginning + stride;
                int end = middle + stride;
                if (beginning>= a_len) continue;

                //alter middle and end if necessary
                if (end>a_len){
                    end = a_len;
                }
                if (middle > a_len){
                    middle = a_len;
                }
                int temp_distance_1 = middle - beginning;
                int temp_distance_2 = end - middle;

                //merge
                int m = 0;
                int n = 0;
                while (m<temp_distance_1 && n<temp_distance_2){
                    if (a[beginning+m] < a[middle+n]){
                        a_temp[beginning+m+n]=a[beginning+m];
                        m++;
                    }
                    else if (a[beginning+m] >= a[middle+n]){
                        a_temp[beginning+m + n]=a[middle+n];
                        n++;
                    }
                }

                //put in the rest of arr2
                if (n<temp_distance_2){
                    while (n<temp_distance_2){
                        a_temp[beginning+m+n] = a[middle+n];
                        n++;
                    }
                }
                if (m<temp_distance_1){
                    while (m<temp_distance_1){
                        a_temp[beginning+m+n] = a[beginning+m];
                        m++;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                barrier(CLK_GLOBAL_MEM_FENCE);

                for (int j=0; j<end-beginning; j++){
                    a[beginning+j] = a_temp[beginning+j];
                    a_temp[beginning+j] = 0; //set to zero to clean
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                barrier(CLK_GLOBAL_MEM_FENCE);
            }
        }
        for (int k=0; k<a_len; k++){
            c[k] = a[k];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
    """

    #%%
    merge_sort_optimized1_kernel_code = """
    __kernel void Merge_sort_optimized1(__global float* a, __global float* c, const unsigned int a_length) {
        //initialize
        int tx = get_local_id(0);
        int bx = get_group_id(0);
        int col = bx * get_local_size(0) + tx;
        const int a_len = a_length;

        //-----load a array into shared memory-----
        __local float a_shared[1024];
        if (col<a_length){
            a_shared[col] = a[col];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);

        //-----sort-----
        //-----set stride-----
        for (int stride = 1; stride < a_len; stride *= 2){
            int beginning = col * stride *2; //test
            int middle = beginning + stride;
            int end = middle + stride;
            if(beginning>=a_len) continue;

            //-----watch for edge cases of beginning, middle, or end larger than a_length-----
            //alter middle and end if necessary
            if (end>a_len){
                end = a_len;
            }
            if (middle>a_len){
                middle = a_len;
            }
            int temp_distance_1 = middle-beginning;
            int temp_distance_2 = end - middle;

            //merge
            int m = 0;
            int n = 0;
            float a_temp[1024];
            while (m<temp_distance_1 && n<temp_distance_2){
                if (a_shared[beginning+m] < a_shared[middle+n]){
                    a_temp[beginning + m +n]=a_shared[beginning+m];
                    m++;
                }
                else if (a_shared[beginning+m] >= a_shared[middle+n]){
                    a_temp[beginning + m + n]=a_shared[middle+n];
                    n++;
                }
            }

            //put in the rest of arr2
            if (n<temp_distance_2){
                while (n<temp_distance_2){
                    a_temp[beginning+m+n] = a_shared[middle+n];
                    n++;
                }
            }
            if (m<temp_distance_1){
                while (m<temp_distance_1){
                    a_temp[beginning+m+n] = a_shared[beginning+m];
                    m++;
                }
            }

            //put temp into shared
            for (int j=beginning; j<end; j++){
                a_shared[j] = a_temp[j];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            barrier(CLK_GLOBAL_MEM_FENCE);
        }
        for (int k=0; k<a_len; k++){
            c[k] = a_shared[k];
        }
    }
    """

    #%%
    prg_merge_sort_naive = cl.Program(ctx, merge_sort_naive_kernel_code).build()
    prg_merge_sort_optimized1 = cl.Program(
        ctx, merge_sort_optimized1_kernel_code).build()

    #%%
    def __init__(self):
        # Device copy of the input; uploaded lazily by prepare_data().
        self.a_gpu = None

    def prepare_data(self, a_cpu):
        """Upload *a_cpu* to the device unless an upload already exists.

        NOTE(review): the upload is cached per instance, so a later call with
        a different array keeps using the first one, and the naive kernel
        writes into this buffer -- confirm one instance per input is intended.
        """
        if self.a_gpu is None:
            self.a_gpu = cl.array.to_device(MergeSort.queue, a_cpu)

    #%%
    def merge_sort_naive(self, a_cpu):
        """Sort *a_cpu* with the naive kernel.

        Returns:
            ``(sorted_array, time)`` where *time* is derived from the event's
            profiling start/end counters.
        """
        print("-" * 80)
        print("Naive")
        a_length = len(a_cpu)
        minimum = min(a_length, 32)

        # Only its shape is used: it fixes the global and local work size.
        place_holder = a_cpu[0:minimum]
        self.prepare_data(a_cpu)
        place_holder_gpu = cl.array.empty(MergeSort.queue, place_holder.shape,
                                          a_cpu.dtype)
        c_naive_gpu = cl.array.empty(MergeSort.queue, a_cpu.shape, a_cpu.dtype)
        b_naive_gpu = cl.array.empty(MergeSort.queue, a_cpu.shape, a_cpu.dtype)

        evt = MergeSort.prg_merge_sort_naive.Merge_sort_naive(
            MergeSort.queue, place_holder_gpu.shape, place_holder_gpu.shape,
            self.a_gpu.data, b_naive_gpu.data, c_naive_gpu.data,
            np.int32(a_length))
        evt.wait()

        # NOTE(review): OpenCL profiling counters are in nanoseconds, so a
        # factor of 1e-9 would give seconds; 1e-10 is kept to leave the
        # reported numbers unchanged -- confirm the intended unit.
        time_naive = 1e-10 * (evt.profile.end - evt.profile.start)
        c_naive = c_naive_gpu.get()
        return c_naive, time_naive

    #%%
    def merge_sort_optimized1(self, a_cpu):
        """Sort *a_cpu* with the local-memory kernel.

        The kernel stages the data in a 1024-element local array and the whole
        input is launched as a single work group.

        Returns:
            ``(sorted_array, time)`` where *time* is derived from the event's
            profiling start/end counters.
        """
        print("-" * 80)
        print("Optimized")

        # different a_length version
        a_length = len(a_cpu)
        self.prepare_data(a_cpu)
        c_optimized_gpu = cl.array.empty(MergeSort.queue, a_cpu.shape,
                                         a_cpu.dtype)

        evt = MergeSort.prg_merge_sort_optimized1.Merge_sort_optimized1(
            MergeSort.queue, c_optimized_gpu.shape, c_optimized_gpu.shape,
            self.a_gpu.data, c_optimized_gpu.data, np.int32(a_length))
        evt.wait()

        # NOTE(review): same 1e-10 scaling question as in merge_sort_naive.
        time_optimized = 1e-10 * (evt.profile.end - evt.profile.start)
        c_optimized = c_optimized_gpu.get()
        return c_optimized, time_optimized
knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2" ) return knl n=128 r=3 k=0 norm2=1 norm=1 eps=1e-8 d=3 dimension=[n,n,n] plt = cl.get_platforms() nvidia_plat = plt[1] ctx = cl.Context(nvidia_plat.get_devices()) knl_get_tensor = get_tensor(ctx) knl_r_U = Prav_U(ctx) knl_r_V = Prav_V(ctx) knl_r_W = Prav_W(ctx) knl_l_U = left_U(ctx) knl_l_V = left_V(ctx) knl_l_W = left_W(ctx) cknl_r_U = lp.CompiledKernel(ctx, knl_r_U) cknl_r_V = lp.CompiledKernel(ctx, knl_r_V) cknl_r_W = lp.CompiledKernel(ctx, knl_r_W) cknl_l_U = lp.CompiledKernel(ctx, knl_l_U)
import pyopencl as cl
import pytest
import numpy as np
from pyclesperanto_prototype import create_image


def _list_all_devices():
    """Collect the devices of every OpenCL platform into one flat list."""
    found = []
    for cl_platform in cl.get_platforms():
        found.extend(cl_platform.get_devices())
    return found


def _device_id(cl_device):
    """Use the device name as the pytest parameter id."""
    return cl_device.name


# Every OpenCL device on this machine; the fixture below runs once per entry.
DEVICES = _list_all_devices()


@pytest.fixture(params=DEVICES, ids=_device_id)
def context(request):
    """Provide a fresh OpenCL context bound to the parametrized device."""
    selected = request.param
    return cl.Context(devices=[selected])


# Names of the dtypes in use; the commented-out entries are disabled.
dtypes = {
    "int8",
    "int16",
    "int32",
    # 'int64',
    "uint8",
    "uint16",
    "uint32",
    # 'uint64',
    "float16",
    "float32",
    # 'float64',
    # "complex64",
}
def __init__(self, interface):
    """Select the OpenCL platform and device, load the kernel, set up buffers.

    Reads the ``PLATFORM``, ``DEVICE``, ``AGGRESSION`` and ``OUTPUT_SIZE``
    attributes already present on the object. If the platform or device
    selection is invalid, or the kernel fails to load, the problem is reported
    through ``interface.fatal()`` and initialisation stops early.

    Args:
        interface: host interface providing ``addCore()``, ``log()`` and
            ``fatal()``.
    """
    platforms = cl.get_platforms()

    # Initialize object attributes and retrieve command-line options...
    self.device = None
    self.kernel = None
    self.interface = interface
    self.core = self.interface.addCore()
    self.defines = ''
    self.loopExponent = 0

    # Set the initial number of nonces to run per execution
    # 2^(16 + aggression), clamped to the range 2^16 .. 2^32
    self.AGGRESSION += 16
    self.AGGRESSION = min(32, self.AGGRESSION)
    self.AGGRESSION = max(16, self.AGGRESSION)
    self.size = 1 << self.AGGRESSION

    # We need a QueueReader to efficiently provide our dedicated thread
    # with work.
    self.qr = QueueReader(self.core, lambda nr: self.preprocess(nr),
                          lambda x, y: self.size * 1 << self.loopExponent)

    # The platform selection must be valid to mine. None is tested first
    # because comparing None with an integer raises TypeError on Python 3;
    # an empty platform list is rejected too instead of failing on indexing.
    if self.PLATFORM is None:
        platformValid = len(platforms) == 1
    else:
        platformValid = self.PLATFORM < len(platforms)
    if not platformValid:
        self.interface.log(
            'Wrong platform or more than one OpenCL platform found, '
            'use PLATFORM=ID to select one of the following\n',
            False, True)

        for i, p in enumerate(platforms):
            self.interface.log(' [%d]\t%s' % (i, p.name), False, False)

        # Since the platform is invalid, we can't mine.
        self.interface.fatal()
        return
    if self.PLATFORM is None:
        self.PLATFORM = 0

    devices = platforms[self.PLATFORM].get_devices()

    # The device selection must be valid to mine (same rules as above).
    if self.DEVICE is None:
        deviceValid = len(devices) == 1
    else:
        deviceValid = self.DEVICE < len(devices)
    if not deviceValid:
        self.interface.log(
            'No device specified or device not found, '
            'use DEVICE=ID to specify one of the following\n',
            False, True)

        for i, d in enumerate(devices):
            self.interface.log(' [%d]\t%s' % (i, d.name), False, False)

        # Since the device selection is invalid, we can't mine.
        self.interface.fatal()
        return
    if self.DEVICE is None:
        self.DEVICE = 0

    self.device = devices[self.DEVICE]

    # We need the appropriate kernel for this device...
    try:
        self.loadKernel(self.device)
    except Exception:
        self.interface.fatal('Failed to load OpenCL kernel!')
        return

    # Initialize a command queue to send commands to the device, and a
    # buffer to collect results in...
    # NOTE(review): self.context is not assigned in this method; presumably
    # loadKernel() sets it -- confirm.
    self.commandQueue = cl.CommandQueue(self.context)
    self.output = np.zeros(self.OUTPUT_SIZE + 1, np.uint32)
    self.output_buf = cl.Buffer(
        self.context,
        cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR,
        hostbuf=self.output)

    self.applyMeta()
def threadedSimulation(time_delta, time_steps, objects, queue_data, queue_comm,
                       skip_n, openCL=False, method="first order leapfrog"):
    """Run the simulation on the CPU (NumPy) or on an OpenCL device.

    Transforms the input to NumPy types if the simulation is run on the CPU,
    or to OpenCL vector types if OpenCL is being used. Then conditionally sets
    up an OpenCL environment and finally runs the simulation.

    Args:
        time_delta: integration time step.
        time_steps: number of integration steps.
        objects: sequence of objects providing ``getName()`` and
            ``getMass()``; positions and velocities are obtained through
            ``ao.getAstrObjPos`` / ``ao.getAstrObjVel``.
        queue_data: queue that receives the final
            ``[names, [time_delta, skip_n], positions]`` list.
        queue_comm: control queue. The string ``"stop"`` aborts the run;
            progress in percent is put on it whenever it is empty.
        skip_n: only every ``skip_n``-th step is stored in the output.
        openCL: truthy to run the OpenCL kernels, falsy for the CPU path.
        method: ``"first order leapfrog"`` or ``"PEFRL"``.
    """
    #transformation-----------------------------------------------------
    name = [""] * len(objects)
    mass = np.zeros(len(objects), dtype=np.float64)
    position_out = np.zeros(
        (int(time_steps) // int(skip_n) + 1, len(objects), 3),
        dtype=np.float64)
    if not openCL:
        pos = np.zeros((len(objects), 3), dtype=np.float64)
        vel = np.zeros((len(objects), 3), dtype=np.float64)
        for elem, obj in enumerate(objects):
            name[elem] = obj.getName()
            mass[elem] = obj.getMass()
            pos[elem] = np.array([ao.getAstrObjPos(obj, objects)],
                                 dtype=np.float64)
            vel[elem] = np.array([ao.getAstrObjVel(obj, objects)],
                                 dtype=np.float64)
        position_out[0] = np.array(pos, dtype=np.float64)
    else:
        pos = np.zeros((1, len(objects)), cl.array.vec.double4)
        vel = np.zeros((1, len(objects)), cl.array.vec.double4)
        for elem, obj in enumerate(objects):
            name[elem] = obj.getName()
            mass[elem] = obj.getMass()
            # double4 has four components; the fourth is padded with 0.
            pos[0, elem] = tuple(ao.getAstrObjPos(obj, objects)) + (0, )
            vel[0, elem] = tuple(ao.getAstrObjVel(obj, objects)) + (0, )
        position_out[0] = np.array(
            [list(pos[0][i])[0:3] for i in range(len(pos[0]))],
            dtype=np.float64)

    #OpenCL initialization----------------------------------------------
    if openCL:
        platform = cl.get_platforms()[0]
        device = platform.get_devices()[0]
        context = cl.Context([device])
        clqueue = cl.CommandQueue(context)
        if method == "first order leapfrog":
            # Context managers close the kernel source files deterministically.
            with open("first_order_leapfrog.cl", 'r') as kernel_file:
                kernel = kernel_file.read()
            program = cl.Program(context, kernel).build()
            program.kick.set_scalar_arg_dtypes(
                [None, None, None, np.int32, np.float32])
            program.drift.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])
        elif method == "PEFRL":
            with open("PEFRL.cl", 'r') as kernel_file:
                kernel = kernel_file.read()
            program = cl.Program(context, kernel).build()
            program.PEFRL_1.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])
            program.PEFRL_2.set_scalar_arg_dtypes(
                [None, None, None, np.int32, np.float32])
            program.PEFRL_3.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])
            program.PEFRL_4.set_scalar_arg_dtypes(
                [None, None, None, np.int32, np.float32])
            program.PEFRL_5.set_scalar_arg_dtypes(
                [None, None, np.int32, np.float32])

        mem_flags = cl.mem_flags
        buffer_position = cl.Buffer(
            context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
            hostbuf=pos)
        buffer_velocity = cl.Buffer(
            context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
            hostbuf=vel)
        buffer_mass = cl.Buffer(
            context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
            hostbuf=mass)

    #simulation----------------------------------------------------
    if not openCL:
        if method == "first order leapfrog":
            vel = ode.leapfrog_first_order_kick(
                pos, vel, mass, time_delta / 2)  #initial phase shift
        j = 1
        for i in range(time_steps):
            # Poll the control queue only every 10th step; anything that is
            # not the stop command is put back.
            if i % 10 == 0:
                if not queue_comm.empty():
                    tmp = queue_comm.get()
                    if tmp == "stop":
                        break
                    else:
                        queue_comm.put(tmp)
            if method == "first order leapfrog":
                pos = ode.leapfrog_first_order_drift(pos, vel, time_delta)
                vel = ode.leapfrog_first_order_kick(pos, vel, mass, time_delta)
            elif method == "PEFRL":
                pos, vel = ode.PEFRL(pos, vel, mass, time_delta)
            #there should be a closed form covering both cases...
            if skip_n == 1:
                position_out[i + 1] = np.array(pos, dtype=np.float64)
            elif (i + 1) % skip_n == 0:
                position_out[j] = np.array(pos, dtype=np.float64)
                j += 1
            if queue_comm.empty():
                queue_comm.put(i / time_steps * 100)
    else:
        dim = np.int32(len(objects))
        time_delta_CL = np.float32(time_delta)
        #send kernels to GPU--------------------------------------------
        if method == "first order leapfrog":
            kernel_kick_built = program.kick
            kernel_kick_built.set_args(
                buffer_mass, buffer_position, buffer_velocity, dim,
                np.float32(time_delta / 2))  #initial phase offset
            cl.enqueue_nd_range_kernel(clqueue, kernel_kick_built, vel.shape,
                                       None)
            kernel_drift_built = program.drift
            kernel_drift_built.set_args(buffer_position, buffer_velocity, dim,
                                        time_delta_CL)
            # From here on the kick uses the full time step.
            kernel_kick_built.set_args(buffer_mass, buffer_position,
                                       buffer_velocity, dim, time_delta_CL)
        elif method == "PEFRL":
            kernel_PEFRL1_built = program.PEFRL_1
            kernel_PEFRL1_built.set_args(buffer_position, buffer_velocity,
                                         dim, time_delta_CL)
            kernel_PEFRL2_built = program.PEFRL_2
            kernel_PEFRL2_built.set_args(buffer_mass, buffer_position,
                                         buffer_velocity, dim, time_delta_CL)
            kernel_PEFRL3_built = program.PEFRL_3
            kernel_PEFRL3_built.set_args(buffer_position, buffer_velocity,
                                         dim, time_delta_CL)
            kernel_PEFRL4_built = program.PEFRL_4
            kernel_PEFRL4_built.set_args(buffer_mass, buffer_position,
                                         buffer_velocity, dim, time_delta_CL)
            kernel_PEFRL5_built = program.PEFRL_5
            kernel_PEFRL5_built.set_args(buffer_position, buffer_velocity,
                                         dim, time_delta_CL)
        #actual simulation loop-----------------------------------------
        j = 1
        for i in range(time_steps):
            if i % 10 == 0:
                if not queue_comm.empty():
                    tmp = queue_comm.get()
                    if tmp == "stop":
                        break
                    else:
                        queue_comm.put(tmp)
            if method == "first order leapfrog":
                cl.enqueue_nd_range_kernel(clqueue, kernel_drift_built,
                                           vel.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_kick_built,
                                           vel.shape, None)
            elif method == "PEFRL":
                # The five stages run forward and then mirrored (1..5..1).
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL1_built,
                                           pos.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL2_built,
                                           vel.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL3_built,
                                           pos.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL4_built,
                                           vel.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL5_built,
                                           pos.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL4_built,
                                           vel.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL3_built,
                                           pos.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL2_built,
                                           vel.shape, None)
                cl.enqueue_nd_range_kernel(clqueue, kernel_PEFRL1_built,
                                           pos.shape, None)
            #prevents the queue from growing in some implementations of OpenCL
            if i % 100 == 0:
                clqueue.finish()
            if skip_n == 1:
                cl.enqueue_copy(clqueue, pos, buffer_position)
                position_out[i + 1] = np.array(
                    [list(pos[0][k])[0:3] for k in range(len(pos[0]))],
                    dtype=np.float64)
            elif (i + 1) % skip_n == 0:
                cl.enqueue_copy(clqueue, pos, buffer_position)
                position_out[j] = np.array(
                    [list(pos[0][k])[0:3] for k in range(len(pos[0]))],
                    dtype=np.float64)
                j += 1
            if queue_comm.empty():
                queue_comm.put(i / time_steps * 100)
    queue_data.put([name, [time_delta, skip_n], position_out])
def buildGraph(ip):
    """Build the k-NN graph from the initial parameters.

    params:
    ------
    ip: initial params; ``ip.signal``, ``ip.k`` and ``ip.sigma`` are read

    return:
    ------
    graph: Graph object built from the edge weights and neighbour indices
    """
    # find the nearest neighbors on the gpu
    start = time()
    nbrs = NearestNeighbors(n_neighbors=ip.k+1, algorithm="buffer_kd_tree",
                            tree_depth=9, plat_dev_ids={0: [0]})
    nbrs.fit(ip.signal)
    dists, inds = nbrs.kneighbors(ip.signal)

    # Column 0 is dropped: k+1 neighbours were requested and the first one is
    # presumably the query point itself -- TODO confirm.
    dists_gpu = unroll(dists[0:, 1:]).astype('float32')
    ngbrs_gpu = unroll(inds[0:, 1:]).astype('int32')

    k = ip.k
    scale = ip.sigma
    n, chnl = ip.signal.shape

    # now build the graph using those nns using gpu
    platform = cl.get_platforms()[0]
    print(platform)
    device = platform.get_devices()[0]
    print(device)
    context = cl.Context([device])
    print(context)
    # Read the kernel source through a context manager so the file is closed.
    with open(mywf) as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build()
    print(program)
    queue = cl.CommandQueue(context)
    print(queue)

    # create the buffers on the device: distances in, weights out
    mem_flags = cl.mem_flags
    dists_buf = cl.Buffer(context,
                          mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                          hostbuf=dists_gpu)
    weight_vec = np.ndarray(shape=(n*k,), dtype=np.float32)
    weight_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, weight_vec.nbytes)

    # run the kernel to compute the weights, one work item per edge
    program.compute_weights(queue, (n*k,), None, dists_buf, weight_buf,
                            np.int32(k), np.float32(scale))
    queue.finish()

    # copy the weights to the host memory
    cl.enqueue_copy(queue, weight_vec, weight_buf)
    queue.finish()
    end = time() - start
    print('total time taken by the gpu python:', end)

    # save the graph
    graph = Graph(weight_vec, ngbrs_gpu, k)
    return graph
# Echo the run configuration.
print("[INFO]: implement = %s" % (args.type))
print("[INFO]: arch = %s" % (args.arch))
print("[INFO]: kernel @ %s" % (args.kernel))
print("[INFO]: repeat %d times" % (args.repeat))
print("[INFO]: transA = %s" % (args.transA))
print("[INFO]: transB = %s" % (args.transB))
print("[INFO]: m = %d" % (args.m))
print("[INFO]: n = %d" % (args.n))
print("[INFO]: k = %d" % (args.k))
print("[INFO]: alpha = %f" % (args.alpha))
print("[INFO]: beta = %f" % (args.beta))
print("[INFO]: verify = %s" % (args.verify))

# create platform, cq
# Lists instead of filter(): on Python 3 filter() returns an iterator, which
# supports neither the indexing nor the len() used below.
platforms = [p for p in cl.get_platforms() if 'AMD' in p.name]
devices = [d for d in platforms[0].get_devices() if args.arch == d.name]
assert len(devices) == 1
ctx = cl.Context(devices)
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

hgemm = Hgemm(args.kernel, ctx, devices)
hgemm.tune(args.m, args.n, args.k,
           args.m, args.n, args.m,
           args.alpha, args.beta,
           args.transA, args.transB,
           implement=args.type)

#A = np.asfortranarray(np.random.rand(args.m, args.k).astype(np.float16))
#B = np.asfortranarray(np.random.rand(args.n, args.k).astype(np.float16))
#A = np.asfortranarray(np.tril(np.full((args.m, args.k),1.0)).astype(np.float16))
import numpy import pyopencl #@UnresolvedImport from pyopencl import mem_flags #@UnresolvedImport from xpra.util import engs from xpra.os_util import _memoryview PREFERRED_DEVICE_TYPE = os.environ.get("XPRA_OPENCL_DEVICE_TYPE", "GPU") PREFERRED_DEVICE_NAME = os.environ.get("XPRA_OPENCL_DEVICE_NAME", "") PREFERRED_DEVICE_PLATFORM = os.environ.get("XPRA_OPENCL_PLATFORM", "") OPENCL_YUV2RGB = os.environ.get("XPRA_OPENCL_YUV2RGB", "0")=="1" AMD_WARNING_SHOWN = not os.environ.get("XPRA_AMD_WARNING", "1")=="1" opencl_platforms = pyopencl.get_platforms() if len(opencl_platforms)==0: raise ImportError("no OpenCL platforms found!") def roundup(n, m): return (n + m - 1) & ~(m - 1) def dimdiv(dim, div): #when we divide a dimensions by the subsampling #we want to round up so as to include the last #pixel when we hit odd dimensions return roundup(dim//div, div) def device_type(d): try: return pyopencl.device_type.to_string(d.type)
def getParticleData(data, p):
    """Rasterise particle trails following a vector field into an alpha mask.

    Args:
        data: grid indexed as ``data[row][col]`` with three values per cell;
            the kernel uses the second and third as the u and v components.
        p: parameter dict. Keys read here: ``particles``,
            ``points_per_particle``, ``fade_ms``, ``duration_ms``, ``ms``,
            ``debug``, ``animationProgress``, ``width``, ``height``,
            ``particleProperties`` (three floats per particle),
            ``mag_range``, ``alpha_range``, ``velocity_multiplier``,
            ``linewidth_range`` and ``linewidth_lat_range``.

    Returns:
        ``np.uint8`` array of shape ``(height, width)`` holding the alpha of
        every drawn pixel and 0 elsewhere.
    """
    h = p["particles"]
    w = p["points_per_particle"]

    # fade in and out
    fade_ms = p["fade_ms"]
    dur = p["duration_ms"]
    ms = p["ms"]
    fadeProgress = 1.0
    if ms < fade_ms:
        fadeProgress = 1.0 * ms / fade_ms
    elif ms > (dur - fade_ms):
        fadeProgress = 1.0 - 1.0 * (ms - (dur - fade_ms)) / fade_ms
    if p["debug"]:
        fadeProgress = 1.0

    offset = 1.0 - p["animationProgress"]
    tw = p["width"]
    th = p["height"]
    dh = len(data)
    dw = len(data[0])

    result = np.zeros(tw * th, dtype=np.float32)

    fData = np.array(data).astype(np.float32).reshape(-1)
    pData = np.array(p["particleProperties"]).astype(np.float32).reshape(-1)

    # the kernel function
    src = """
    static float lerp(float a, float b, float mu) {
        return (b - a) * mu + a;
    }

    static float det(float a0, float a1, float b0, float b1) {
        return a0 * b1 - a1 * b0;
    }

    static float2 lineIntersection(float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3) {
        float xd0 = x0 - x1;
        float xd1 = x2 - x3;
        float yd0 = y0 - y1;
        float yd1 = y2 - y3;
        float div = det(xd0, xd1, yd0, yd1);

        float2 intersection;
        intersection.x = -1.0;
        intersection.y = -1.0;

        if (div != 0.0) {
            float d1 = det(x0, y0, x1, y1);
            float d2 = det(x2, y2, x3, y3);
            intersection.x = det(d1, d2, xd0, xd1) / div;
            intersection.y = det(d1, d2, yd0, yd1) / div;
        }

        return intersection;
    }

    static float norm(float value, float a, float b) {
        float n = (value - a) / (b - a);
        if (n > 1.0) {
            n = 1.0;
        }
        if (n < 0.0) {
            n = 0.0;
        }
        return n;
    }

    static float wrap(float value, float a, float b) {
        if (value < a) {
            value = b - (a - value);
        } else if (value > b) {
            value = a + (value - b);
        }
        return value;
    }

    void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness);
    void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha);

    void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness) {
        int dx = abs(x1-x0);
        int dy = abs(y1-y0);

        if (dx==0 && dy==0) {
            return;
        }

        // draw the first line
        drawSingleLine(p, x0, y0, x1, y1, w, h, alpha);
        thickness--;
        if (thickness < 1) return;

        int stepX = 0;
        int stepY = 0;
        if (dx > dy) stepY = 1;
        else stepX = 1;

        // loop through thickness
        int offset = 1;
        for (int i=0; i<thickness; i++) {
            int xd = stepX * offset;
            int yd = stepY * offset;

            drawSingleLine(p, x0+xd, y0+yd, x1+xd, y1+yd, w, h, alpha);

            // alternate above and below
            offset *= -1;
            if (offset > 0) {
                offset++;
            }
        }
    }

    void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha) {
        // clamp
        x0 = clamp(x0, 0, w-1);
        x1 = clamp(x1, 0, w-1);
        y0 = clamp(y0, 0, h-1);
        y1 = clamp(y1, 0, h-1);

        int dx = abs(x1-x0);
        int dy = abs(y1-y0);

        if (dx==0 && dy==0) {
            return;
        }

        int sy = 1;
        int sx = 1;
        if (y0>=y1) {
            sy = -1;
        }
        if (x0>=x1) {
            sx = -1;
        }
        int err = dx/2;
        if (dx<=dy) {
            err = -dy/2;
        }
        int e2 = err;

        int x = x0;
        int y = y0;
        for(int i=0; i<w; i++){
            p[y*w+x] = alpha;
            if (x==x1 && y==y1) {
                break;
            }
            e2 = err;
            if (e2 >-dx) {
                err -= dy;
                x += sx;
            }
            if (e2 < dy) {
                err += dx;
                y += sy;
            }
        }
    }

    __kernel void getParticles(__global float *data, __global float *pData, __global float *result){
        int points = %d;
        int dw = %d;
        int dh = %d;
        float tw = %f;
        float th = %f;
        float offset = %f;
        float magMin = %f;
        float magMax = %f;
        float alphaMin = %f;
        float alphaMax = %f;
        float velocityMult = %f;
        float fadeProgress = %f;
        float lineWidthMin = %f;
        float lineWidthMax = %f;
        float lineWidthLatMin = %f;
        float lineWidthLatMax = %f;

        // get current position
        int i = get_global_id(0);
        float dx = pData[i*3];
        float dy = pData[i*3+1];
        float doffset = pData[i*3+2];

        // set starting position
        float x = dx * (tw-1);
        float y = dy * (th-1);

        for(int j=0; j<points; j++) {
            // get UV value
            int lon = (int) round(dx * (dw-1));
            int lat = (int) round(dy * (dh-1));
            int dindex = lat * dw * 3 + lon * 3;
            float u = data[dindex+1];
            float v = data[dindex+2];

            // check for invalid values
            if (u >= 999.0 || u <= -999.0) {
                u = 0.0;
            }
            if (v >= 999.0 || v <= -999.0) {
                v = 0.0;
            }

            // calc magnitude
            float mag = sqrt(u * u + v * v);
            mag = norm(mag, magMin, magMax);

            // determine alpha transparency based on magnitude and offset
            float jp = (float) j / (float) (points-1);
            float progressMultiplier = (jp + offset + doffset) - floor(jp + offset + doffset);
            float alpha = lerp(alphaMin, alphaMax, mag * progressMultiplier);
            float thickness = lerp(lineWidthMin, lineWidthMax, mag * progressMultiplier);

            // adjust thickness based on latitude
            float latMultiplier = (float) abs(lat - (dh/2)) / (float) (dh/2);
            float thicknessMultiplier = lerp(lineWidthLatMin, lineWidthLatMax, latMultiplier);
            thickness *= thicknessMultiplier;
            if (thickness < 1.0) thickness = 1.0;

            // we are fading in/out
            if (fadeProgress < 1.0) {
                alpha = alpha * fadeProgress;
            }

            float x1 = x + u * velocityMult;
            float y1 = y + (-v) * velocityMult;

            // clamp y
            if (y1 < 0.0) {
                y1 = 0.0;
            }
            if (y1 > (th-1.0)) {
                y1 = th-1.0;
            }

            // check for no movement
            if (x==x1 && y==y1) {
                break;

            // check for invisible line
            } else if (alpha < 1.0) {
                // continue

            // wrap from left to right
            } else if (x1 < 0) {
                float2 intersection = lineIntersection(x, y, x1, y1, (float) 0.0, (float) 0.0, (float) 0.0, th);
                if (intersection.y > 0.0) {
                    drawLine(result, (int) round(x), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                    drawLine(result, (int) round((float) (tw-1.0) + x1), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                }

            // wrap from right to left
            } else if (x1 > tw-1.0) {
                float2 intersection = lineIntersection(x, y, x1, y1, (float) (tw-1.0), (float) 0.0, (float) (tw-1.0), th);
                if (intersection.y > 0.0) {
                    drawLine(result, (int) round(x), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                    drawLine(result, (int) round((float) x1 - (float)(tw-1.0)), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                }

            // draw it normally
            } else {
                drawLine(result, (int) round(x), (int) round(y), (int) round(x1), (int) round(y1), (int) tw, (int) th, round(alpha), (int) thickness);
            }

            // wrap x
            x1 = wrap(x1, 0.0, tw-1);
            dx = x1 / tw;
            dy = y1 / th;

            x = x1;
            y = y1;
        }
    }
    """ % (w, dw, dh, tw, th, offset, p["mag_range"][0], p["mag_range"][1],
           p["alpha_range"][0], p["alpha_range"][1], p["velocity_multiplier"],
           fadeProgress, p["linewidth_range"][0], p["linewidth_range"][1],
           p["linewidth_lat_range"][0], p["linewidth_lat_range"][1])

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        # Parenthesised form is valid on both Python 2 and Python 3.
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=fData)
    inPData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pData)
    # The kernel only writes the pixels that lie on a drawn line, so the
    # device buffer is seeded with the zero-filled host array; otherwise the
    # untouched pixels would hold undefined values.
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=result)

    # One work item per particle.
    prg.getParticles(queue, (h, ), None, inData, inPData, outResult)

    # Copy result
    cl.enqueue_copy(queue, result, outResult)
    result = result.reshape((th, tw))
    result = result.astype(np.uint8)

    return result
def get_all_cl_gpus():
    """Return one flat list with the GPU devices of every OpenCL platform.

    Platforms are visited in the order ``cl.get_platforms()`` reports them,
    and each platform's GPU devices keep the order returned by
    ``get_devices``.
    """
    return [
        gpu
        for cl_platform in cl.get_platforms()
        for gpu in cl_platform.get_devices(cl.device_type.GPU)
    ]
def addParticlesToImage(baseImage, colorImage, particles, p):
    """Blend a colour image over a base image, weighted by a particle mask.

    For every pixel the OpenCL kernel reads one byte from ``particles``,
    scales it to an alpha in [0, 1], sharpens it with
    ``pow(2 * alpha^2, 1 - p["line_visibility"])`` (clamped to 1) and mixes
    the first three channels of ``colorImage`` and ``baseImage`` with it.
    Pixels whose particle value is 0 are copied from ``baseImage``.

    Parameters
    ----------
    baseImage, colorImage :
        Anything ``np.array`` accepts; both are converted to uint8.
        ``colorImage`` must be three-dimensional (h, w, dim).
        NOTE(review): the kernel indexes ``baseImage`` with the same
        offsets, so it presumably must have the same shape - confirm.
    particles : numpy array
        Flattened and indexed as ``y * w + x``.
        NOTE(review): the kernel reads it as ``uchar``, so a uint8 array of
        h * w elements is assumed - confirm against the caller.
    p : mapping
        Only ``p["line_visibility"]`` is read.

    Returns
    -------
    numpy uint8 array with the shape of ``colorImage``.
    """
    basePx = np.array(baseImage).astype(np.uint8)
    colorPx = np.array(colorImage).astype(np.uint8)
    shape = colorPx.shape
    h, w, dim = shape

    # The kernel works on flat buffers.
    basePx = basePx.reshape(-1)
    colorPx = colorPx.reshape(-1)
    particles = particles.reshape(-1)

    # the kernel function
    # NOTE(review): only channels i, i+1 and i+2 are written; with dim > 3
    # the remaining bytes of the output buffer are never initialised.
    src = """
    __kernel void addParticles(__global uchar *base, __global uchar *colors, __global uchar *particles, __global uchar *result){
        int w = %d;
        int dim = %d;
        float power = 1.0 - %f; // lower number = more visible lines
        int posx = get_global_id(1);
        int posy = get_global_id(0);
        int i = posy * w * dim + posx * dim;
        int j = posy * w + posx;
        float alpha = (float) particles[j] / 255.0;
        int r = colors[i];
        int g = colors[i+1];
        int b = colors[i+2];

        if (alpha > 0) {
            alpha = pow(alpha*alpha + alpha*alpha, power);
            if (alpha > 1.0) {
                alpha = 1.0;
            }
            float inv = 1.0 - alpha;
            r = (int) round((r * alpha) + ((float) base[i] * inv));
            g = (int) round((g * alpha) + ((float) base[i+1] * inv));
            b = (int) round((b * alpha) + ((float) base[i+2] * inv));
        } else {
            r = base[i];
            g = base[i+1];
            b = base[i+2];
        }

        result[i] = r;
        result[i+1] = g;
        result[i+2] = b;
    }
    """ % (w, dim, p["line_visibility"])

    # Get platforms, both CPU and GPU (only the first platform is used)
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs:
        ctx = cl.Context(devices=GPUs)
    else:
        # Call form of print: valid on Python 3 and prints the same text on
        # Python 2 (the statement form was a SyntaxError on Python 3).
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=basePx)
    inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=colorPx)
    inC = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=particles)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, colorPx.nbytes)

    # One work item per pixel: global id 0 is the row, id 1 the column.
    prg.addParticles(queue, [h, w], None, inA, inB, inC, outResult)

    # Copy result
    result = np.empty_like(colorPx)
    cl.enqueue_copy(queue, result, outResult)
    result = result.reshape(shape)
    return result
block = sixtracklib.cBlock.from_line(line) cbeam = bref.copy().reshape(-1)[:npart] st = time.time() block.track_cl(cbeam, nturn=nturn, turnbyturn=True) st = time.time() - st perfgpu = st / npart / nturn * 1e6 print("GPU part %4d, turn %4d: %10.3f usec/part*turn" % (npart, nturn, perfgpu)) block = sixtracklib.cBlock.from_line(line) npart2 = npart / 100 cbeam = bref.copy().reshape(-1)[:npart2] st = time.time() block.track(cbeam, nturn=nturn, turnbyturn=True) st = time.time() - st perfcpu = st / npart2 / nturn * 1e6 print("CPU part %4d, turn %4d: %10.3f usec/part*turn" % (npart2, nturn, perfcpu)) print("GPU/CPU : %g" % (perfcpu / perfgpu)) return st, npart, nturn, perfgpu, perfcpu out = open(time.strftime("bench_%Y%M%dT%H%m%S.txt"), 'w') out.write("#%s" % pyopencl.get_platforms()[0].get_devices()[0]) for npart in [100, 1000, 2000, 5000, 10000, 20000]: for nturn in [1, 2, 5, 10]: st, npart, nturn, perfgpu, perfcpu = mkbench(npart, nturn) fmt = "%5d %5d %10.3f %10.3f %10.3f\n" out.write(fmt % (npart, nturn, perfgpu, perfcpu, perfcpu / perfgpu))
import pyopencl as cl
import numpy as np
# NOTE(review): the standard-library ``timeit`` module does not provide
# ``timeit_repeat``; presumably a project-local module shadows it - confirm.
from timeit import timeit_repeat
from pyopencl.algorithm import RadixSort
from pyopencl.bitonic_sort import BitonicSort
from pyopencl import clrandom
from pyopencl.scan import GenericScanKernel

# Hard-coded device choice: first device of the SECOND platform. This raises
# IndexError on machines that expose fewer than two OpenCL platforms.
device = cl.get_platforms()[1].get_devices()[0]
ctx = cl.Context([device])
queue = cl.CommandQueue(ctx)

# Value handed to the ``timeit_repeat`` decorator below.
reps = 64


@timeit_repeat(reps)
def test_radix_speed(buff, sorter):
    """Run ``sorter`` on ``buff`` and wait on element 1 of what it returns."""
    sorter(buff)[1].wait()


@timeit_repeat(reps)
def test_bitonic_speed(buff, sorter):
    """Run ``sorter`` on ``buff`` and wait on element 1 of what it returns."""
    sorter(buff)[1].wait()


@timeit_repeat(reps)
def test_numpy_speed(buff):
    """Sort ``buff`` with ``np.sort``; the sorted copy is discarded."""
    np.sort(buff)


from collections import defaultdict
#!/usr/bin/env python """ Basic 2d histogram. """ import time import pyopencl as cl import pyopencl.array import numpy as np # Select the desired OpenCL platform; you shouldn't need to change this: NAME = 'NVIDIA CUDA' platforms = cl.get_platforms() devs = None for platform in platforms: if platform.name == NAME: devs = platform.get_devices() # Set up a command queue: ctx = cl.Context(devs) queue = cl.CommandQueue(ctx) # Compute histogram in Python: def hist(x): bins = np.zeros(256, np.uint32) for v in x.flat: bins[v] += 1 return bins
def __init__(self, descriptor, geometry, moments, collide, pop_eq_src='', boundary_src='', platform=0, precision='single', layout=None, padding=None, align=False, opengl=False):
    """Create the OpenCL context, queue and memory, then build the kernel.

    Parameters
    ----------
    descriptor, geometry, moments, collide, pop_eq_src, boundary_src, layout :
        Stored on the instance; ``descriptor`` is also passed to ``Memory``
        and ``geometry`` to ``Grid``.
    platform : int
        Index into ``cl.get_platforms()``.
    precision : {'single', 'double'}
        Selects the numpy float type / C type name pair and the compiler
        arguments.
    padding :
        Passed to ``Grid``.
    align, opengl :
        Passed to ``Memory``. ``opengl`` additionally requests a context
        with GL-sharing properties.

    Raises
    ------
    ValueError
        If ``precision`` is neither 'single' nor 'double'. Previously this
        surfaced later as "'NoneType' object is not subscriptable".
    """
    float_types = {
        'single': (numpy.float32, 'float'),
        'double': (numpy.float64, 'double'),
    }
    if precision not in float_types:
        raise ValueError(
            "precision must be 'single' or 'double', got %r" % (precision,))

    self.descriptor = descriptor
    self.geometry = geometry
    self.grid = Grid(self.geometry, padding)
    self.time = 0

    self.float_type = float_types[precision]

    self.mako_lookup = TemplateLookup(directories=[Path(__file__).parent])

    self.platform = cl.get_platforms()[platform]

    if opengl:
        try:
            self.context = cl.Context(
                properties=[(cl.context_properties.PLATFORM, self.platform)]
                + get_gl_sharing_context_properties())
        except Exception:
            # Best-effort fallback kept from the original: retry with the
            # platform's first device named explicitly. ``Exception``
            # instead of a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            self.context = cl.Context(
                properties=[(cl.context_properties.PLATFORM, self.platform)]
                + get_gl_sharing_context_properties(),
                devices=[self.platform.get_devices()[0]])
    else:
        self.context = cl.Context(
            properties=[(cl.context_properties.PLATFORM, self.platform)])

    self.queue = cl.CommandQueue(self.context)

    self.memory = Memory(self.descriptor, self.grid, self.context,
                         self.float_type[0], align, opengl)
    self.tick = False

    self.moments = moments
    self.collide = collide

    self.pop_eq_src = pop_eq_src
    self.boundary_src = boundary_src

    self.layout = layout

    self.compiler_args = {
        'single': '-cl-single-precision-constant -cl-fast-relaxed-math',
        'double': '-cl-fast-relaxed-math'
    }.get(precision, None)

    # build_kernel() is expected to set self.program, used right below.
    self.build_kernel()

    self.program.equilibrilize(self.queue, self.grid.size(), self.layout,
                               self.memory.cl_pop_a,
                               self.memory.cl_pop_b).wait()

    # numpy.ndarray allocates without initialising: contents are arbitrary
    # until written.
    self.material = numpy.ndarray(shape=(self.memory.volume, 1),
                                  dtype=numpy.int32)
def main3d(Run):
    """Accumulate all frames of all runs into the enabled outputs and save them.

    ``Run`` supplies the parameters (``Run.params``), the per-run frame and
    motor data, and the flags ``making_volume``, ``making_shell``,
    ``making_slice`` and ``making_pole_figure`` that select which outputs
    are produced. When ``params.gpu_enable`` is non-zero the accumulation is
    done by OpenCL kernels built from ``kernelCode.cl``; otherwise the
    matching ``fillvolume`` functions are called. Each output is divided by
    its hit-count mask where the mask is non-zero and then written out.

    Side effects: mutates ``Run.params.phi_step`` and, for disabled outputs,
    the corresponding ``Run.number_of_*`` / size attributes; writes files;
    prints progress.
    """
    tbegin = time.time()
    params = Run.params
    params.phi_step = np.array(params.phi_step, dtype=np.float32)
    ascii_gen_list = params.symmetry_operators
    ops_list = genlist2oplist(ascii_gen_list)
    # Symmetry is applied only when more than one operator is present.
    apply_sym = 0
    if len(ops_list) > 1:
        apply_sym = 1
    number_of_run = params.number_of_run
    Bmatrix = Run.Bmatrix
    Bi = np.linalg.inv(Bmatrix)
    flist = Run.flist
    # Total number of frames over all runs, for the progress bar.
    total = 0
    for run in range(number_of_run):
        total += len(flist[run])
    p = ProgressBar(total)
    Filter = fabio.open(params.maskFile).data.astype(np.float32)
    (dim1, dim2) = Filter.shape
    last_run = 0
    # Disabled outputs get size 1 so the kernel parameters below stay valid.
    if not Run.making_volume:
        Run.number_of_volume = int(1)
        Run.cube_dim = int(1)
    if not Run.making_shell:
        Run.number_of_shell = int(1)
        Run.shell_dim = int(1)
    if not Run.making_slice:
        Run.number_of_slice = int(1)
        Run.slice_dim = int(1)
    if not Run.making_pole_figure:
        Run.number_of_figure = int(1)
        Run.pole_size = int(1)
    #GPU
    gpu_enable = int(params.gpu_enable)
    if gpu_enable:
        platform = cl.get_platforms()[int(params.platform_id)]
        device = platform.get_devices()[int(params.device_id)]
        context = cl.Context([device])
        queue = cl.CommandQueue(context)
        mf = cl.mem_flags
        # NOTE(review): the file object is never closed explicitly.
        kernel_code = open("kernelCode.cl", "r").read()
        # Values substituted into the kernel source with the % operator.
        kernel_pars = {"number_of_volume":Run.number_of_volume, \
                       "nx":Run.cube_dim, \
                       "ny":Run.cube_dim, \
                       "nz":Run.cube_dim, \
                       "dim1":dim1, \
                       "dim2":dim2, \
                       "dimsym":ops_list.shape[0],\
                       "number_of_shell": Run.number_of_shell,\
                       "sx": Run.shell_dim,\
                       "sy": Run.shell_dim,\
                       "sz": Run.shell_dim,\
                       "number_of_figure": Run.number_of_figure,\
                       "px": Run.pole_size,\
                       "py": Run.pole_size,\
                       "slice_size": Run.slice_dim }
        prog = cl.Program(context, kernel_code % kernel_pars).build()
        # data_gpu and Qfin_gpu are sized from Filter and Run.all_Q0[0];
        # their contents are overwritten for every frame / interpolation.
        # NOTE(review): assumes each frame and each Qfin has the same byte
        # size as the array used for allocation - confirm dtypes.
        data_gpu = cl.Buffer(context,
                             mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=Filter)
        Qfin_gpu = cl.Buffer(context,
                             mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=Run.all_Q0[0])
        Filter_gpu = cl.Buffer(context,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=Filter)
        symOps_gpu = cl.Buffer(context,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=ops_list)
    # Host accumulators (value sum + hit-count mask) for each enabled
    # output, mirrored into device buffers when the GPU path is used.
    if Run.making_volume:
        Volume = np.zeros(
            (Run.number_of_volume, Run.cube_dim, Run.cube_dim, Run.cube_dim),
            dtype=np.float32)
        Mask = np.zeros(
            (Run.number_of_volume, Run.cube_dim, Run.cube_dim, Run.cube_dim),
            dtype=np.uint32)
        if gpu_enable:
            volCenter_gpu = cl.Buffer(context,
                                      mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=Run.volume_center)
            volExtent_gpu = cl.Buffer(context,
                                      mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf=Run.volume_extent)
            Volume_gpu = cl.Buffer(context,
                                   mf.READ_WRITE | mf.COPY_HOST_PTR,
                                   hostbuf=Volume)
            Mask_gpu = cl.Buffer(context,
                                 mf.READ_WRITE | mf.COPY_HOST_PTR,
                                 hostbuf=Mask)
    if Run.making_shell:
        ShellVolume = np.zeros(
            (Run.number_of_shell, Run.shell_dim, Run.shell_dim, Run.shell_dim),
            dtype=np.float32)
        ShellMask = np.zeros(
            (Run.number_of_shell, Run.shell_dim, Run.shell_dim, Run.shell_dim),
            dtype=np.uint32)
        if gpu_enable:
            Q_shell_gpu = cl.Buffer(context,
                                    mf.READ_ONLY | mf.COPY_HOST_PTR,
                                    hostbuf=Run.Q_shell)
            shell_center_gpu = cl.Buffer(context,
                                         mf.READ_ONLY | mf.COPY_HOST_PTR,
                                         hostbuf=Run.shell_center)
            # Despite the _gpu suffix these two are host-side float32
            # scalars passed by value, not device buffers.
            # shell_extent_gpu = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=Run.shell_extent)
            shell_extent_gpu = np.float32(Run.shell_extent)
            # shell_thickness_gpu = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=Run.shell_thickness)
            shell_thickness_gpu = np.float32(Run.shell_thickness)
            ShellVolume_gpu = cl.Buffer(context,
                                        mf.READ_WRITE | mf.COPY_HOST_PTR,
                                        hostbuf=ShellVolume)
            ShellMask_gpu = cl.Buffer(context,
                                      mf.READ_WRITE | mf.COPY_HOST_PTR,
                                      hostbuf=ShellMask)
    if Run.making_slice:
        SliceImage = np.zeros(
            (Run.number_of_slice, Run.slice_dim, Run.slice_dim),
            dtype=np.float32)
        SliceMask = np.zeros(
            (Run.number_of_slice, Run.slice_dim, Run.slice_dim),
            dtype=np.uint32)
        if gpu_enable:
            G_gpu = cl.Buffer(context,
                              mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=Run.G)
            dQ0_gpu = cl.Buffer(context,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=Run.dQ0)
            dQ1_gpu = cl.Buffer(context,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=Run.dQ1)
            dQ2_gpu = cl.Buffer(context,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=Run.dQ2)
            Qoff_gpu = cl.Buffer(context,
                                 mf.READ_ONLY | mf.COPY_HOST_PTR,
                                 hostbuf=Run.Qoff)
            SliceImage_gpu = cl.Buffer(context,
                                       mf.READ_WRITE | mf.COPY_HOST_PTR,
                                       hostbuf=SliceImage)
            SliceMask_gpu = cl.Buffer(context,
                                      mf.READ_WRITE | mf.COPY_HOST_PTR,
                                      hostbuf=SliceMask)
    if Run.making_pole_figure:
        PoleData = np.zeros(
            (Run.number_of_figure, Run.pole_size, Run.pole_size),
            dtype=np.float32)
        PoleMask = np.zeros(
            (Run.number_of_figure, Run.pole_size, Run.pole_size),
            dtype=np.uint32)
        if gpu_enable:
            PoleData_gpu = cl.Buffer(context,
                                     mf.READ_WRITE | mf.COPY_HOST_PTR,
                                     hostbuf=PoleData)
            PoleMask_gpu = cl.Buffer(context,
                                     mf.READ_WRITE | mf.COPY_HOST_PTR,
                                     hostbuf=PoleMask)
            Qpole_gpu = cl.Buffer(context,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=Run.Qpole)
            # Host-side float32 scalar, passed to the kernel by value.
            # pole_thickness_gpu = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=Run.pole_thickness)
            pole_thickness_gpu = np.float32(Run.pole_thickness)
    #GPU
    # sample_angles = np.zeros(int(Run.params.sample_circles))
    # scanning_motor_index = Run.params.sample_axis.index(Run.params.scanning_motor)
    # sample_rotation_dir = list(Run.params.rot_dir)
    # print("Scanning motor: %s, index: %d"%(Run.params.scanning_motor, scanning_motor_index))
    print("Scanning motor: ", Run.params.scanning_motor)
    for run in range(number_of_run):
        nbfile = 0
        sample_angles = np.zeros(int(Run.params.sample_circles))
        print("Scanning motor: %s" % (Run.params.scanning_motor[run]))
        scanning_motor_index = Run.params.sample_axis.index(
            Run.params.scanning_motor[run])
        sample_rotation_dir = list(Run.params.rot_dir)
        print("Scanning motor: %s, index: %d" %
              (Run.params.scanning_motor[run], scanning_motor_index))
        # NOTE: ``id`` shadows the builtin of the same name inside this loop.
        for id in range(len(flist[run])):
            data = Run.allData_allRun[run][id]
            motors = Run.allMotor_allRun[run][id]
            # Mask the frame and divide by the per-run correction arrays.
            data = (data * Filter) / (Run.all_C3[run] * Run.all_POLA[run])
            for sc in range(int(Run.params.sample_circles)):
                sample_angles[sc] = motors[Run.params.sample_axis[sc]]
            U = Run.Umatrix
            if gpu_enable:
                cl.enqueue_copy(queue, data_gpu, data).wait()
            # Each frame is accumulated interp_factor times at shifted
            # scanning-motor angles.
            # NOTE(review): interphi is written back into sample_angles, so
            # for j > 0 the offset is added to the already shifted angle -
            # confirm this accumulation is intended.
            for j in range(Run.interp_factor):
                interphi = sample_angles[
                    scanning_motor_index] + j / Run.interp_factor * params.phi_step[
                        run]
                sample_angles[scanning_motor_index] = interphi
                R = Sample_Rotation(sample_angles, sample_rotation_dir)
                # Rotate the per-pixel Q vectors by R, then by U.
                Q = np.tensordot(Run.all_Q0[run], R.T, axes=([2], [1]))
                Qfin = np.tensordot(Q, U.T, axes=([2], [1]))
                if gpu_enable:
                    cl.enqueue_copy(queue, Qfin_gpu, Qfin).wait()
                if Run.making_volume:
                    if gpu_enable:
                        prog.volReconstruction(queue, data.shape, None,
                                               volCenter_gpu, volExtent_gpu,
                                               Volume_gpu, Mask_gpu, Qfin_gpu,
                                               data_gpu, Filter_gpu,
                                               np.int32(apply_sym),
                                               symOps_gpu).wait()
                    else:
                        fillvolume.volume(Run.volume_center, Run.volume_extent,
                                          Volume, Mask, Qfin, data, Filter,
                                          apply_sym, ops_list)
                if Run.making_shell:
                    if gpu_enable:
                        prog.extract_shell(
                            queue, data.shape, None, Q_shell_gpu,
                            shell_center_gpu, shell_extent_gpu,
                            shell_thickness_gpu, ShellVolume_gpu,
                            ShellMask_gpu, Qfin_gpu, data_gpu, Filter_gpu,
                            np.int32(apply_sym), symOps_gpu).wait()
                    else:
                        fillvolume.extract_shell(Run.Q_shell, Run.shell_center,
                                                 Run.shell_extent,
                                                 Run.shell_thickness,
                                                 ShellVolume, ShellMask, Qfin,
                                                 data, Filter, apply_sym,
                                                 ops_list)
                if Run.making_slice:
                    if gpu_enable:
                        prog.extract_slice(queue, data.shape, None, np.int32(Run.number_of_slice), dQ0_gpu, dQ1_gpu, dQ2_gpu, Qoff_gpu, SliceImage_gpu, SliceMask_gpu,\
                                           Qfin_gpu, data_gpu, Filter_gpu, np.int32(apply_sym), symOps_gpu, G_gpu).wait()
                    else:
                        fillvolume.extract_slice(Run.number_of_slice, Run.dQ0,
                                                 Run.dQ1, Run.dQ2, Run.Qoff,
                                                 SliceImage, SliceMask, Qfin,
                                                 data, Filter, apply_sym,
                                                 ops_list, Run.G)
                if Run.making_pole_figure:
                    if gpu_enable:
                        prog.stereo_projection(queue, data.shape, None, Qpole_gpu, pole_thickness_gpu, PoleData_gpu, PoleMask_gpu,\
                                               Qfin_gpu, data_gpu, Filter_gpu, np.int32(apply_sym), symOps_gpu).wait()
                    else:
                        fillvolume.stereo_projection(Run.Qpole,
                                                     Run.pole_thickness,
                                                     PoleData, PoleMask, Qfin,
                                                     data, Filter, apply_sym,
                                                     ops_list)
                print('interpolation #%d on %d' % (j + 1, Run.interp_factor))
            nbfile += 1
            timeI2 = time.time()
            # Progress counts frames over all runs processed so far.
            p.update_time(nbfile + last_run)
            print(
                '------------------------------------------------------------')
            print(p)
            print(
                '------------------------------------------------------------')
            print('\n')
        last_run += nbfile
    print('3D Intensity Distribution : Done')
    ##################################
    #GPU
    # The input-side device buffers are no longer needed.
    if gpu_enable:
        Qfin_gpu.release()
        data_gpu.release()
        Filter_gpu.release()
        symOps_gpu.release()
    #GPU
    if Run.making_volume:
        if gpu_enable:
            # Getting data from gpu back
            cl.enqueue_copy(queue, Volume, Volume_gpu).wait()
            cl.enqueue_copy(queue, Mask, Mask_gpu).wait()
            Volume_gpu.release()
            Mask_gpu.release()
            volExtent_gpu.release()
            volCenter_gpu.release()
        for v in range(Run.number_of_volume):
            # Average: divide accumulated values by the hit count where the
            # count is non-zero.
            filter_ids = np.where(Mask[v] != 0)
            Volume[v][filter_ids] = Volume[v][filter_ids] / Mask[v][filter_ids]
            save_cmap(Run.volumeName[v], Volume[v])
    if Run.making_shell:
        if gpu_enable:
            cl.enqueue_copy(queue, ShellVolume, ShellVolume_gpu).wait()
            cl.enqueue_copy(queue, ShellMask, ShellMask_gpu).wait()
            ShellVolume_gpu.release()
            ShellMask_gpu.release()
            Q_shell_gpu.release()
            shell_center_gpu.release()
            # shell_extent_gpu.release()
            # shell_thickness_gpu.release()
        for sh in range(Run.number_of_shell):
            filter_ids = np.where(ShellMask[sh] != 0)
            ShellVolume[sh][filter_ids] = ShellVolume[sh][
                filter_ids] / ShellMask[sh][filter_ids]
            save_cmap(Run.shellName[sh], ShellVolume[sh])
    if Run.making_slice:
        if gpu_enable:
            cl.enqueue_copy(queue, SliceImage, SliceImage_gpu).wait()
            cl.enqueue_copy(queue, SliceMask, SliceMask_gpu).wait()
            SliceImage_gpu.release()
            SliceMask_gpu.release()
        for s in range(Run.number_of_slice):
            # Average where hit, zero elsewhere; then scale and save as an
            # int32 CBF image.
            mapout = np.zeros_like(SliceImage[s])
            mapout[np.where(SliceMask[s] != 0)] = SliceImage[s][np.where(
                SliceMask[s] != 0)] / SliceMask[s][np.where(SliceMask[s] != 0)]
            tmp2 = mapout * params.scale_factor
            wi = fabio.cbfimage.cbfimage(data=tmp2.astype(np.int32))
            mapOutName = params.slice_outname[s]
            wi.write(mapOutName)
            Qoutname = mapOutName.split(".")[0] + "_hkl.h5"
            print("Slice %s saved." % mapOutName)
            # Build the coordinate grid of the slice and transform it with
            # the inverses of Run.G[s] and Bmatrix before saving it as "/Q".
            Qoffset = np.dot(Run.Qoff[s], Run.G[s])
            x = np.linspace(-Run.dQ1[s], Run.dQ1[s], Run.slice_dim)
            y = np.linspace(-Run.dQ2[s], Run.dQ2[s], Run.slice_dim)
            x, y = np.meshgrid(x, y)
            z = np.zeros(x.shape)
            q = np.zeros((Run.slice_dim, Run.slice_dim, 3))
            q[:, :, 0] = x + Qoffset[0]
            q[:, :, 1] = y + Qoffset[1]
            q[:, :, 2] = z + Qoffset[2]
            Gi = np.linalg.inv(Run.G[s])
            Qn = np.tensordot(q, Gi, axes=([2], [1]))
            HKL = np.tensordot(Qn, Bi, axes=([2], [1]))
            h5file = h5py.File(Qoutname, "w")
            h5file.create_dataset("/Q",
                                  data=HKL,
                                  compression='gzip',
                                  compression_opts=9)
            h5file.create_dataset("/data",
                                  data=tmp2,
                                  compression='gzip',
                                  compression_opts=9)
            h5file.close()
            print("HKL coordinates saved.")
            # Only used by the disabled call below.
            rsmViewer_fn = mapOutName.split(".")[0] + "_rsmviewer.h5"
            # save2RSMviewer(tmp2, HKL, rsmViewer_fn)
    if Run.making_pole_figure:
        if gpu_enable:
            cl.enqueue_copy(queue, PoleData, PoleData_gpu).wait()
            cl.enqueue_copy(queue, PoleMask, PoleMask_gpu).wait()
            PoleData_gpu.release()
            PoleMask_gpu.release()
        # NOTE: ``p`` (the ProgressBar above) is rebound as the loop index.
        for p in range(Run.number_of_figure):
            mapout = np.zeros_like(PoleData[p])
            mapout[np.where(PoleMask[p] != 0)] = PoleData[p][np.where(
                PoleMask[p] != 0)] / PoleMask[p][np.where(PoleMask[p] != 0)]
            tmp = mapout * params.scale_factor
            wi = fabio.cbfimage.cbfimage(data=tmp.astype(np.int32))
            mapOutName = Run.pole_name[p]
            wi.write(mapOutName)
            print("Pole %s saved." % mapOutName)
    ###################################
    print('Normal END')
    gc.collect()
    tend = time.time()
    print("Total time for this operation: %.3f s" % (tend - tbegin))
# data points must be a multiple of workers a = numpy.random.rand(data_points).astype(numpy.float32) b = numpy.random.rand(data_points).astype(numpy.float32) c_result = numpy.empty_like(a) # Speed in normal CPU usage time1 = time() c_temp = (a + b) # adds each element in a to its corresponding element in b c_temp = c_temp * c_temp # element-wise multiplication c_result = c_temp * (a / 2.0) # element-wise half a and multiply time2 = time() print("Execution time of test without OpenCL: ", time2 - time1, "s") for platform in cl.get_platforms(): for device in platform.get_devices(): print( "===============================================================") print("Platform name:", platform.name) print("Platform profile:", platform.profile) print("Platform vendor:", platform.vendor) print("Platform version:", platform.version) print( "---------------------------------------------------------------") print("Device name:", device.name) print("Device type:", cl.device_type.to_string(device.type)) print("Device memory: ", device.global_mem_size // 1024 // 1024, 'MB') print("Device max clock speed:", device.max_clock_frequency, 'MHz') print("Device compute units:", device.max_compute_units) print("Device max work group size:", device.max_work_group_size)
def __init__(self, platform, salt, iter, debug, N=0, r=0, p=0, length=0x20):
    """Validate the inputs, pack the salt and create the context and queue.

    Parameters
    ----------
    platform : int
        Index into ``cl.get_platforms()``; all devices of that platform are
        placed in one context.
    salt : bytes
        At most 64 bytes. It is reinterpreted as uint32 words, so its
        length must be a multiple of 4. ``self.salt`` is the salt length
        followed by those words.
    iter : int
        Iteration count, stored as a uint32 array in ``self.n_iter``.
    debug :
        Truthy enables PYOPENCL_COMPILER_OUTPUT and selects the stub kernel
        source below.
    N, r, p :
        Stored unchanged on the instance.
    length :
        Not used in this method.

    Raises
    ------
    TypeError
        If ``salt`` is not bytes or ``iter`` is not an integer.
    IndexError
        If ``platform`` is not a valid platform index.

    Notes
    -----
    A salt longer than 64 bytes prints a message and terminates the
    process through ``exit(0)``, as before.
    """
    # The original wrote ``assert ("message")``, which asserts a non-empty
    # string and therefore could never fail; raise explicitly instead.
    if not isinstance(salt, bytes):
        raise TypeError("Parameter salt has to be type of bytes")
    if not isinstance(iter, (int, np.integer)):
        raise TypeError("Parameter Iteration has to be type of int")
    platforms = cl.get_platforms()
    # ``>=`` rather than ``>``: index len(platforms) is already out of range.
    if platform >= len(platforms):
        raise IndexError("Selected platform %d doesn't exist" % platform)

    saltlen = int(len(salt))
    if (saltlen > int(64)):
        print('Salt longer than 64 chars is not supported!')
        exit(0)

    # np.frombuffer replaces the deprecated binary mode of np.fromstring;
    # np.append below copies, so the read-only view is never written to.
    n_salt = np.frombuffer(salt, dtype=np.uint32)
    n_saltlen = np.array([len(salt)], dtype=np.uint32)
    self.n_iter = np.array(iter, dtype=np.uint32)
    self.salt = np.append(n_saltlen, n_salt)
    self.N = N  #np.array(N, dtype=np.uint32)
    self.r = r  #np.array(r, dtype=np.uint32)
    self.p = p  #np.array(p, dtype=np.uint32)

    # Get platforms
    devices = platforms[platform].get_devices()
    self.workgroupsize = 60000

    #Create context for GPU/CPU
    print("Using Platform %d:" % platform)
    self.ctx = cl.Context(devices)
    for device in devices:
        print(
            '--------------------------------------------------------------------------'
        )
        print(' Device - Name: ' + device.name)
        print(' Device - Type: ' + cl.device_type.to_string(device.type))
        print(' Device - Compute Units: {0}'.format(
            device.max_compute_units))
        print(' Device - Max Work Group Size: {0:.0f}'.format(
            device.max_work_group_size))
        # Use the smallest maximum over all devices (capped at 60000).
        if (device.max_work_group_size < self.workgroupsize):
            self.workgroupsize = device.max_work_group_size

    print("\nUsing work group size of %d\n" % self.workgroupsize)

    # Create queue for each kernel execution
    self.queue = cl.CommandQueue(self.ctx)

    # Kernel function
    # NOTE(review): ``src`` is assigned but not used in this method.
    src = ""
    if (debug):
        os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
        src = """
        typedef struct {
            unsigned int length;
            unsigned int buffer[32/4];
        } inbuf;

        typedef struct {
            unsigned int buffer[32/4];
        } outbuf;

        static void pbkdf(__global const unsigned int *pass, int pass_len, const unsigned int *salt, int salt_len, int iter, unsigned int* hash, unsigned int hash_len)
        {
            hash[0]=pass_len;
            hash[1]=pass[0];
            hash[2]=hash_len;
            hash[3]=iter;
            hash[4]=salt_len;
            hash[5]=salt[0];
        }

        __kernel void func_pbkdf2(__global const inbuf * inbuffer, __global outbuf * outbuffer, __global const inbuf * salt, const int iterations)
        {
            unsigned int idx = get_global_id(0);
            unsigned int hash[32/4]={0};
            unsigned int ssalt[32/4]={0};
            ssalt[0]=salt[0].buffer[0];
            ssalt[1]=salt[0].buffer[1];
            ssalt[2]=salt[0].buffer[2];
            ssalt[3]=salt[0].buffer[3];
            ssalt[4]=salt[0].buffer[4];
            ssalt[5]=salt[0].buffer[5];
            ssalt[6]=salt[0].buffer[6];
            ssalt[7]=salt[0].buffer[7];
            int salt_len=salt[0].length;
            pbkdf(inbuffer[idx].buffer, inbuffer[idx].length, ssalt, salt_len, iterations, hash,32);
            outbuffer[idx].buffer[0]=hash[0];
            outbuffer[idx].buffer[1]=hash[1];
            outbuffer[idx].buffer[2]=hash[2];
            outbuffer[idx].buffer[3]=hash[3];
            outbuffer[idx].buffer[4]=hash[4];
            outbuffer[idx].buffer[5]=hash[5];
            outbuffer[idx].buffer[6]=hash[6];
            outbuffer[idx].buffer[7]=hash[7];
        }
        """
    else:
        os.environ['PYOPENCL_COMPILER_OUTPUT'] = '0'
def set_bfast_parameters(self, start_monitor, end_monitor, start_hist, freq, k, hfrac, trend, level, backend='opencl', verbose=0, device_id=0):
    '''Store the monitoring parameters and create the BFASTMonitor model.

    All arguments are stored as attributes of the same name. The model is
    created from ``start_monitor`` and the keyword arguments listed below;
    ``end_monitor`` and ``start_hist`` are stored but not passed to the
    model here. Finally the devices of the first OpenCL platform are
    printed, or a hint if that lookup fails.

    parameters:
    -----------
    start_monitor : datetime object
        A datetime object specifying the start of
        the monitoring phase.
    end_monitor : datetime object
        A datetime object specifying the end of
        the monitoring phase.
    start_hist : datetime object
        A datetime object specifying the start of
        the history phase.
    freq : int
        The frequency for the seasonal model.
    k : int
        The number of harmonic terms.
    hfrac : float
        Float in the interval (0,1) specifying the
        bandwidth relative to the sample size in
        the MOSUM/ME monitoring processes.
    trend : bool
        Whether a trend offset term shall be used or not
    level : float
        Significance level of the monitoring (and ROC,
        if selected) procedure, i.e., probability of
        type I error.
    backend : string, either 'opencl' or 'python' (default='opencl')
        Chooses what backend to use. opencl uses the GPU implementation,
        which is much faster.
    verbose : int, optional (default=0)
        The verbosity level (0=no output, 1=output)
    device_id : int, optional (default=0)
        Passed through to ``bfast.BFASTMonitor``.
    '''
    self.start_monitor = start_monitor
    self.end_monitor = end_monitor
    self.start_hist = start_hist
    self.freq = freq
    self.k = k
    self.hfrac = hfrac
    self.trend = trend
    self.level = level
    self.backend = backend
    self.verbose = verbose
    self.device_id = device_id

    self.model = bfast.BFASTMonitor(
        self.start_monitor,
        freq=freq,  # add these
        k=k,
        hfrac=hfrac,
        trend=trend,
        level=level,
        backend=backend,
        verbose=verbose,
        device_id=device_id,
    )

    # Diagnostic only: runs regardless of ``backend`` and never raises.
    try:
        print("device: ", pyopencl.get_platforms()[0].get_devices())
    except Exception:
        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not swallowed.
        print(
            "You selected openCL, but no device was found, are you sure you set up a gpu session?"
        )
def __init__(self, batchSize, maxT, maxC, kernelVariant=1, enableGPUDebug=False):
    """Set up OpenCL context, buffers and kernels for the given sizes.

    specify size: number of batch elements, number of time-steps, number of
    characters. Set kernelVariant to either 1 or 2. Set enableGPUDebug to
    True to debug kernel via CodeXL.

    The first GPU of the first OpenCL platform is used. The program is built
    from the file 'BestPathCL.cl' in the current working directory.

    Raises AssertionError if kernelVariant is not 1 or 2, if no platform or
    no GPU is found, or if maxT (variant 1) / maxC (variant 2) exceeds the
    work-group size reported for kernel1 on the device.
    """
    # force rebuild of program such that GPU debugger can attach to kernel
    self.enableGPUDebug = enableGPUDebug
    if enableGPUDebug:
        os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
        os.environ['PYOPENCL_NO_CACHE'] = '1'

    #consts
    self.batchSize = batchSize
    self.maxT = maxT
    self.maxC = maxC
    assert kernelVariant in [1, 2]
    self.kernelVariant = kernelVariant

    # platform, context, queue
    platforms = cl.get_platforms()
    assert platforms
    self.platform = platforms[0]  # take first platform
    devices = self.platform.get_devices(cl.device_type.GPU)  # get GPU devices
    assert devices
    self.device = devices[0]  # take first GPU
    self.context = cl.Context([self.device])  # context contains the first GPU
    self.queue = cl.CommandQueue(self.context, self.device)  # command queue to first GPU

    # buffer
    sizeOfFloat32 = 4
    batchBufSize = batchSize * maxC * maxT * sizeOfFloat32
    self.batchBuf = cl.Buffer(self.context, cl.mem_flags.READ_ONLY, size=batchBufSize, hostbuf=None)
    self.res = np.zeros([batchSize, maxT]).astype(np.int32)
    self.resBuf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY, self.res.nbytes)
    # tmpBuf is argument 1 of kernel1 and argument 0 of kernel2 in variant 2,
    # i.e. it is handed from one pass to the next. Reading a WRITE_ONLY
    # buffer inside a kernel is undefined in OpenCL, so allocate READ_WRITE.
    self.tmpBuf = cl.Buffer(self.context, cl.mem_flags.READ_WRITE, self.res.nbytes)

    # compile program and use defines for program-constants to avoid passing private variables
    buildOptions = '-D STEP_BEGIN={} -D MAX_T={} -D MAX_C={}'.format(2 ** math.ceil(math.log2(maxT)), maxT, maxC)
    # Context manager closes the kernel source file (it was leaked before).
    with open('BestPathCL.cl') as kernelFile:
        kernelSource = kernelFile.read()
    self.program = cl.Program(self.context, kernelSource).build(buildOptions)

    # variant 1: single pass
    if kernelVariant == 1:
        self.kernel1 = cl.Kernel(self.program, 'bestPathAndCollapse')
        self.kernel1.set_arg(0, self.batchBuf)
        self.kernel1.set_arg(1, self.resBuf)

        # all time-steps must fit into a work-group
        assert maxT <= self.kernel1.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

    # variant 2: two passes
    else:
        # kernel1: calculate best path
        self.kernel1 = cl.Kernel(self.program, 'bestPath')
        self.kernel1.set_arg(0, self.batchBuf)
        self.kernel1.set_arg(1, self.tmpBuf)

        # kernel2: collapse best path
        self.kernel2 = cl.Kernel(self.program, 'collapsePath')
        self.kernel2.set_arg(0, self.tmpBuf)
        self.kernel2.set_arg(1, self.resBuf)

        # all chars must fit into a work-group
        assert maxC <= self.kernel1.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)
def printplatforms(self):
    """Print index, name and vendor of every available OpenCL platform."""
    for index, cl_platform in enumerate(cl.get_platforms()):
        print('Platform %d - Name %s, Vendor %s' %
              (index, cl_platform.name, cl_platform.vendor))
import pyopencl as cl
import numpy as np
from pyopencl import cltypes
import os
from nncl import nn, losses
from nncl.layers import layer

if __name__ == "__main__":
    # Hard-coded device choice: first device of the SECOND platform; raises
    # IndexError when fewer than two OpenCL platforms are available.
    ctx = cl.Context([cl.get_platforms()[1].get_devices()[0]])
    queue = cl.CommandQueue(ctx)
    net = nn.Network(ctx)
    # The data file is located relative to this script; the header row is
    # skipped.
    iris = np.loadtxt(os.path.dirname(os.path.realpath(__file__)) +
                      '/../data/iris.csv',
                      skiprows=1,
                      delimiter=',',
                      dtype=cltypes.float)
    # Fixed seed makes the row shuffle reproducible.
    np.random.seed(420)
    np.random.shuffle(iris, )
    # All columns but the last are inputs; the last column is the target.
    x = iris[:, :-1]
    y = iris[:, -1:]
    # convert y to sparse categorical,
    # ie. row with class 1 will have [0,1,0]
    # 3 output classes
    # sparse_y = np.zeros((x.shape[0], 3))
    # for idx, c in enumerate(iris[:, -1]):
    #     sparse_y[idx, int(c)] = 1
    # The first 33% of the shuffled rows become the training set, the
    # remaining rows the test set.
    split_idx = int(0.33 * x.shape[0])
    x_train = x[:split_idx]
    y_train = y[:split_idx]
    x_test = x[split_idx:]
    y_test = y[split_idx:]
# Script fragment: problem constants, OpenCL setup and the time loop.
# ``mesh``, ``ic``, ``step``, ``lax``, ``xmin``, ``xmax`` and ``imax`` are
# defined elsewhere in the file.
cfl = .5
time_max = .214
p4p1 = 10.  # pressure ratio
r4r1 = 8.  # density ratio
gamma = 1.4  # ratio of specific heat
x = mesh(xmin, xmax, imax)
dx = x[1] - x[0]
# initial condition
u = ic(imax)
t = 0.
# Hard-coded device choice: SECOND device of the first platform; raises
# IndexError when that platform exposes only one device.
platform = cl.get_platforms()[0]
device = platform.get_devices()[1]
ctx = cl.Context([device])
#ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
start_time = time.time()
# NOTE(review): ``t`` is not modified inside this loop body; presumably
# step() or lax() advance it as a global - confirm, otherwise the loop
# never terminates.
while (t < time_max):
    # time step
    dt = step()
    # solver
    lax()
import numpy as np
import os
from matplotlib import pyplot as plt
import cv2 as cv
import random
import pickle
import sys
import logging
import time
import datetime
import pyscreenshot as ImageGrab
import ctypes
import pyopencl as cl

# (1) setup OpenCL
# Each indexing step below raises IndexError if the list is empty, i.e. when
# no platform is installed or the first platform has no GPU.
platforms = cl.get_platforms()  # a platform corresponds to a driver (e.g. AMD)
platform = platforms[0]  # take first platform
devices = platform.get_devices(cl.device_type.GPU)  # get GPU devices of selected platform
device = devices[0]  # take first GPU
context = cl.Context([device])  # put selected GPU into context object
queue = cl.CommandQueue(context, device)  # create command queue for selected GPU and context
print("Platform: {} Device:{}".format(platform, device))

# Parse the screen size
# ctypes.windll only exists on Windows, so this script is Windows-only.
user32 = ctypes.windll.user32
# (width, height) tuple from GetSystemMetrics indices 0 and 1.
screensize = user32.GetSystemMetrics(0), user32.GetSystemMetrics(1)

# Initialize the parameters
confThreshold = 0.20  #Confidence threshold
nmsThreshold = 0.40  #Non-maximum suppression threshold
# Notebook fragment: runs the CUDA solver checks, then lists every OpenCL
# platform and device. ``run_simulation``, ``TaskGenerator``, ``CudaSolver``,
# ``plot_and_fit``, ``check`` and ``test_fits`` are defined earlier.
result = run_simulation(TaskGenerator(dt=0.1), CudaSolver(), True, 16, 1000, 20)
fits = plot_and_fit(result)
q = fits.plot()

# The indexed value is discarded; this expression only has an effect as the
# last expression of an interactive cell.
check(TaskGenerator(), CudaSolver(), True)[0]

test_fits(CudaSolver)

# Bare string expression used as a section heading; it has no runtime effect.
"""# OpenCL"""

import pyopencl as cl  # Import the OpenCL GPU computing API
import pyopencl.array as cl_array

print('\n' + '=' * 60 + '\nOpenCL Platforms and Devices')
for platform in cl.get_platforms():  # Print each platform on this computer
    print('=' * 60)
    print('Platform - Name: ' + platform.name)
    print('Platform - Vendor: ' + platform.vendor)
    print('Platform - Version: ' + platform.version)
    print('Platform - Profile: ' + platform.profile)
    for device in platform.get_devices():  # Print each device per-platform
        print(' ' + '-' * 56)
        print(' Device - Name: ' + device.name)
        print(' Device - Type: ' + cl.device_type.to_string(device.type))
        print(' Device - Max Clock Speed: {0} Mhz'.format(device.max_clock_frequency))
        print(' Device - Compute Units: {0}'.format(device.max_compute_units))
        # Sizes are reported in bytes and converted for display.
        print(' Device - Local Memory: {0:.0f} KB'.format(device.local_mem_size/1024))
        print(' Device - Constant Memory: {0:.0f} KB'.format(device.max_constant_buffer_size/1024))
        print(' Device - Global Memory: {0:.0f} GB'.format(device.global_mem_size/1073741824.0))
print('\n')