def setup_opencl(data, cube_size):
    """Run the ``score_matrix_to_rms`` kernel and return its score buffer.

    A context and queue are created over every device reported by
    ``pycl.clGetDeviceIDs()``, the program in ``SOURCE`` is built, the
    arrays ``data['in_r']``, ``data['in_g']``, ``data['in_b']`` and
    ``data['out_r']`` are uploaded, and the kernel is launched with a
    global size of ``cube_size ** 3``.

    Args:
        data: Mapping providing the ``'in_r'``, ``'in_g'``, ``'in_b'`` and
            ``'out_r'`` arrays.
        cube_size: Passed to the kernel as its last argument; its cube is
            the number of score slots and the launch size.

    Returns:
        The array read back from the score buffer after the kernel ran.
    """
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        devices = pycl.clGetDeviceIDs()
        ctx = pycl.clCreateContext(devices=devices)
        queue = pycl.clCreateCommandQueue(ctx)
        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()
        score_matrix = program['score_matrix_to_rms']
        # Five buffer arguments followed by two integer arguments.
        score_matrix.argtypes = (pycl.cl_mem,) * 5 + (pycl.cl_int,) * 2

    sub_divisions = cube_size ** 3

    with timeify("Creating buffers"):
        uploads = [
            pycl.buffer_from_pyarray(queue, data[channel], blocking=blocking)
            for channel in ('in_r', 'in_g', 'in_b')
        ]
        (in_r_buf, in_evt1), (in_g_buf, in_evt2), (in_b_buf, in_evt3) = uploads

        out_r = data['out_r']
        out_r_buf, in_evt4 = pycl.buffer_from_pyarray(
            queue, out_r, blocking=blocking)

        # One zero-initialised float slot per launched work item.
        score = array.array('f', [0] * sub_divisions)
        score_buf, in_evt5 = pycl.buffer_from_pyarray(
            queue, score, blocking=blocking)

    with timeify("Run kernel r"):
        # NOTE(review): the fourth argument is in_r_buf; a commented-out
        # variant of this call passed out_r_buf instead.  out_r_buf is
        # uploaded above but not handed to the kernel -- confirm intent.
        pending = [in_evt1, in_evt2, in_evt3, in_evt4, in_evt5]
        launch = score_matrix(
            in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=pending)
        run_evt = launch.on(queue, sub_divisions)

    with timeify("Retrive data"):
        score_from_gpu, _ = pycl.buffer_to_pyarray(
            queue, score_buf, wait_for=run_evt, like=score)

    return score_from_gpu
def get_queue(cls, device=None):
    """Return the command queue cached for ``device``, creating it if needed.

    Queues are cached in ``cls.queues`` under ``device.value``.  On a cache
    miss a new context and queue are created for that single device and
    stored before being returned.

    Args:
        device: Device to obtain a queue for.  When omitted (``None``) the
            device returned by ``get_gpu()`` is used.

    Returns:
        The queue stored in ``cls.queues`` for ``device``.
    """
    # The default is resolved here rather than in the signature: a
    # ``device=get_gpu()`` default is evaluated once, when the function is
    # defined, which runs device discovery at import time and freezes its
    # result for every later call.
    if device is None:
        device = get_gpu()

    handle = device.value
    if handle in cls.queues:
        return cls.queues[handle]

    ctx = pycl.clCreateContext(devices=[device])
    queue = pycl.clCreateCommandQueue(context=ctx, device=device)
    cls.queues[handle] = queue
    return queue
def get_context_and_queue_from_devices(devices):
    """Return the cached ``(context, queue)`` pair for ``devices``.

    The pair is looked up in ``devices_context_queue_map`` under a tuple of
    the devices' ``vendor_id`` values; when no entry exists a context and
    a queue are created from ``devices`` and stored under that key.

    Args:
        devices: Iterable of devices exposing a ``vendor_id`` attribute.

    Returns:
        The ``(context, queue)`` tuple stored for the computed key.
    """
    # NOTE(review): the key is built from vendor_id only -- confirm that two
    # different device lists can never produce the same tuple, otherwise
    # they would share one cache entry.
    cache_key = tuple(dev.vendor_id for dev in devices)

    if cache_key not in devices_context_queue_map:
        ctx = pycl.clCreateContext(devices)
        cmd_queue = pycl.clCreateCommandQueue(ctx)
        devices_context_queue_map[cache_key] = (ctx, cmd_queue)

    return devices_context_queue_map[cache_key]
def __init__(self):
    """Create the device, context and queue held by this instance.

    The last entry returned by ``cl.clGetDeviceIDs()`` is selected; the
    context and command queue built for it are stored on the instance so
    they can be reused by later calls.
    """
    chosen = cl.clGetDeviceIDs()[-1]
    self.device = chosen

    ctx = cl.clCreateContext([chosen])
    self.context = ctx

    self.queue = cl.clCreateCommandQueue(ctx)
def setup_opencl(data, cube_size):
    """Run the ``score_matrix_to_rms`` kernel and return its score buffer.

    A context and queue are created over every device reported by
    ``pycl.clGetDeviceIDs()``, the program in ``SOURCE`` is built, the
    arrays ``data['in_r']``, ``data['in_g']``, ``data['in_b']`` and
    ``data['out_r']`` are uploaded, and the kernel is launched with a
    global size of ``cube_size ** 3``.

    Args:
        data: Mapping providing the ``'in_r'``, ``'in_g'``, ``'in_b'`` and
            ``'out_r'`` arrays.
        cube_size: Passed to the kernel as its last argument; its cube is
            the number of score slots and the launch size.

    Returns:
        The array read back from the score buffer after the kernel ran.
    """
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        devices = pycl.clGetDeviceIDs()
        ctx = pycl.clCreateContext(devices=devices)
        queue = pycl.clCreateCommandQueue(ctx)
        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()
        score_matrix = program['score_matrix_to_rms']
        # Five buffer arguments followed by two integer arguments.
        score_matrix.argtypes = (pycl.cl_mem,) * 5 + (pycl.cl_int,) * 2

    sub_divisions = cube_size ** 3

    with timeify("Creating buffers"):
        uploads = [
            pycl.buffer_from_pyarray(queue, data[channel], blocking=blocking)
            for channel in ('in_r', 'in_g', 'in_b')
        ]
        (in_r_buf, in_evt1), (in_g_buf, in_evt2), (in_b_buf, in_evt3) = uploads

        out_r = data['out_r']
        out_r_buf, in_evt4 = pycl.buffer_from_pyarray(
            queue, out_r, blocking=blocking)

        # One zero-initialised float slot per launched work item.
        score = array.array('f', [0] * sub_divisions)
        score_buf, in_evt5 = pycl.buffer_from_pyarray(
            queue, score, blocking=blocking)

    with timeify("Run kernel r"):
        # NOTE(review): the fourth argument is in_r_buf; a commented-out
        # variant of this call passed out_r_buf instead.  out_r_buf is
        # uploaded above but not handed to the kernel -- confirm intent.
        pending = [in_evt1, in_evt2, in_evt3, in_evt4, in_evt5]
        launch = score_matrix(
            in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=pending)
        run_evt = launch.on(queue, sub_divisions)

    with timeify("Retrive data"):
        score_from_gpu, _ = pycl.buffer_to_pyarray(
            queue, score_buf, wait_for=run_evt, like=score)

    return score_from_gpu
def ocl_init(ocl_src):
    """Build ``ocl_src`` for the first GPU found and pass it to ``run_conv``.

    Platforms are scanned in the order returned by ``cl.clGetPlatformIDs()``;
    the first one that reports GPU devices supplies the device used for the
    context, the command queue and the program build.

    Args:
        ocl_src: OpenCL program source handed to
            ``cl.clCreateProgramWithSource``.

    Raises:
        ValueError: If no platform reports a GPU device.
    """
    use_devices = None
    for platform in cl.clGetPlatformIDs():
        try:
            devices = cl.clGetDeviceIDs(
                platform, device_type=cl.CL_DEVICE_TYPE_GPU)
        except cl.DeviceNotFoundError:
            # This platform has no GPU; try the next one.
            continue
        use_devices = devices[0:1]  # arbitrarily choose the first device
        break

    if use_devices is None:
        raise ValueError("no GPU openCL device found")

    # Single-argument, call-form print behaves the same under the Python 2
    # print statement and the Python 3 print function.
    print("OpenCL use_devices: " + str(use_devices))

    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)
    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    print(prog)

    run_conv(prog, queue)
def ocl_init(ocl_src):
    """Build ``ocl_src`` for the first GPU found and pass it to ``run_conv``.

    Platforms are scanned in the order returned by ``cl.clGetPlatformIDs()``;
    the first one that reports GPU devices supplies the device used for the
    context, the command queue and the program build.

    Args:
        ocl_src: OpenCL program source handed to
            ``cl.clCreateProgramWithSource``.

    Raises:
        ValueError: If no platform reports a GPU device.
    """
    use_devices = None
    for platform in cl.clGetPlatformIDs():
        try:
            devices = cl.clGetDeviceIDs(
                platform, device_type=cl.CL_DEVICE_TYPE_GPU)
        except cl.DeviceNotFoundError:
            # This platform has no GPU; try the next one.
            continue
        use_devices = devices[0:1]  # arbitrarily choose the first device
        break

    if use_devices is None:
        raise ValueError("no GPU openCL device found")

    # Single-argument, call-form print behaves the same under the Python 2
    # print statement and the Python 3 print function.
    print("OpenCL use_devices: " + str(use_devices))

    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)
    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    print(prog)

    run_conv(prog, queue)
def __init__(self, array, output):
    """Set up OpenCL state and remember the input and output objects.

    The last device returned by ``clGetDeviceIDs()`` is used to create the
    context and command queue stored on the instance.

    Args:
        array: Stored unchanged as ``self.array``.
        output: Stored unchanged as ``self.output``.
    """
    chosen = clGetDeviceIDs()[-1]
    self.device = chosen

    ctx = clCreateContext([chosen])
    self.context = ctx

    self.queue = clCreateCommandQueue(ctx)

    self.array, self.output = array, output
def __init__(self):
    """Create a context from the default device type plus a queue on it.

    The device recorded on the instance is the one reported by the newly
    created queue.
    """
    ctx = cl.clCreateContextFromType()
    self.context = ctx

    # A profiling-enabled queue (properties=
    # cl.cl_command_queue_properties.CL_QUEUE_PROFILING_ENABLE) was left
    # disabled in the original code.
    cmd_queue = cl.clCreateCommandQueue(context=ctx)
    self.queue = cmd_queue

    self.device = cmd_queue.device
def get_unique_kernel_name():
    """Return a new kernel name of the form ``fn<N>``.

    Every call increments the module-level ``count`` and embeds the new
    value in the returned name.
    """
    global count
    count = count + 1
    return "fn{}".format(count)


if backend in {"ocl", "opencl", "OCL"}:
    # Ask for GPU devices first; if none are found, fall back to the
    # default device query.  (An alternative that picked devices from
    # cl.clGetPlatformIDs()[1] was left disabled in the original code.)
    try:
        devices = cl.clGetDeviceIDs(device_type=cl.CL_DEVICE_TYPE_GPU)
    except cl.DeviceNotFoundError:
        devices = cl.clGetDeviceIDs()

    # The context is created over the last device in the list only.
    context = cl.clCreateContext(devices[-1:])

    # A single queue when the TRAVIS environment variable is set, eight
    # otherwise.  (A variant creating ten queues with
    # properties=cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE was left disabled
    # in the original code.)
    queues = [
        cl.clCreateCommandQueue(context)
        for _ in range(1 if os.environ.get("TRAVIS") else 8)
    ]
    queue = queues[0]

# Directory under the system temp dir, created if it does not exist yet.
hm_dir = os.path.join(tempfile.gettempdir(), "hindemith")
if not os.path.exists(hm_dir):
    os.mkdir(hm_dir)

unique_file_id = -1
def get_unique_kernel_name():
    """Return a new kernel name of the form ``fn<N>``.

    Every call increments the module-level ``count`` and embeds the new
    value in the returned name.
    """
    global count
    count = count + 1
    return "fn{}".format(count)


if backend in {"ocl", "opencl", "OCL"}:
    # Ask for GPU devices first; if none are found, fall back to the
    # default device query.  (An alternative that picked devices from
    # cl.clGetPlatformIDs()[1] was left disabled in the original code.)
    try:
        devices = cl.clGetDeviceIDs(device_type=cl.CL_DEVICE_TYPE_GPU)
    except cl.DeviceNotFoundError:
        devices = cl.clGetDeviceIDs()

    # The context is created over the last device in the list only.
    context = cl.clCreateContext(devices[-1:])

    # A single queue when the TRAVIS environment variable is set, eight
    # otherwise.  (A variant creating ten queues with
    # properties=cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE was left disabled
    # in the original code.)
    queues = [
        cl.clCreateCommandQueue(context)
        for _ in range(1 if os.environ.get("TRAVIS") else 8)
    ]
    queue = queues[0]

# Directory path under the system temp dir.
hm_dir = os.path.join(tempfile.gettempdir(), "hindemith")
def __init__(self):
    """Create the device, context and command queue held by this instance.

    The last device returned by ``clGetDeviceIDs()`` is the one used.
    """
    chosen = clGetDeviceIDs()[-1]
    self.device = chosen

    ctx = clCreateContext([chosen])
    self.context = ctx

    self.queue = clCreateCommandQueue(ctx)
def __init__(self):
    """Create a context from the default device type and a queue on
    ``TARGET_GPU`` within that context.
    """
    ctx = cl.clCreateContextFromType()
    self.context = ctx
    self.queue = cl.clCreateCommandQueue(ctx, device=TARGET_GPU)
def __init__(self):
    """Create a context from the default device type and a command queue
    on that context; both are stored on the instance.
    """
    ctx = cl.clCreateContextFromType()
    self.context = ctx
    self.queue = cl.clCreateCommandQueue(ctx)
def transform(self, py_ast, program_config):
    """
    Lower the Python AST to a C AST as directed by program_config.

    program_config unpacks into ``(arg_config, tuner_config)``.
    ``arg_config`` supplies ``'ptrs'`` and ``'len'``; ``tuner_config``
    supplies the ``'distribute'``, ``'reassociate'``, ``'locs'``,
    ``'fusion'`` and ``'parallelize'`` directives.  The tree is written to
    a DotWriter before the first pass and after every pass.

    Returns the result of ``c_func.finalize("py_op", proj, fn.get_type())``.
    """
    arg_config, tuner_config = program_config
    dot = DotWriter()

    def run_pass(visitor, tree, *dot_args):
        # Apply a single visitor, dump the resulting tree, and return it.
        result = visitor.visit(tree)
        dot.write(result, *dot_args)
        return result

    # Reset the class-level id counters (marked as a hack in the original).
    ComputedVector._next_id = 0
    CopiedVector._next_id = 0

    # OpenCL context with one queue and one memory space per device, plus
    # main memory as the first memory space.
    import pycl
    context = pycl.clCreateContextFromType(pycl.CL_DEVICE_TYPE_ALL)
    queues = [
        pycl.clCreateCommandQueue(context, dev) for dev in context.devices
    ]
    c_func = ElementwiseFunction(context, queues)
    main_memory = MainMemory()
    memories = [main_memory] + [OclMemory(q) for q in queues]

    ptrs = arg_config['ptrs']
    dtype = ptrs[0]._dtype_
    length = arg_config['len']

    # Autotuner directives, read up front in a fixed order.
    distribute_directives = tuner_config['distribute']
    reassoc_directives = tuner_config['reassociate']
    locs = [memories[idx] for idx in tuner_config['locs']]
    fusion_directives = tuner_config['fusion']
    parallelize_directives = tuner_config['parallelize']

    dot.write(py_ast)

    # Basic conversions.
    proj = run_pass(PyBasicConversions(), py_ast)

    # Platform-independent transformations.
    proj = run_pass(ApplyDistributiveProperty(distribute_directives), proj)
    proj = run_pass(ApplyAssociativeProperty(reassoc_directives), proj)

    # Parameter types, locations, intermediates, copies and fusion.
    proj = run_pass(VectorFinder(ptrs, main_memory), proj)
    proj = run_pass(LocationTagger(locs), proj)
    proj = run_pass(InsertIntermediates(main_memory), proj)
    proj = run_pass(CopyInserter(main_memory), proj)
    proj = run_pass(DoFusion(fusion_directives), proj)
    proj = run_pass(AllocateIntermediates(dtype, length), proj, "postintermed")

    # Replace the body of py_op with the parallelized task schedule.
    py_op = proj.find(FunctionDecl, name="py_op")
    schedules = FindParallelism(parallelize_directives).visit(py_op)
    py_op.defn = parallelize_tasks(schedules)
    dot.write(proj, "postparallel")

    proj = run_pass(KernelOutliner(length), proj)
    proj = run_pass(LowerKernelCalls(), proj)
    proj = run_pass(RefConverter(), proj)
    proj = run_pass(LowerLoopsAndCopies(length), proj)

    # The zipper is created before the lifter runs; its results are copied
    # onto c_func before the final dump.
    zipper = ArgZipper()
    lifted = Lifter().visit(proj)
    proj = zipper.visit(lifted)
    c_func.extra_args = zipper.extra_args
    c_func.answer = zipper.answer
    dot.write(proj)

    fn = proj.find(FunctionDecl)
    return c_func.finalize("py_op", proj, fn.get_type())
prefetch_option = os.getenv("LATTE_PREFETCH_MODE", "ON") unroll_options = ["ON", "OFF"] unroll_option = os.getenv("LATTE_UNROLL", "ON") if parallel_strategy not in parallel_strategies: logger.warn("Invalid parallel strategy [%s], defaulting to OPENMP", parallel_strategy) parallel_strategy = "OPENMP" nthreads = os.getenv("LATTE_NUM_THREADS", None) if parallel_strategy == "OPENCL_SIMPLE_LOOP": import pycl as cl cl_ctx = cl.clCreateContext() cl_queue = cl.clCreateCommandQueue(cl_ctx) elif parallel_strategy in ["SIMPLE_LOOP" ] or parallel_strategy in ["FLOWGRAPH_LOOP"]: package_path = os.path.dirname(os.path.abspath(__file__)) _file = FileTemplate( os.path.dirname(os.path.abspath(__file__)) + "/runtime/runtime.cpp", {"LATTE_PACKAGE_PATH": StringTemplate(package_path)}) c_file = C.CFile("runtime", [_file]) module = util.mpi_compile(ctree.nodes.Project([c_file])) init_nthreads = module.get_callable("init_nthreads", ctypes.CFUNCTYPE(None, ctypes.c_int)) init_default = module.get_callable("init_default", ctypes.CFUNCTYPE(None)) if nthreads is not None: init_nthreads(int(nthreads)) else: