def make_codepy_module(self, toolchain, dtype): from codepy.libraries import add_codepy toolchain = toolchain.copy() add_codepy(toolchain) from cgen import (Value, Include, Statement, Typedef, FunctionBody, FunctionDeclaration, Block, Const, Line, POD, Initializer, CustomLoop) S = Statement from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_to_preamble([ Include("vector"), Include("algorithm"), Include("hedge/base.hpp"), Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), Include("boost/numeric/ublas/io.hpp"), ]+self.get_cpu_extra_includes()) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Typedef(POD(dtype, "value_type")), Line(), ]) mod.add_function(FunctionBody( FunctionDeclaration(Value("void", "process_elements"), [ Const(Value("uniform_element_ranges", "ers")), Const(Value("numpy_vector<value_type>", "field")), Value("numpy_vector<value_type>", "result"), ]+self.get_cpu_extra_parameter_declarators()), Block([ Typedef(Value("numpy_vector<value_type>::iterator", "it_type")), Typedef(Value("numpy_vector<value_type>::const_iterator", "cit_type")), Line(), Initializer(Value("it_type", "result_it"), "result.begin()"), Initializer(Value("cit_type", "field_it"), "field.begin()"), Line() ]+self.get_cpu_extra_preamble()+[ Line(), CustomLoop( "BOOST_FOREACH(const element_range er, ers)", Block(self.get_cpu_per_element_code()) ) ]))) #print mod.generate() #toolchain = toolchain.copy() #toolchain.enable_debugging return mod.compile(toolchain)
def get_elwise_module_descriptor(arguments, operation, name="kernel"): from codepy.bpl import BoostPythonModule from cgen import FunctionBody, FunctionDeclaration, \ Value, POD, Struct, For, Initializer, Include, Statement, \ Line, Block S = Statement mod = BoostPythonModule() mod.add_to_preamble([ Include("pyublas/numpy.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace pyublas"), Line(), ]) body = Block([ Initializer( Value("numpy_array<%s >::iterator" % dtype_to_ctype(varg.dtype), varg.name), "args.%s_ary.begin()" % varg.name) for varg in arguments if isinstance(varg, VectorArg)] +[Initializer( sarg.declarator(), "args." + sarg.name) for sarg in arguments if isinstance(sarg, ScalarArg)] ) body.extend([ Line(), For("unsigned i = 0", "i < codepy_length", "++i", Block([S(operation)]) ) ]) arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments]) mod.add_struct(arg_struct, "ArgStruct") mod.add_to_module([Line()]) mod.add_function( FunctionBody( FunctionDeclaration( Value("void", name), [POD(numpy.uintp, "codepy_length"), Value("arg_struct", "args")]), body)) return mod
def make_greet_mod(greeting): from cgen import FunctionBody, FunctionDeclaration, Block, \ Const, Pointer, Value, Statement from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_function( FunctionBody( FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []), Block([Statement('return "%s"' % greeting)]))) from codepy.toolchain import guess_toolchain return mod.compile(guess_toolchain(), wait_on_error=True)
def get_elwise_module_descriptor(arguments, operation, name="kernel"): from codepy.bpl import BoostPythonModule from cgen import FunctionBody, FunctionDeclaration, \ Value, POD, Struct, For, Initializer, Include, Statement, \ Line, Block S = Statement # noqa: N806 mod = BoostPythonModule() mod.add_to_preamble([ Include("pyublas/numpy.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace pyublas"), Line(), ]) body = Block([ Initializer( Value( "numpy_array<{} >::iterator".format(dtype_to_ctype( varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()") for varg in arguments if isinstance(varg, VectorArg) ] + [ Initializer(sarg.declarator(), f"args.{sarg.name}") for sarg in arguments if isinstance(sarg, ScalarArg) ]) body.extend([ Line(), For("unsigned i = 0", "i < codepy_length", "++i", Block([S(operation)])) ]) arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments]) mod.add_struct(arg_struct, "ArgStruct") mod.add_to_module([Line()]) mod.add_function( FunctionBody( FunctionDeclaration(Value("void", name), [ POD(numpy.uintp, "codepy_length"), Value("arg_struct", "args") ]), body)) return mod
def make_greet_mod(greeting): from codepy.cgen import FunctionBody, FunctionDeclaration, Block, \ Const, Pointer, Value, Statement from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_function( FunctionBody( FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []), Block([Statement('return "%s"' % greeting)]) )) from codepy.toolchain import guess_toolchain return mod.compile(guess_toolchain(), wait_on_error=True)
import cgen as c from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_function( c.FunctionBody( c.FunctionDeclaration(c.Const(c.Pointer(c.Value("char", "greet"))), []), c.Block([c.Statement('return "hello world"')]))) from codepy.toolchain import guess_toolchain cmod = mod.compile(guess_toolchain()) print(cmod.greet())
host_statements = [ #Extract information from PyCUDA GPUArray #Get length 'tuple shape = extract<tuple>(gpu_array.attr("shape"))', 'int length = extract<int>(shape[0])', #Get data pointer 'CUdeviceptr ptr = extract<CUdeviceptr>(gpu_array.attr("gpudata"))', #Call Thrust routine, compiled into the CudaModule 'my_sort(ptr, length)', #Return result 'return gpu_array', ] host_mod.add_function( FunctionBody( FunctionDeclaration(Value('object', 'host_entry'), [Value('object', 'gpu_array')]), Block([Statement(x) for x in host_statements]))) #Print out generated code, to see what we're actually compiling print("---------------------- Host code ----------------------") print(host_mod.generate()) print("--------------------- Device code ---------------------") print(nvcc_mod.generate()) print("-------------------------------------------------------") #Compile modules import codepy.jit, codepy.toolchain gcc_toolchain = codepy.toolchain.guess_toolchain()
'PyObject* length = PySequence_GetItem(shape, 0)', 'int intLength = boost::python::extract<int>(length)', # Call CUDA function 'CUdeviceptr diffResult = diffInstance(cudaPointer, intLength)', # Build resulting GPUArray 'PyObject* args = Py_BuildValue("()")', 'PyObject* newShape = Py_BuildValue("(i)", intLength)', 'PyObject* kwargs = Py_BuildValue("{sOsOs%s}", "shape", newShape, "dtype", type, "gpudata", diffResult)' % ptr_sz_uint_conv, 'PyObject* GPUArrayClass = PyObject_GetAttrString(gpuArray, "__class__")', 'PyObject* remoteResult = PyObject_Call(GPUArrayClass, args, kwargs)', 'return remoteResult'] host_mod.add_function( FunctionBody( FunctionDeclaration(Pointer(Value("PyObject", "adjacentDifference")), [Pointer(Value("PyObject", "gpuArray"))]), Block([Statement(x) for x in statements]))) host_mod.add_to_preamble([Include('boost/python/extract.hpp')]) cuda_mod = CudaModule(host_mod) cuda_mod.add_to_preamble([Include('cuda.h')]) globalIndex = 'int index = blockIdx.x * blockDim.x + threadIdx.x' compute_diff = 'outputPtr[index] = inputPtr[index] - inputPtr[index-1]' launch = ['CUdeviceptr output', 'cuMemAlloc(&output, sizeof(T) * length)', 'int bSize = 256', 'int gSize = (length-1)/bSize + 1', 'diffKernel<<<gSize, bSize>>>((T*)inputPtr, length, (T*)output)',
def get_boundary_flux_mod(fluxes, fvi, discr, dtype): from cgen import \ FunctionDeclaration, FunctionBody, Typedef, Struct, \ Const, Reference, Value, POD, MaybeUnused, \ Statement, Include, Line, Block, Initializer, Assign, \ CustomLoop, For from pytools import to_uncomplex_dtype, flatten from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_to_preamble([ Include("cstdlib"), Include("algorithm"), Line(), Include("boost/foreach.hpp"), Line(), Include("hedge/face_operators.hpp"), ]) S = Statement mod.add_to_module([ S("using namespace hedge"), S("using namespace pyublas"), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) arg_struct = Struct("arg_struct", [ Value("numpy_array<value_type>", "flux%d_on_faces" % i) for i in range(len(fluxes)) ]+[ Value("numpy_array<value_type>", arg_name) for arg_name in fvi.arg_names ]) mod.add_struct(arg_struct, "ArgStruct") mod.add_to_module([Line()]) fdecl = FunctionDeclaration( Value("void", "gather_flux"), [ Const(Reference(Value("face_group<face_pair<straight_face> >" , "fg"))), Reference(Value("arg_struct", "args")) ]) from pymbolic.mapper.stringifier import PREC_PRODUCT def gen_flux_code(): f2cm = FluxToCodeMapper() result = [ Assign("fof%d_it[loc_fof_base+i]" % flux_idx, "uncomplex_type(fp.int_side.face_jacobian) * " + flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT)) for flux_idx, flux in enumerate(fluxes) ] return [ Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list] + result fbody = Block([ Initializer( Const(Value("numpy_array<value_type>::iterator", "fof%d_it" % i)), "args.flux%d_on_faces.begin()" % i) for i in range(len(fluxes)) ]+[ Initializer( Const(Value("numpy_array<value_type>::const_iterator", "%s_it" % arg_name)), "args.%s.begin()" % arg_name) for arg_name in fvi.arg_names ]+[ Line(), CustomLoop("BOOST_FOREACH(const face_pair<straight_face> &fp, fg.face_pairs)", Block( list(flatten([ Initializer(Value("node_number_t", "%s_ebi" % where), "fp.%s.el_base_index" % where), Initializer(Value("index_lists_t::const_iterator", "%s_idx_list" % where), "fg.index_list(fp.%s.face_index_list_number)" % where), Line(), ] for where in ["int_side", "ext_side"] ))+[ Line(), Initializer(Value("node_number_t", "loc_fof_base"), "fg.face_length()*(fp.%(where)s.local_el_number*fg.face_count" " + fp.%(where)s.face_id)" % {"where": "int_side"}), Line(), For( "unsigned i = 0", "i < fg.face_length()", "++i", Block( [ Initializer(MaybeUnused( Value("node_number_t", "%s_idx" % where)), "%(where)s_ebi + %(where)s_idx_list[i]" % {"where": where}) for where in ["int_side", "ext_side"] ]+gen_flux_code() ) ) ])) ]) mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print mod.generate() #raw_input("[Enter]") return mod.compile(get_flux_toolchain(discr, fluxes))
from codepy.cgen import * from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_function( FunctionBody( FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []), Block([Statement('return "hello world"')]) )) from codepy.toolchain import guess_distutils_toolchain cmod = mod.compile(guess_distutils_toolchain()) print cmod.greet()
def make_lift(self, fgroup, with_scale, dtype): discr = self.discr from cgen import ( FunctionDeclaration, FunctionBody, Typedef, Const, Reference, Value, POD, Statement, Include, Line, Block, Initializer, Assign, For, If, Define) from pytools import to_uncomplex_dtype from codepy.bpl import BoostPythonModule mod = BoostPythonModule() S = Statement mod.add_to_preamble([ Include("hedge/face_operators.hpp"), Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Define("DOFS_PER_EL", fgroup.ldis_loc.node_count()), Define("FACES_PER_EL", fgroup.ldis_loc.face_count()), Define("DIMENSIONS", discr.dimensions), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) def if_(cond, result, else_=None): if cond: return [result] else: if else_ is None: return [] else: return [else_] fdecl = FunctionDeclaration( Value("void", "lift"), [ Const(Reference(Value("face_group<face_pair<straight_face> >", "fg"))), Value("ublas::matrix<uncomplex_type>", "matrix"), Value("numpy_array<value_type>", "field"), Value("numpy_array<value_type>", "result") ]+if_(with_scale, Const(Reference(Value("numpy_array<double>", "elwise_post_scaling")))) ) def make_it(name, is_const=True, tpname="value_type"): if is_const: const = "const_" else: const = "" return Initializer( Value("numpy_array<%s>::%siterator" % (tpname, const), name+"_it"), "%s.begin()" % name) fbody = Block([ make_it("field"), make_it("result", is_const=False), ]+if_(with_scale, make_it("elwise_post_scaling", tpname="double"))+[ Line(), For("unsigned fg_el_nr = 0", "fg_el_nr < fg.element_count()", "++fg_el_nr", Block([ Initializer( Value("node_number_t", "dest_el_base"), "fg.local_el_write_base[fg_el_nr]"), Initializer( Value("node_number_t", "src_el_base"), "FACES_PER_EL*fg.face_length()*fg_el_nr"), Line(), For("unsigned i = 0", "i < DOFS_PER_EL", "++i", Block([ Initializer(Value("value_type", "tmp"), 0), Line(), For("unsigned j = 0", "j < FACES_PER_EL*fg.face_length()", "++j", S("tmp += matrix(i, j)*field_it[src_el_base+j]") ), Line(), ]+if_(with_scale, Assign("result_it[dest_el_base+i]", "tmp * value_type(*elwise_post_scaling_it)"), Assign("result_it[dest_el_base+i]", "tmp")) ) ), ]+if_(with_scale, S("elwise_post_scaling_it++")) ) ) ]) mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print FunctionBody(fdecl, fbody) #raw_input() return mod.compile(self.discr.toolchain).lift
Block([Statement(x) for x in [ #Extract information from PyCUDA GPUArray #Get length 'p::tuple sorted_shape = p::extract<p::tuple>(sorted_gpu_array.attr("shape"))', 'int sorted_length = p::extract<int>(sorted_shape[0])', 'p::tuple bounds_shape = p::extract<p::tuple>(bounds_gpu_array.attr("shape"))', 'int bounds_length = p::extract<int>(bounds_shape[0])', #Get data pointer 'CUdeviceptr sorted_ptr = p::extract<CUdeviceptr>(sorted_gpu_array.attr("ptr"))', 'CUdeviceptr bounds_ptr = p::extract<CUdeviceptr>(bounds_gpu_array.attr("ptr"))', 'CUdeviceptr output_ptr = p::extract<CUdeviceptr>(output_gpu_array.attr("ptr"))', #Call Thrust routine, compiled into the CudaModule 'thrust_upper_bound_int((int*) sorted_ptr, sorted_length, ' '(int*) bounds_ptr, bounds_length, (int*) output_ptr)', ] ])), ] for fct in host_functions: host_mod.add_function(fct) gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain() # COMPILED CODE: '''Compiled thrust functionality, use this module to access thrust functions. ''' compiled_module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=False)
def _cusp_solver(M, parameters): cache_key = lambda t, p: (t, p['ksp_type'], p['pc_type'], p['ksp_rtol'], p[ 'ksp_atol'], p['ksp_max_it'], p['ksp_gmres_restart'], p['ksp_monitor']) module = _cusp_cache.get(cache_key(M.ctype, parameters)) if module: return module import codepy.toolchain from cgen import FunctionBody, FunctionDeclaration from cgen import Block, Statement, Include, Value from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain() if 'CUSP_HOME' in os.environ: nvcc_toolchain.add_library('cusp', [os.environ['CUSP_HOME']], [], []) host_mod = BoostPythonModule() nvcc_mod = CudaModule(host_mod) nvcc_includes = [ 'thrust/device_vector.h', 'thrust/fill.h', 'cusp/csr_matrix.h', 'cusp/krylov/cg.h', 'cusp/krylov/bicgstab.h', 'cusp/krylov/gmres.h', 'cusp/precond/diagonal.h', 'cusp/precond/smoothed_aggregation.h', 'cusp/precond/ainv.h', 'string' ] nvcc_mod.add_to_preamble([Include(s) for s in nvcc_includes]) nvcc_mod.add_to_preamble([Statement('using namespace std')]) # We're translating PETSc preconditioner types to CUSP diag = Statement( 'cusp::precond::diagonal< ValueType, cusp::device_memory >M(A)') ainv = Statement( 'cusp::precond::scaled_bridson_ainv< ValueType, cusp::device_memory >M(A)' ) amg = Statement( 'cusp::precond::smoothed_aggregation< IndexType, ValueType, cusp::device_memory >M(A)' ) none = Statement( 'cusp::identity_operator< ValueType, cusp::device_memory >M(nrows, ncols)' ) preconditioners = { 'diagonal': diag, 'jacobi': diag, 'ainv': ainv, 'ainvcusp': ainv, 'amg': amg, 'hypre': amg, 'none': none, None: none } try: precond_call = preconditioners[parameters['pc_type']] except KeyError: raise RuntimeError("Cusp does not support preconditioner type %s" % parameters['pc_type']) solvers = { 'cg': Statement('cusp::krylov::cg(A, x, b, monitor, M)'), 'bicgstab': Statement('cusp::krylov::bicgstab(A, x, b, monitor, M)'), 'gmres': Statement( 'cusp::krylov::gmres(A, x, b, %(ksp_gmres_restart)d, monitor, M)' % parameters) } try: solve_call = solvers[parameters['ksp_type']] except KeyError: raise RuntimeError("Cusp does not support solver type %s" % parameters['ksp_type']) monitor = 'monitor(b, %(ksp_max_it)d, %(ksp_rtol)g, %(ksp_atol)g)' % parameters nvcc_function = FunctionBody( FunctionDeclaration(Value('void', '__cusp_solve'), [ Value('CUdeviceptr', '_rowptr'), Value('CUdeviceptr', '_colidx'), Value('CUdeviceptr', '_csrdata'), Value('CUdeviceptr', '_b'), Value('CUdeviceptr', '_x'), Value('int', 'nrows'), Value('int', 'ncols'), Value('int', 'nnz') ]), Block([ Statement('typedef int IndexType'), Statement('typedef %s ValueType' % M.ctype), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<IndexType> > indices' ), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<ValueType> > values' ), Statement( 'typedef cusp::csr_matrix_view< indices, indices, values, IndexType, ValueType, cusp::device_memory > matrix' ), Statement( 'thrust::device_ptr< IndexType > rowptr((IndexType *)_rowptr)' ), Statement( 'thrust::device_ptr< IndexType > colidx((IndexType *)_colidx)' ), Statement( 'thrust::device_ptr< ValueType > csrdata((ValueType *)_csrdata)' ), Statement('thrust::device_ptr< ValueType > d_b((ValueType *)_b)'), Statement('thrust::device_ptr< ValueType > d_x((ValueType *)_x)'), Statement('indices row_offsets(rowptr, rowptr + nrows + 1)'), Statement('indices column_indices(colidx, colidx + nnz)'), Statement('values matrix_values(csrdata, csrdata + nnz)'), Statement('values b(d_b, d_b + nrows)'), Statement('values x(d_x, d_x + ncols)'), Statement('thrust::fill(x.begin(), x.end(), (ValueType)0)'), Statement( 'matrix A(nrows, ncols, nnz, row_offsets, column_indices, matrix_values)' ), Statement('cusp::%s_monitor< ValueType > %s' % ('verbose' if parameters['ksp_monitor'] else 'default', monitor)), precond_call, solve_call ])) host_mod.add_to_preamble( [Include('boost/python/extract.hpp'), Include('string')]) host_mod.add_to_preamble([Statement('using namespace boost::python')]) host_mod.add_to_preamble([Statement('using namespace std')]) nvcc_mod.add_function(nvcc_function) host_mod.add_function( FunctionBody( FunctionDeclaration(Value('void', 'solve'), [ Value('object', '_rowptr'), Value('object', '_colidx'), Value('object', '_csrdata'), Value('object', '_b'), Value('object', '_x'), Value('object', '_nrows'), Value('object', '_ncols'), Value('object', '_nnz') ]), Block([ Statement( 'CUdeviceptr rowptr = extract<CUdeviceptr>(_rowptr.attr("gpudata"))' ), Statement( 'CUdeviceptr colidx = extract<CUdeviceptr>(_colidx.attr("gpudata"))' ), Statement( 'CUdeviceptr csrdata = extract<CUdeviceptr>(_csrdata.attr("gpudata"))' ), Statement( 'CUdeviceptr b = extract<CUdeviceptr>(_b.attr("gpudata"))' ), Statement( 'CUdeviceptr x = extract<CUdeviceptr>(_x.attr("gpudata"))' ), Statement('int nrows = extract<int>(_nrows)'), Statement('int ncols = extract<int>(_ncols)'), Statement('int nnz = extract<int>(_nnz)'), Statement( '__cusp_solve(rowptr, colidx, csrdata, b, x, nrows, ncols, nnz)' ) ]))) nvcc_toolchain.cflags.append('-arch') nvcc_toolchain.cflags.append('sm_20') nvcc_toolchain.cflags.append('-O3') module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=configuration["debug"]) _cusp_cache[cache_key(M.ctype, parameters)] = module return module
# Call CUDA function 'CUdeviceptr diffResult = diffInstance(cudaPointer, intLength)', # Build resulting GPUArray 'PyObject* args = Py_BuildValue("()")', 'PyObject* newShape = Py_BuildValue("(i)", intLength)', 'PyObject* kwargs = Py_BuildValue(' '"{sOsOs%s}", "shape", newShape, "dtype", type, "gpudata", diffResult)' % ptr_sz_uint_conv, 'PyObject* GPUArrayClass = PyObject_GetAttrString(gpuArray, "__class__")', 'PyObject* remoteResult = PyObject_Call(GPUArrayClass, args, kwargs)', 'return remoteResult' ] host_mod.add_function( c.FunctionBody( c.FunctionDeclaration( c.Pointer(c.Value("PyObject", "adjacentDifference")), [c.Pointer(c.Value("PyObject", "gpuArray"))]), c.Block([c.Statement(x) for x in statements]))) host_mod.add_to_preamble([c.Include('boost/python/extract.hpp')]) cuda_mod = CudaModule(host_mod) cuda_mod.add_to_preamble([c.Include('cuda.h')]) global_index = 'int index = blockIdx.x * blockDim.x + threadIdx.x' compute_diff = 'outputPtr[index] = inputPtr[index] - inputPtr[index-1]' launch = [ 'CUdeviceptr output', 'cuMemAlloc(&output, sizeof(T) * length)', 'int bSize = 256', 'int gSize = (length-1)/bSize + 1', 'diffKernel<<<gSize, bSize>>>((T*)inputPtr, length, (T*)output)', 'return output' ]
# Build resulting GPUArray 'PyObject* args = Py_BuildValue("()")', 'PyObject* newShape = Py_BuildValue("(i)", intLength)', 'PyObject* kwargs = Py_BuildValue(' '"{sOsOs%s}", "shape", newShape, "dtype", type, "gpudata", diffResult)' % ptr_sz_uint_conv, 'PyObject* GPUArrayClass = PyObject_GetAttrString(gpuArray, "__class__")', 'PyObject* remoteResult = PyObject_Call(GPUArrayClass, args, kwargs)', 'return remoteResult'] host_mod.add_function( c.FunctionBody( c.FunctionDeclaration(c.Pointer(c.Value("PyObject", "adjacentDifference")), [c.Pointer(c.Value("PyObject", "gpuArray"))]), c.Block([c.Statement(x) for x in statements]))) host_mod.add_to_preamble([c.Include('boost/python/extract.hpp')]) cuda_mod = CudaModule(host_mod) cuda_mod.add_to_preamble([c.Include('cuda.h')]) globalIndex = 'int index = blockIdx.x * blockDim.x + threadIdx.x' compute_diff = 'outputPtr[index] = inputPtr[index] - inputPtr[index-1]' launch = ['CUdeviceptr output', 'cuMemAlloc(&output, sizeof(T) * length)', 'int bSize = 256', 'int gSize = (length-1)/bSize + 1', 'diffKernel<<<gSize, bSize>>>((T*)inputPtr, length, (T*)output)',
def make_lift(self, fgroup, with_scale, dtype): discr = self.discr from cgen import (FunctionDeclaration, FunctionBody, Typedef, Const, Reference, Value, POD, Statement, Include, Line, Block, Initializer, Assign, For, If, Define) from pytools import to_uncomplex_dtype from codepy.bpl import BoostPythonModule mod = BoostPythonModule() S = Statement mod.add_to_preamble([ Include("hedge/face_operators.hpp"), Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Define("DOFS_PER_EL", fgroup.ldis_loc.node_count()), Define("FACES_PER_EL", fgroup.ldis_loc.face_count()), Define("DIMENSIONS", discr.dimensions), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) def if_(cond, result, else_=None): if cond: return [result] else: if else_ is None: return [] else: return [else_] fdecl = FunctionDeclaration(Value("void", "lift"), [ Const( Reference(Value("face_group<face_pair<straight_face> >", "fg"))), Value("ublas::matrix<uncomplex_type>", "matrix"), Value("numpy_array<value_type>", "field"), Value("numpy_array<value_type>", "result") ] + if_( with_scale, Const( Reference(Value("numpy_array<double>", "elwise_post_scaling"))))) def make_it(name, is_const=True, tpname="value_type"): if is_const: const = "const_" else: const = "" return Initializer( Value("numpy_array<%s>::%siterator" % (tpname, const), name + "_it"), "%s.begin()" % name) fbody = Block([ make_it("field"), make_it("result", is_const=False), ] + if_(with_scale, make_it("elwise_post_scaling", tpname="double")) + [ Line(), For( "unsigned fg_el_nr = 0", "fg_el_nr < fg.element_count()", "++fg_el_nr", Block([ Initializer(Value("node_number_t", "dest_el_base"), "fg.local_el_write_base[fg_el_nr]"), Initializer(Value("node_number_t", "src_el_base"), "FACES_PER_EL*fg.face_length()*fg_el_nr"), Line(), For( "unsigned i = 0", "i < DOFS_PER_EL", "++i", Block([ Initializer(Value("value_type", "tmp"), 0), Line(), For( "unsigned j = 0", "j < FACES_PER_EL*fg.face_length()", "++j", S("tmp += matrix(i, j)*field_it[src_el_base+j]" )), Line(), ] + if_( with_scale, Assign( "result_it[dest_el_base+i]", "tmp * value_type(*elwise_post_scaling_it)"), Assign("result_it[dest_el_base+i]", "tmp")))), ] + if_(with_scale, S("elwise_post_scaling_it++")))) ]) mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print FunctionBody(fdecl, fbody) #raw_input() return mod.compile(self.discr.toolchain).lift
host_statements = [ #Extract information from PyCUDA GPUArray #Get length 'tuple shape = extract<tuple>(gpu_array.attr("shape"))', 'int length = extract<int>(shape[0])', #Get data pointer 'CUdeviceptr ptr = extract<CUdeviceptr>(gpu_array.attr("ptr"))', #Call Thrust routine, compiled into the CudaModule 'my_sort(ptr, length)', #Return result 'return gpu_array', ] host_mod.add_function( FunctionBody( FunctionDeclaration(Value('object', 'host_entry'), [Value('object', 'gpu_array')]), Block([Statement(x) for x in host_statements]))) #Print out generated code, to see what we're actually compiling print("---------------------- Host code ----------------------") print((host_mod.generate())) print("--------------------- Device code ---------------------") print((nvcc_mod.generate())) print("-------------------------------------------------------") #Compile modules import codepy.jit, codepy.toolchain gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain()
def make_diff(self, elgroup, dtype, shape): """ :param shape: If non-square, the resulting code takes two element_ranges arguments and supports non-square matrices. """ from hedge._internal import UniformElementRanges assert isinstance(elgroup.ranges, UniformElementRanges) ldis = elgroup.local_discretization discr = self.discr from cgen import ( FunctionDeclaration, FunctionBody, Typedef, Const, Reference, Value, POD, Statement, Include, Line, Block, Initializer, Assign, For, If, Define, ) from pytools import to_uncomplex_dtype from codepy.bpl import BoostPythonModule mod = BoostPythonModule() # {{{ preamble S = Statement mod.add_to_preamble([Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp")]) mod.add_to_module( [ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Define("ROW_COUNT", shape[0]), Define("COL_COUNT", shape[1]), Define("DIMENSIONS", discr.dimensions), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ] ) fdecl = FunctionDeclaration( Value("void", "diff"), [ Const(Reference(Value("uniform_element_ranges", "from_ers"))), Const(Reference(Value("uniform_element_ranges", "to_ers"))), Value("numpy_array<value_type>", "field"), ] + [Value("ublas::matrix<uncomplex_type>", "diffmat_rst%d" % rst) for rst in range(discr.dimensions)] + [Value("numpy_array<value_type>", "result%d" % i) for i in range(discr.dimensions)], ) # }}} # {{{ set-up def make_it(name, is_const=True, tpname="value_type"): if is_const: const = "const_" else: const = "" return Initializer( Value("numpy_array<%s>::%siterator" % (tpname, const), name + "_it"), "%s.begin()" % name ) fbody = Block( [ If("ROW_COUNT != diffmat_rst%d.size1()" % i, S('throw(std::runtime_error("unexpected matrix size"))')) for i in range(discr.dimensions) ] + [ If("COL_COUNT != diffmat_rst%d.size2()" % i, S('throw(std::runtime_error("unexpected matrix size"))')) for i in range(discr.dimensions) ] + [ If("ROW_COUNT != to_ers.el_size()", S('throw(std::runtime_error("unsupported image element size"))')), If( "COL_COUNT != from_ers.el_size()", S('throw(std::runtime_error("unsupported preimage element size"))'), ), If( "from_ers.size() != to_ers.size()", S( 'throw(std::runtime_error("image and preimage element groups ' 'do nothave the same element count"))' ), ), Line(), make_it("field"), ] + [make_it("result%d" % i, is_const=False) for i in range(discr.dimensions)] + [ Line(), # }}} # {{{ computation For( "element_number_t eg_el_nr = 0", "eg_el_nr < to_ers.size()", "++eg_el_nr", Block( [ Initializer( Value("node_number_t", "from_el_base"), "from_ers.start() + eg_el_nr*COL_COUNT" ), Initializer(Value("node_number_t", "to_el_base"), "to_ers.start() + eg_el_nr*ROW_COUNT"), Line(), For( "unsigned i = 0", "i < ROW_COUNT", "++i", Block( [ Initializer(Value("value_type", "drst_%d" % rst), 0) for rst in range(discr.dimensions) ] + [Line()] + [ For( "unsigned j = 0", "j < COL_COUNT", "++j", Block( [ S( "drst_%(rst)d += " "diffmat_rst%(rst)d(i, j)*field_it[from_el_base+j]" % {"rst": rst} ) for rst in range(discr.dimensions) ] ), ), Line(), ] + [ Assign("result%d_it[to_el_base+i]" % rst, "drst_%d" % rst) for rst in range(discr.dimensions) ] ), ), ] ), ), ] ) # }}} # {{{ compilation mod.add_function(FunctionBody(fdecl, fbody)) # print "----------------------------------------------------------------" # print mod.generate() # raw_input() compiled_func = mod.compile(self.discr.toolchain).diff if self.discr.instrumented: from hedge.tools import time_count_flop compiled_func = time_count_flop( compiled_func, discr.diff_timer, discr.diff_counter, discr.diff_flop_counter, flops=discr.dimensions * ( 2 * ldis.node_count() * len(elgroup.members) * ldis.node_count() # mul+add + 2 * discr.dimensions * len(elgroup.members) * ldis.node_count() ), increment=discr.dimensions, ) return compiled_func
from cgen import * from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_function( FunctionBody( FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []), Block([Statement('return "hello world"')]))) from codepy.toolchain import guess_toolchain cmod = mod.compile(guess_toolchain()) print cmod.greet()
import cgen as c from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_function( c.FunctionBody( c.FunctionDeclaration(c.Const(c.Pointer(c.Value("char", "greet"))), []), c.Block([c.Statement('return "hello world"')]) )) from codepy.toolchain import guess_toolchain cmod = mod.compile(guess_toolchain()) print(cmod.greet())
def _cusp_solver(M, parameters): cache_key = lambda t, p: (t, p['ksp_type'], p['pc_type'], p['ksp_rtol'], p['ksp_atol'], p['ksp_max_it'], p['ksp_gmres_restart'], p['ksp_monitor']) module = _cusp_cache.get(cache_key(M.ctype, parameters)) if module: return module import codepy.toolchain from cgen import FunctionBody, FunctionDeclaration from cgen import Block, Statement, Include, Value from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain() if 'CUSP_HOME' in os.environ: nvcc_toolchain.add_library('cusp', [os.environ['CUSP_HOME']], [], []) host_mod = BoostPythonModule() nvcc_mod = CudaModule(host_mod) nvcc_includes = ['thrust/device_vector.h', 'thrust/fill.h', 'cusp/csr_matrix.h', 'cusp/krylov/cg.h', 'cusp/krylov/bicgstab.h', 'cusp/krylov/gmres.h', 'cusp/precond/diagonal.h', 'cusp/precond/smoothed_aggregation.h', 'cusp/precond/ainv.h', 'string'] nvcc_mod.add_to_preamble([Include(s) for s in nvcc_includes]) nvcc_mod.add_to_preamble([Statement('using namespace std')]) # We're translating PETSc preconditioner types to CUSP diag = Statement('cusp::precond::diagonal< ValueType, cusp::device_memory >M(A)') ainv = Statement( 'cusp::precond::scaled_bridson_ainv< ValueType, cusp::device_memory >M(A)') amg = Statement( 'cusp::precond::smoothed_aggregation< IndexType, ValueType, cusp::device_memory >M(A)') none = Statement( 'cusp::identity_operator< ValueType, cusp::device_memory >M(nrows, ncols)') preconditioners = { 'diagonal': diag, 'jacobi': diag, 'ainv': ainv, 'ainvcusp': ainv, 'amg': amg, 'hypre': amg, 'none': none, None: none } try: precond_call = preconditioners[parameters['pc_type']] except KeyError: raise RuntimeError("Cusp does not support preconditioner type %s" % parameters['pc_type']) solvers = { 'cg': Statement('cusp::krylov::cg(A, x, b, monitor, M)'), 'bicgstab': Statement('cusp::krylov::bicgstab(A, x, b, monitor, M)'), 'gmres': Statement('cusp::krylov::gmres(A, x, b, %(ksp_gmres_restart)d, monitor, M)' % parameters) } try: solve_call = solvers[parameters['ksp_type']] except KeyError: raise RuntimeError("Cusp does not support solver type %s" % parameters['ksp_type']) monitor = 'monitor(b, %(ksp_max_it)d, %(ksp_rtol)g, %(ksp_atol)g)' % parameters nvcc_function = FunctionBody( FunctionDeclaration(Value('void', '__cusp_solve'), [Value('CUdeviceptr', '_rowptr'), Value('CUdeviceptr', '_colidx'), Value('CUdeviceptr', '_csrdata'), Value('CUdeviceptr', '_b'), Value('CUdeviceptr', '_x'), Value('int', 'nrows'), Value('int', 'ncols'), Value('int', 'nnz')]), Block([ Statement('typedef int IndexType'), Statement('typedef %s ValueType' % M.ctype), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<IndexType> > indices'), Statement( 'typedef typename cusp::array1d_view< thrust::device_ptr<ValueType> > values'), Statement( 'typedef cusp::csr_matrix_view< indices, indices, values, IndexType, ValueType, cusp::device_memory > matrix'), Statement('thrust::device_ptr< IndexType > rowptr((IndexType *)_rowptr)'), Statement('thrust::device_ptr< IndexType > colidx((IndexType *)_colidx)'), Statement('thrust::device_ptr< ValueType > csrdata((ValueType *)_csrdata)'), Statement('thrust::device_ptr< ValueType > d_b((ValueType *)_b)'), Statement('thrust::device_ptr< ValueType > d_x((ValueType *)_x)'), Statement('indices row_offsets(rowptr, rowptr + nrows + 1)'), Statement('indices column_indices(colidx, colidx + nnz)'), Statement('values matrix_values(csrdata, csrdata + nnz)'), Statement('values b(d_b, d_b + nrows)'), Statement('values x(d_x, d_x + ncols)'), Statement('thrust::fill(x.begin(), x.end(), (ValueType)0)'), Statement( 'matrix A(nrows, ncols, nnz, row_offsets, column_indices, matrix_values)'), Statement('cusp::%s_monitor< ValueType > %s' % ('verbose' if parameters['ksp_monitor'] else 'default', monitor)), precond_call, solve_call ])) host_mod.add_to_preamble([Include('boost/python/extract.hpp'), Include('string')]) host_mod.add_to_preamble([Statement('using namespace boost::python')]) host_mod.add_to_preamble([Statement('using namespace std')]) nvcc_mod.add_function(nvcc_function) host_mod.add_function( FunctionBody( FunctionDeclaration(Value('void', 'solve'), [Value('object', '_rowptr'), Value('object', '_colidx'), Value('object', '_csrdata'), Value('object', '_b'), Value('object', '_x'), Value('object', '_nrows'), Value('object', '_ncols'), Value('object', '_nnz')]), Block([ Statement( 'CUdeviceptr rowptr = extract<CUdeviceptr>(_rowptr.attr("gpudata"))'), Statement( 'CUdeviceptr colidx = extract<CUdeviceptr>(_colidx.attr("gpudata"))'), Statement( 'CUdeviceptr csrdata = extract<CUdeviceptr>(_csrdata.attr("gpudata"))'), Statement('CUdeviceptr b = extract<CUdeviceptr>(_b.attr("gpudata"))'), Statement('CUdeviceptr x = extract<CUdeviceptr>(_x.attr("gpudata"))'), Statement('int nrows = extract<int>(_nrows)'), Statement('int ncols = extract<int>(_ncols)'), Statement('int nnz = extract<int>(_nnz)'), Statement('__cusp_solve(rowptr, colidx, csrdata, b, x, nrows, ncols, nnz)') ]))) nvcc_toolchain.cflags.append('-arch') nvcc_toolchain.cflags.append('sm_20') nvcc_toolchain.cflags.append('-O3') module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=configuration["debug"]) _cusp_cache[cache_key(M.ctype, parameters)] = module return module
def make_diff(self, elgroup, dtype, shape): """ :param shape: If non-square, the resulting code takes two element_ranges arguments and supports non-square matrices. """ from hedge._internal import UniformElementRanges assert isinstance(elgroup.ranges, UniformElementRanges) ldis = elgroup.local_discretization discr = self.discr from cgen import ( FunctionDeclaration, FunctionBody, Typedef, Const, Reference, Value, POD, Statement, Include, Line, Block, Initializer, Assign, For, If, Define) from pytools import to_uncomplex_dtype from codepy.bpl import BoostPythonModule mod = BoostPythonModule() # {{{ preamble S = Statement mod.add_to_preamble([ Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Define("ROW_COUNT", shape[0]), Define("COL_COUNT", shape[1]), Define("DIMENSIONS", discr.dimensions), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) fdecl = FunctionDeclaration( Value("void", "diff"), [ Const(Reference(Value("uniform_element_ranges", "from_ers"))), Const(Reference(Value("uniform_element_ranges", "to_ers"))), Value("numpy_array<value_type>", "field") ]+[ Value("ublas::matrix<uncomplex_type>", "diffmat_rst%d" % rst) for rst in range(discr.dimensions) ]+[ Value("numpy_array<value_type>", "result%d" % i) for i in range(discr.dimensions) ] ) # }}} # {{{ set-up def make_it(name, is_const=True, tpname="value_type"): if is_const: const = "const_" else: const = "" return Initializer( Value("numpy_array<%s>::%siterator" % (tpname, const), name+"_it"), "%s.begin()" % name) fbody = Block([ If("ROW_COUNT != diffmat_rst%d.size1()" % i, S('throw(std::runtime_error("unexpected matrix size"))')) for i in range(discr.dimensions) ] + [ If("COL_COUNT != diffmat_rst%d.size2()" % i, S('throw(std::runtime_error("unexpected matrix size"))')) for i in range(discr.dimensions) ]+[ If("ROW_COUNT != to_ers.el_size()", S('throw(std::runtime_error("unsupported image element size"))')), If("COL_COUNT != from_ers.el_size()", S('throw(std::runtime_error("unsupported preimage element size"))')), If("from_ers.size() != to_ers.size()", S('throw(std::runtime_error("image and preimage element groups ' 'do nothave the same element count"))')), Line(), make_it("field"), ]+[ make_it("result%d" % i, is_const=False) for i in range(discr.dimensions) ]+[ Line(), # }}} # {{{ computation For("element_number_t eg_el_nr = 0", "eg_el_nr < to_ers.size()", "++eg_el_nr", Block([ Initializer( Value("node_number_t", "from_el_base"), "from_ers.start() + eg_el_nr*COL_COUNT"), Initializer( Value("node_number_t", "to_el_base"), "to_ers.start() + eg_el_nr*ROW_COUNT"), Line(), For("unsigned i = 0", "i < ROW_COUNT", "++i", Block([ Initializer(Value("value_type", "drst_%d" % rst), 0) for rst in range(discr.dimensions) ]+[ Line(), ]+[ For("unsigned j = 0", "j < COL_COUNT", "++j", Block([ S("drst_%(rst)d += " "diffmat_rst%(rst)d(i, j)*field_it[from_el_base+j]" % {"rst":rst}) for rst in range(discr.dimensions) ]) ), Line(), ]+[ Assign("result%d_it[to_el_base+i]" % rst, "drst_%d" % rst) for rst in range(discr.dimensions) ]) ) ]) ) ]) # }}} # {{{ compilation mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print mod.generate() #raw_input() compiled_func = mod.compile(self.discr.toolchain).diff if self.discr.instrumented: from hedge.tools import time_count_flop compiled_func = time_count_flop(compiled_func, discr.diff_timer, discr.diff_counter, discr.diff_flop_counter, flops=discr.dimensions*( 2 # mul+add * ldis.node_count() * len(elgroup.members) * ldis.node_count() + 2 * discr.dimensions * len(elgroup.members) * ldis.node_count()), increment=discr.dimensions) return compiled_func