def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build the OpenCL program for a model at the requested precision.

    *source* is converted to *dtype* with ``generate.convert_type``, prefixed
    with the half/double precision pragma when *dtype* needs one, and built
    for *context*.  When *fast* is true the build uses the options returned
    by ``get_fast_inaccurate_build_options`` for the first device.

    Returns the built program.

    Raises *RuntimeError* if any device in *context* does not support *dtype*.
    """
    dtype = np.dtype(dtype)
    for device in context.devices:
        if not has_type(device, dtype):
            raise RuntimeError("%s not supported for devices"%dtype)

    converted = generate.convert_type(source, dtype)

    # Precision pragma, if this dtype needs one.
    if dtype == generate.F16:
        pragma = [_F16_PRAGMA]
    elif dtype == generate.F64:
        pragma = [_F64_PRAGMA]
    else:
        pragma = []

    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL, so the
    # define is only emitted when the first device is a GPU.
    if context.devices[0].type == cl.device_type.GPU:
        sincos = ["#define USE_SINCOS\n"]
    else:
        sincos = []

    if fast:
        build_flags = get_fast_inaccurate_build_options(context.devices[0])
    else:
        build_flags = []

    # Final order: sincos define, precision pragma, converted source.
    text = "\n".join(sincos + pragma + [converted])
    return cl.Program(context, text).build(options=build_flags)
def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build a model to run on the devices of *context*.

    The model *source* is converted to the requested *dtype*, any required
    precision pragma is placed ahead of it, and the result is compiled.
    With *fast* set, the first device's fast/inaccurate build options are
    passed to the compiler; otherwise no build options are used.

    Returns the compiled program.

    Raises *RuntimeError* when at least one device in the context does not
    support *dtype*; there is no fallback to a lower precision.
    """
    dtype = np.dtype(dtype)
    supported = all(has_type(dev, dtype) for dev in context.devices)
    if not supported:
        raise RuntimeError("%s not supported for devices" % dtype)

    pieces = [generate.convert_type(source, dtype)]

    if dtype == generate.F16:
        pieces = [_F16_PRAGMA] + pieces
    elif dtype == generate.F64:
        pieces = [_F64_PRAGMA] + pieces

    # Note: USE_SINCOS makes the intel cpu slower under opencl, so it is
    # only defined when the first device is a GPU.
    if context.devices[0].type == cl.device_type.GPU:
        pieces = ["#define USE_SINCOS\n"] + pieces

    options = []
    if fast:
        options = get_fast_inaccurate_build_options(context.devices[0])

    program = cl.Program(context, "\n".join(pieces))
    return program.build(options=options)
def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Compile model *source* for *context* using floating point type *dtype*.

    Steps: check that every device supports *dtype*, convert the source to
    that type, prepend the matching precision pragma (F16 or F64 only) and,
    for a GPU first device, the ``USE_SINCOS`` define, then build.

    *fast* selects the build options from
    ``get_fast_inaccurate_build_options`` for the first device.

    Returns the built program.

    Raises *RuntimeError* if *dtype* is unsupported on any device.
    """
    dtype = np.dtype(dtype)
    if any(not has_type(d, dtype) for d in context.devices):
        raise RuntimeError("%s not supported for devices"%dtype)

    body = generate.convert_type(source, dtype)

    # Header lines are gathered in the order they appear in the final text.
    header = []
    if dtype == generate.F16:
        header.append(_F16_PRAGMA)
    elif dtype == generate.F64:
        header.append(_F64_PRAGMA)

    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        header.insert(0, "#define USE_SINCOS\n")

    options = (get_fast_inaccurate_build_options(context.devices[0])
               if fast else [])
    full_source = "\n".join(header + [body])
    return cl.Program(context, full_source).build(options=options)
# Module-level OpenCL setup.  Everything below runs at import time.

# Local aliases for helpers provided by opencl_profile.
cl_elementwise = opencl_profile.cl_elementwise
cl_scan = opencl_profile.cl_scan
cl_random = opencl_profile.cl_random

# Integer value of the PROFILING_ENABLE command-queue property.
profile_properties = int(cl.command_queue_properties.PROFILING_ENABLE)
import pyopencl.characterize as clchar;

# Platform and device are picked by the indices openc_platform_index and
# openc_device_index, which are defined outside this section.
platform = cl.get_platforms()[openc_platform_index]
device = platform.get_devices()[openc_device_index];
print "Selecting platform: %s. device: %s" % (platform,device)

# The shared context spans every device of the selected platform, not only
# the selected device.
_ctx = cl.Context(platform.get_devices());

# Build options and work-item limits are queried from the selected device.
fast_clargs = clchar.get_fast_inaccurate_build_options(device)
work_item_sizes = device.get_info(cl.device_info.MAX_WORK_ITEM_SIZES);
def get_a_context(cpu=False):
    """Return an OpenCL context.

    With ``cpu`` false (the default) the shared module-level context
    ``_ctx`` is returned.  With ``cpu`` true a new context is created on
    every call from the devices of the platform at index 1.
    """
    if cpu:
        # NOTE(review): presumably platform index 1 is the CPU platform on
        # the author's machine -- verify; the index is hard-coded.
        return cl.Context(cl.get_platforms()[1].get_devices())
    return _ctx;
# NOTE(review): the template literal below is cut off in this view; its
# remainder and closing quotes are not shown here.
_inkernal_reduction_template = Template(""" if ($blockSize >= 1024) { if ($offset < 512) { $sdata[$offset] += $sdata[$offset + 512*($stepsize)]; }
# Module-level OpenCL setup.  Everything below runs at import time.

# Local aliases for helpers provided by opencl_profile.
cl_reduction = opencl_profile.cl_reduction
cl_elementwise = opencl_profile.cl_elementwise
cl_scan = opencl_profile.cl_scan
cl_random = opencl_profile.cl_random

# Integer value of the PROFILING_ENABLE command-queue property.
profile_properties = int(cl.command_queue_properties.PROFILING_ENABLE)
import pyopencl.characterize as clchar

# Platform and device are picked by the indices openc_platform_index and
# openc_device_index, which are defined outside this section.
platform = cl.get_platforms()[openc_platform_index]
device = platform.get_devices()[openc_device_index]
print "Selecting platform: %s. device: %s" % (platform, device)

# The shared context spans every device of the selected platform, not only
# the selected device.
_ctx = cl.Context(platform.get_devices())

# Build options and work-item limits are queried from the selected device.
fast_clargs = clchar.get_fast_inaccurate_build_options(device)
work_item_sizes = device.get_info(cl.device_info.MAX_WORK_ITEM_SIZES)


def get_a_context(cpu=False):
    """Return an OpenCL context.

    With ``cpu`` false (the default) the shared module-level context
    ``_ctx`` is returned.  With ``cpu`` true a new context is created on
    every call from the devices of the platform at index 1.
    """
    if cpu:
        # NOTE(review): presumably platform index 1 is the CPU platform on
        # the author's machine -- verify; the index is hard-coded.
        return cl.Context(cl.get_platforms()[1].get_devices())
    return _ctx


# NOTE(review): the template literal below is cut off in this view; its
# remainder and closing quotes are not shown here.
_inkernal_reduction_template = Template(""" if ($blockSize >= 1024) {
def __init__(self, queue, cl_discr_info, dtype=np.float64, profile=False):
    """Build the loopy volume and surface kernels and record cost counts.

    :arg queue: only ``queue.context`` is read here; its first device
        determines the build options passed to both kernels.
    :arg cl_discr_info: stored on ``self``; its ``discr`` attribute
        supplies ``K`` and ``ldis`` (``Np``, ``Nfp``, ``Nfaces``).
    :arg dtype: scalar type used for kernel arguments and for the
        ``volume_bytes`` count.
    :arg profile: stored as ``self.profile``; not otherwise used in the
        part of the method visible here.

    Sets ``discr``, ``cl_discr_info``, ``profile``, ``volume_kernel``,
    ``volume_flops``, ``volume_bytes``, ``surface_kernel`` and
    ``surface_flops`` on the instance.
    """
    context = queue.context
    discr = self.discr = cl_discr_info.discr
    self.cl_discr_info = cl_discr_info
    self.profile = profile

    import pyopencl as cl
    import pyopencl.array  # noqa

    # Look up the 4-component vector type for the requested scalar dtype.
    dtype4 = cl.array.vec.types[np.dtype(dtype), 4]

    ldis = discr.ldis

    from pyopencl.characterize import get_fast_inaccurate_build_options
    # Build options are taken from the first device of the context only.
    build_options = get_fast_inaccurate_build_options(context.devices[0])

    # {{{ volume kernel

    import loopy as lp
    # NOTE(review): the instruction string is reproduced exactly as it
    # appears in this file, on a single line.  Confirm against the upstream
    # source whether statement-separating line breaks were lost.
    volume_kernel = lp.make_kernel([
        "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
        ],
        """ <> du_drst = sum(m, DrDsDt[n,m]*u[k,m]) <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m]) <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m]) <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m]) rhsu[k,n] = - dot(drst_dx[k],dp_drst) rhsv[k,n] = - dot(drst_dy[k],dp_drst) rhsw[k,n] = - dot(drst_dz[k],dp_drst) rhsp[k,n] = - (dot(drst_dx[k], du_drst) + dot(drst_dy[k], dv_drst) \ + dot(drst_dz[k], dw_drst)) """,
        [
            # DrDsDt is declared with the 4-component vector type.
            lp.GlobalArg("DrDsDt", dtype4, shape="Np, Np", order="F"),
            "...",
            ],
        name="dg_volume", assumptions="K>=1",
        defines=dict(Np=discr.ldis.Np),
        options=dict(no_numpy=True, cl_build_options=build_options))

    def transform_vol(knl):
        """Apply iname tags and prefetches to the volume kernel."""
        # "l.0" / "g.0" are loopy iname tags for local / group axis 0.
        knl = lp.tag_inames(knl, dict(n="l.0", k="g.0"))
        #knl = lp.change_arg_to_image(knl, "DrDsDt")
        # knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        for name in ["u", "v", "w", "p"]:
            knl = lp.add_prefetch(knl, "%s[k,:]" % name)
        for name in ["drst_dx", "drst_dy", "drst_dz"]:
            knl = lp.add_prefetch(knl, "%s" % name)
        knl = lp.add_prefetch(knl, "DrDsDt")
        return knl

    self.volume_kernel = transform_vol(volume_kernel)

    # NOTE(review): the constants in the counts below are hand-written by
    # the author; their derivation is not shown in this file section.
    self.volume_flops = discr.K * (
            (
                4  # num components
                * 3 *discr.ldis.Np**2*2
                )
            +
            (
                (3*2-1)*discr.ldis.Np * 6
                )
            +
            2)

    self.volume_bytes = np.dtype(dtype).itemsize * discr.K * (
            (
                4  # num components
                * 2  # load, store
                * discr.ldis.Np
                )
            +
            # matrix
            3 * discr.ldis.Np**2
            +
            # geometric factors
            6*4)

    # }}}

    # {{{ surface kernel

    # Total number of face nodes per element: faces times nodes per face.
    NfpNfaces = ldis.Nfaces*ldis.Nfp

    # NOTE(review): as with the volume kernel, the instruction string is
    # kept exactly as it appears in this file (single line) -- confirm the
    # intended line breaks against the upstream source.
    surface_kernel = lp.make_kernel(
            "{[m,mp,n,k]: 0<= m,mp < NfpNfaces and 0<= n < Np and 0<= k < K }",
            """ <> idP = vmapP[k,m] <> idM = vmapM[k,m] <> du = u[[idP]]-u[[idM]] <> dv = v[[idP]]-v[[idM]] <> dw = w[[idP]]-w[[idM]] <> dp = bc[k,m]*p[[idP]] - p[[idM]] <> dQ = 0.5*Fscale[k,m]* \ (dp - nx[k,m]*du - ny[k,m]*dv - nz[k,m]*dw) <> fluxu[m] = -nx[k,m]*dQ <> fluxv[m] = -ny[k,m]*dQ <> fluxw[m] = -nz[k,m]*dQ <> fluxp[m] = dQ # reduction here rhsu[k,n] = rhsu[k,n] + sum(mp, LIFT[n,mp]*fluxu[mp]) rhsv[k,n] = rhsv[k,n] + sum(mp, LIFT[n,mp]*fluxv[mp]) rhsw[k,n] = rhsw[k,n] + sum(mp, LIFT[n,mp]*fluxw[mp]) rhsp[k,n] = rhsp[k,n] + sum(mp, LIFT[n,mp]*fluxp[mp]) """,
            [
                lp.GlobalArg("u,v,w,p", dtype, shape="K, Np", order="C"),
                lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="F"),
                "...",
                ],
            name="dg_surface", assumptions="K>=1",
            defines=dict(Np=ldis.Np, Nfp=ldis.Nfp, NfpNfaces=NfpNfaces),
            options=dict(no_numpy=True, cl_build_options=build_options))

    def transform_surface_kernel(knl):
        """Apply iname tags, an unrolled split and prefetches to the surface kernel."""
        #print knl
        # Both n and m are tagged "l.0"; k is tagged "g.0".
        knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
        # Split the reduction iname mp by 4, tagging the inner part "unr".
        knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
        knl = lp.add_prefetch(knl, "LIFT")
        for name in ["nx", "ny", "nz", "Fscale", "bc"]:
            knl = lp.add_prefetch(knl, name)
        knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
        return knl

    self.surface_kernel = transform_surface_kernel(surface_kernel)

    self.surface_flops = (discr.K
            * (
                NfpNfaces*15
                +
                4*discr.ldis.Np*NfpNfaces*2
                ))
def __init__(self, queue, cl_discr_info, dtype=np.float64, profile=False):
    """Build the loopy volume and surface kernels and record cost counts.

    :arg queue: only ``queue.context`` is read here; its first device
        determines the build options passed to both kernels.
    :arg cl_discr_info: stored on ``self``; its ``discr`` attribute
        supplies ``K`` and ``ldis`` (``Np``, ``Nfp``, ``Nfaces``).
    :arg dtype: scalar type used for kernel arguments and for the
        ``volume_bytes`` count.
    :arg profile: stored as ``self.profile``; not otherwise used in the
        part of the method visible here.

    Sets ``discr``, ``cl_discr_info``, ``profile``, ``volume_kernel``,
    ``volume_flops``, ``volume_bytes``, ``surface_kernel`` and
    ``surface_flops`` on the instance.
    """
    context = queue.context
    discr = self.discr = cl_discr_info.discr
    self.cl_discr_info = cl_discr_info
    self.profile = profile

    import pyopencl as cl
    import pyopencl.array  # noqa

    # Look up the 4-component vector type for the requested scalar dtype.
    dtype4 = cl.array.vec.types[np.dtype(dtype), 4]

    ldis = discr.ldis

    from pyopencl.characterize import get_fast_inaccurate_build_options
    # Build options are taken from the first device of the context only.
    build_options = get_fast_inaccurate_build_options(context.devices[0])

    # {{{ volume kernel

    import loopy as lp
    # NOTE(review): the instruction string is reproduced exactly as it
    # appears in this file, on a single line.  Confirm against the upstream
    # source whether statement-separating line breaks were lost.
    volume_kernel = lp.make_kernel(
        [
            "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
        ],
        """ <> du_drst = sum(m, DrDsDt[n,m]*u[k,m]) <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m]) <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m]) <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m]) rhsu[k,n] = - dot(drst_dx[k],dp_drst) rhsv[k,n] = - dot(drst_dy[k],dp_drst) rhsw[k,n] = - dot(drst_dz[k],dp_drst) rhsp[k,n] = - (dot(drst_dx[k], du_drst) + dot(drst_dy[k], dv_drst) \ + dot(drst_dz[k], dw_drst)) """,
        [
            # DrDsDt is declared with the 4-component vector type.
            lp.GlobalArg("DrDsDt", dtype4, shape="Np, Np", order="F"),
            "...",
        ],
        name="dg_volume",
        assumptions="K>=1",
        defines=dict(Np=discr.ldis.Np),
        options=dict(no_numpy=True, cl_build_options=build_options))

    def transform_vol(knl):
        """Apply iname tags and prefetches to the volume kernel."""
        # "l.0" / "g.0" are loopy iname tags for local / group axis 0.
        knl = lp.tag_inames(knl, dict(n="l.0", k="g.0"))
        #knl = lp.change_arg_to_image(knl, "DrDsDt")
        # knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        for name in ["u", "v", "w", "p"]:
            knl = lp.add_prefetch(knl, "%s[k,:]" % name)
        for name in ["drst_dx", "drst_dy", "drst_dz"]:
            knl = lp.add_prefetch(knl, "%s" % name)
        knl = lp.add_prefetch(knl, "DrDsDt")
        return knl

    self.volume_kernel = transform_vol(volume_kernel)

    # NOTE(review): the constants in the counts below are hand-written by
    # the author; their derivation is not shown in this file section.
    self.volume_flops = discr.K * ((
        4  # num components
        * 3 * discr.ldis.Np**2 * 2) + ((3 * 2 - 1) * discr.ldis.Np * 6) + 2)

    self.volume_bytes = np.dtype(dtype).itemsize * discr.K * (
        (
            4  # num components
            * 2  # load, store
            * discr.ldis.Np) +
        # matrix
        3 * discr.ldis.Np**2 +
        # geometric factors
        6 * 4)

    # }}}

    # {{{ surface kernel

    # Total number of face nodes per element: faces times nodes per face.
    NfpNfaces = ldis.Nfaces * ldis.Nfp

    # NOTE(review): as with the volume kernel, the instruction string is
    # kept exactly as it appears in this file (single line) -- confirm the
    # intended line breaks against the upstream source.
    surface_kernel = lp.make_kernel(
        "{[m,mp,n,k]: 0<= m,mp < NfpNfaces and 0<= n < Np and 0<= k < K }",
        """ <> idP = vmapP[k,m] <> idM = vmapM[k,m] <> du = u[[idP]]-u[[idM]] <> dv = v[[idP]]-v[[idM]] <> dw = w[[idP]]-w[[idM]] <> dp = bc[k,m]*p[[idP]] - p[[idM]] <> dQ = 0.5*Fscale[k,m]* \ (dp - nx[k,m]*du - ny[k,m]*dv - nz[k,m]*dw) <> fluxu[m] = -nx[k,m]*dQ <> fluxv[m] = -ny[k,m]*dQ <> fluxw[m] = -nz[k,m]*dQ <> fluxp[m] = dQ # reduction here rhsu[k,n] = rhsu[k,n] + sum(mp, LIFT[n,mp]*fluxu[mp]) rhsv[k,n] = rhsv[k,n] + sum(mp, LIFT[n,mp]*fluxv[mp]) rhsw[k,n] = rhsw[k,n] + sum(mp, LIFT[n,mp]*fluxw[mp]) rhsp[k,n] = rhsp[k,n] + sum(mp, LIFT[n,mp]*fluxp[mp]) """,
        [
            lp.GlobalArg("u,v,w,p", dtype, shape="K, Np", order="C"),
            lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="F"),
            "...",
        ],
        name="dg_surface",
        assumptions="K>=1",
        defines=dict(Np=ldis.Np, Nfp=ldis.Nfp, NfpNfaces=NfpNfaces),
        options=dict(no_numpy=True, cl_build_options=build_options))

    def transform_surface_kernel(knl):
        """Apply iname tags, an unrolled split and prefetches to the surface kernel."""
        #print knl
        # Both n and m are tagged "l.0"; k is tagged "g.0".
        knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
        # Split the reduction iname mp by 4, tagging the inner part "unr".
        knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
        knl = lp.add_prefetch(knl, "LIFT")
        for name in ["nx", "ny", "nz", "Fscale", "bc"]:
            knl = lp.add_prefetch(knl, name)
        knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
        return knl

    self.surface_kernel = transform_surface_kernel(surface_kernel)

    self.surface_flops = (
        discr.K * (NfpNfaces * 15 + 4 * discr.ldis.Np * NfpNfaces * 2))