Exemplo n.º 1
0
def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Compile OpenCL *source* into a program for *context*.

    *dtype* is normalized with ``np.dtype`` and the kernel text is produced
    by ``generate.convert_type(source, dtype)``.  ``_F16_PRAGMA`` or
    ``_F64_PRAGMA`` is placed ahead of the kernel text when *dtype* equals
    ``generate.F16`` or ``generate.F64`` respectively, and ``USE_SINCOS`` is
    defined ahead of everything when the first device of the context is a GPU.

    If *fast* is true the build options are those returned by
    ``get_fast_inaccurate_build_options`` for the first device; otherwise
    an empty option list is used.

    Returns the built program (only the program; no type is returned).

    Raises RuntimeError if ``has_type`` is false for any device of the
    context.
    """
    dtype = np.dtype(dtype)
    for device in context.devices:
        if not has_type(device, dtype):
            raise RuntimeError("%s not supported for devices"%dtype)

    converted = generate.convert_type(source, dtype)

    # Pieces that must precede the converted kernel text, in final order.
    if dtype == generate.F16:
        prefix = [_F16_PRAGMA]
    elif dtype == generate.F64:
        prefix = [_F64_PRAGMA]
    else:
        prefix = []

    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL, so it is
    # only defined when the first device is a GPU.
    if context.devices[0].type == cl.device_type.GPU:
        prefix = ["#define USE_SINCOS\n"] + prefix

    if fast:
        build_flags = get_fast_inaccurate_build_options(context.devices[0])
    else:
        build_flags = []

    program_text = "\n".join(prefix + [converted])
    return cl.Program(context, program_text).build(options=build_flags)
Exemplo n.º 2
0
def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build the OpenCL program for a model at the requested precision.

    The kernel text is ``generate.convert_type(source, dtype)``, preceded by
    ``_F16_PRAGMA`` or ``_F64_PRAGMA`` when *dtype* equals ``generate.F16``
    or ``generate.F64``, and by a ``USE_SINCOS`` define when the first device
    of *context* is a GPU.  Build options are taken from
    ``get_fast_inaccurate_build_options`` for the first device when *fast*
    is true, and are empty otherwise.

    Returns the built program.

    Raises RuntimeError when ``has_type`` is false for any device in the
    context; there is no fallback to a different precision.
    """
    dtype = np.dtype(dtype)
    if any(not has_type(dev, dtype) for dev in context.devices):
        raise RuntimeError("%s not supported for devices" % dtype)

    body = generate.convert_type(source, dtype)

    # Optional precision pragma that goes directly before the kernel text.
    pragma = []
    if dtype == generate.F16:
        pragma.append(_F16_PRAGMA)
    elif dtype == generate.F64:
        pragma.append(_F64_PRAGMA)

    # Note: USE_SINCOS makes the intel cpu slower under opencl, hence the
    # GPU-only define; it must come before the pragma.
    sincos = []
    if context.devices[0].type == cl.device_type.GPU:
        sincos.append("#define USE_SINCOS\n")

    options = []
    if fast:
        options = get_fast_inaccurate_build_options(context.devices[0])

    full_source = "\n".join(sincos + pragma + [body])
    program = cl.Program(context, full_source)
    return program.build(options=options)
Exemplo n.º 3
0
def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Turn model *source* into a built OpenCL program for *context*.

    Steps, in order:

    1. Normalize *dtype* with ``np.dtype`` and require ``has_type`` to hold
       for every device of the context (RuntimeError otherwise; no
       lower-precision fallback is attempted).
    2. Convert the source with ``generate.convert_type``.
    3. Prepend ``_F16_PRAGMA`` / ``_F64_PRAGMA`` when *dtype* equals
       ``generate.F16`` / ``generate.F64``.
    4. Prepend a ``USE_SINCOS`` define when the first device is a GPU.
    5. Build, passing the options from
       ``get_fast_inaccurate_build_options`` for the first device if *fast*
       is true, else an empty list.

    Returns the built program.
    """
    dtype = np.dtype(dtype)
    for dev in context.devices:
        if has_type(dev, dtype):
            continue
        raise RuntimeError("%s not supported for devices"%dtype)

    kernel_text = generate.convert_type(source, dtype)

    if dtype == generate.F16:
        parts = [_F16_PRAGMA, kernel_text]
    elif dtype == generate.F64:
        parts = [_F64_PRAGMA, kernel_text]
    else:
        parts = [kernel_text]

    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        parts = ["#define USE_SINCOS\n"] + parts

    if fast:
        build_opts = get_fast_inaccurate_build_options(context.devices[0])
    else:
        build_opts = []

    return cl.Program(context, "\n".join(parts)).build(options=build_opts)
Exemplo n.º 4
0
    cl_elementwise = opencl_profile.cl_elementwise
    cl_scan = opencl_profile.cl_scan    
    cl_random = opencl_profile.cl_random    
    profile_properties = int(cl.command_queue_properties.PROFILING_ENABLE)

import pyopencl.characterize as clchar;


# Module-level OpenCL setup, executed at import time.
# The platform/device indices (openc_platform_index, openc_device_index)
# are defined outside this excerpt.
platform = cl.get_platforms()[openc_platform_index]
device = platform.get_devices()[openc_device_index];

# Python 2 print statement: reports the selection on stdout.
print "Selecting platform: %s. device: %s" % (platform,device)

# Shared context; note it is created over *all* devices of the platform,
# not only the selected `device`.
_ctx = cl.Context(platform.get_devices()); 

# Build options from pyopencl.characterize for the selected device, and
# that device's MAX_WORK_ITEM_SIZES info value.
fast_clargs = clchar.get_fast_inaccurate_build_options(device)
work_item_sizes = device.get_info(cl.device_info.MAX_WORK_ITEM_SIZES);


def get_a_context(cpu=False):
    """
    Return an OpenCL context.

    With *cpu* false (the default) the module-level ``_ctx`` is returned.
    With *cpu* true a new context is created over all devices of the
    platform at index 1 of ``cl.get_platforms()``.
    NOTE(review): index 1 is presumably the CPU platform on the author's
    machine -- confirm before relying on it elsewhere.
    """
    if not cpu:
        return _ctx

    cpu_platform = cl.get_platforms()[1]
    return cl.Context(cpu_platform.get_devices())
    
_inkernal_reduction_template = Template("""


        if ($blockSize >= 1024) 
        { 
            if ($offset < 512) { $sdata[$offset] += $sdata[$offset + 512*($stepsize)]; } 
    cl_reduction = opencl_profile.cl_reduction
    cl_elementwise = opencl_profile.cl_elementwise
    cl_scan = opencl_profile.cl_scan
    cl_random = opencl_profile.cl_random
    profile_properties = int(cl.command_queue_properties.PROFILING_ENABLE)

import pyopencl.characterize as clchar

# Module-level OpenCL setup, executed at import time.
# The platform/device indices (openc_platform_index, openc_device_index)
# are defined outside this excerpt.
platform = cl.get_platforms()[openc_platform_index]
device = platform.get_devices()[openc_device_index]

# Python 2 print statement: reports the selection on stdout.
print "Selecting platform: %s. device: %s" % (platform, device)

# Shared context; note it is created over *all* devices of the platform,
# not only the selected `device`.
_ctx = cl.Context(platform.get_devices())

# Build options from pyopencl.characterize for the selected device, and
# that device's MAX_WORK_ITEM_SIZES info value.
fast_clargs = clchar.get_fast_inaccurate_build_options(device)
work_item_sizes = device.get_info(cl.device_info.MAX_WORK_ITEM_SIZES)


def get_a_context(cpu=False):
    """
    Hand out an OpenCL context.

    Returns the shared module-level ``_ctx`` unless *cpu* is true, in which
    case a fresh context is built over every device of the platform at
    index 1 of ``cl.get_platforms()``.
    NOTE(review): the hard-coded index 1 is presumably a CPU platform --
    verify on the target machine.
    """
    return cl.Context(cl.get_platforms()[1].get_devices()) if cpu else _ctx


_inkernal_reduction_template = Template("""


        if ($blockSize >= 1024) 
        { 
Exemplo n.º 6
0
    def __init__(self, queue, cl_discr_info, dtype=np.float64,
            profile=False):
        """
        Build the loopy volume and surface kernels and their cost estimates.

        Stores ``discr``, ``cl_discr_info`` and ``profile`` on the instance,
        creates the ``dg_volume`` and ``dg_surface`` loopy kernels, applies
        the local transformations defined below, and keeps the results as
        ``self.volume_kernel`` and ``self.surface_kernel``.  It also records
        ``self.volume_flops``, ``self.volume_bytes`` and
        ``self.surface_flops``.

        :arg queue: object whose ``context`` attribute supplies the OpenCL
            context (presumably a ``pyopencl.CommandQueue`` -- confirm at
            the call site).
        :arg cl_discr_info: object exposing the discretization as ``.discr``.
        :arg dtype: scalar type used for the kernel arguments and for the
            byte-count estimate.
        :arg profile: stored as ``self.profile``; not otherwise used in the
            part of the method visible here.
        """
        context = queue.context
        discr = self.discr = cl_discr_info.discr
        self.cl_discr_info = cl_discr_info

        self.profile = profile

        import pyopencl as cl
        # Imported so that the ``cl.array`` submodule is available below.
        import pyopencl.array  # noqa

        # 4-component vector type corresponding to the scalar dtype.
        dtype4 = cl.array.vec.types[np.dtype(dtype), 4]

        ldis = discr.ldis

        # Build options are queried for the first device of the context only.
        from pyopencl.characterize import get_fast_inaccurate_build_options
        build_options = get_fast_inaccurate_build_options(context.devices[0])

        # {{{ volume kernel

        # Domain: n and m range over [0, Np), k over [0, K).
        # The trailing backslash inside the (non-raw) instruction string is
        # consumed by Python, so the rhsp assignment reaches loopy as a
        # single line.
        # Only DrDsDt is declared explicitly (with the 4-vector type); the
        # "..." entry leaves the remaining arguments to loopy.
        import loopy as lp
        volume_kernel = lp.make_kernel([
            "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
            ],
            """
                <> du_drst = sum(m, DrDsDt[n,m]*u[k,m])
                <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m])
                <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m])
                <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m])

                rhsu[k,n] = - dot(drst_dx[k],dp_drst)
                rhsv[k,n] = - dot(drst_dy[k],dp_drst)
                rhsw[k,n] = - dot(drst_dz[k],dp_drst)
                rhsp[k,n] = - (dot(drst_dx[k], du_drst) + dot(drst_dy[k], dv_drst) \
                    + dot(drst_dz[k], dw_drst))
                """,
            [
                lp.GlobalArg("DrDsDt", dtype4, shape="Np, Np", order="F"),
                "...",
                ],
            name="dg_volume", assumptions="K>=1",
            defines=dict(Np=discr.ldis.Np),
            options=dict(no_numpy=True, cl_build_options=build_options))

        def transform_vol(knl):
            """Tag the n/k inames and add prefetches for all volume inputs."""
            # n is tagged "l.0" and k "g.0" (loopy's local/group axis tags).
            knl = lp.tag_inames(knl, dict(n="l.0", k="g.0"))
            #knl = lp.change_arg_to_image(knl, "DrDsDt")

            # knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
            # Prefetch the k-th row of each field, then the geometric
            # factors and the DrDsDt matrix as whole arrays.
            for name in ["u", "v", "w", "p"]:
                knl = lp.add_prefetch(knl, "%s[k,:]" % name)
            for name in ["drst_dx", "drst_dy", "drst_dz"]:
                knl = lp.add_prefetch(knl, "%s" % name)
            knl = lp.add_prefetch(knl, "DrDsDt")
            return knl

        self.volume_kernel = transform_vol(volume_kernel)

        # NOTE(review): operation-count estimate scaled by discr.K; the
        # derivation of the individual constants is not shown here -- confirm
        # before changing the kernel.
        self.volume_flops = discr.K * (
                (
                    4  # num components
                    * 3
                    *discr.ldis.Np**2*2
                    )
                +
                (
                    (3*2-1)*discr.ldis.Np * 6
                    )
                + 2)

        # Memory-traffic estimate: item size of dtype times discr.K times a
        # per-k term made of the three labelled contributions below.
        self.volume_bytes = np.dtype(dtype).itemsize * discr.K * (
                (
                    4  # num components
                    * 2  # load, store
                    * discr.ldis.Np
                    )
                +
                # matrix
                3
                * discr.ldis.Np**2
                +
                # geometric factors
                6*4)

        # }}}

        # {{{ surface kernel

        # Product of ldis.Nfaces and ldis.Nfp (presumably the number of face
        # nodes per element -- confirm against the discretization class).
        NfpNfaces = ldis.Nfaces*ldis.Nfp

        # Domain: m and mp range over [0, NfpNfaces), n over [0, Np),
        # k over [0, K).  The flux temporaries are indexed by m and then
        # reduced over mp into the rhs* arrays, which are updated in place.
        surface_kernel = lp.make_kernel(
                "{[m,mp,n,k]: 0<= m,mp < NfpNfaces and 0<= n < Np and 0<= k < K }",
                """
                    <> idP = vmapP[k,m]
                    <> idM = vmapM[k,m]

                    <> du = u[[idP]]-u[[idM]]
                    <> dv = v[[idP]]-v[[idM]]
                    <> dw = w[[idP]]-w[[idM]]
                    <> dp = bc[k,m]*p[[idP]] - p[[idM]]

                    <> dQ = 0.5*Fscale[k,m]* \
                            (dp - nx[k,m]*du - ny[k,m]*dv - nz[k,m]*dw)

                    <> fluxu[m] = -nx[k,m]*dQ
                    <> fluxv[m] = -ny[k,m]*dQ
                    <> fluxw[m] = -nz[k,m]*dQ
                    <> fluxp[m] =          dQ

                    # reduction here
                    rhsu[k,n] = rhsu[k,n] + sum(mp, LIFT[n,mp]*fluxu[mp])
                    rhsv[k,n] = rhsv[k,n] + sum(mp, LIFT[n,mp]*fluxv[mp])
                    rhsw[k,n] = rhsw[k,n] + sum(mp, LIFT[n,mp]*fluxw[mp])
                    rhsp[k,n] = rhsp[k,n] + sum(mp, LIFT[n,mp]*fluxp[mp])
                    """,
                [
                    lp.GlobalArg("u,v,w,p", dtype, shape="K, Np", order="C"),
                    lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="F"),
                    "...",
                    ],
                name="dg_surface", assumptions="K>=1",
                defines=dict(Np=ldis.Np, Nfp=ldis.Nfp, NfpNfaces=NfpNfaces),
                options=dict(no_numpy=True, cl_build_options=build_options))

        def transform_surface_kernel(knl):
            """Tag inames, split the reduction iname and add prefetches."""
            #print knl
            # Both n and m are tagged "l.0"; k is tagged "g.0".
            knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
            # Split mp by 4; the inner part carries the "unr" tag.
            knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
            knl = lp.add_prefetch(knl, "LIFT")
            for name in ["nx", "ny", "nz", "Fscale", "bc"]:
                knl = lp.add_prefetch(knl, name)
            knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
            return knl

        self.surface_kernel = transform_surface_kernel(surface_kernel)

        # NOTE(review): operation-count estimate for the surface kernel; the
        # constant 15 is presumably the per-face-node flux cost -- confirm.
        self.surface_flops = (discr.K
                * (
                    NfpNfaces*15
                    +
                    4*discr.ldis.Np*NfpNfaces*2
                    ))
Exemplo n.º 7
0
    def __init__(self, queue, cl_discr_info, dtype=np.float64, profile=False):
        """
        Build the loopy volume and surface kernels and their cost estimates.

        Stores ``discr``, ``cl_discr_info`` and ``profile`` on the instance,
        creates the ``dg_volume`` and ``dg_surface`` loopy kernels, applies
        the local transformations defined below, and keeps the results as
        ``self.volume_kernel`` and ``self.surface_kernel``.  It also records
        ``self.volume_flops``, ``self.volume_bytes`` and
        ``self.surface_flops``.

        :arg queue: object whose ``context`` attribute supplies the OpenCL
            context (presumably a ``pyopencl.CommandQueue`` -- confirm at
            the call site).
        :arg cl_discr_info: object exposing the discretization as ``.discr``.
        :arg dtype: scalar type used for the kernel arguments and for the
            byte-count estimate.
        :arg profile: stored as ``self.profile``; not otherwise used in the
            part of the method visible here.
        """
        context = queue.context
        discr = self.discr = cl_discr_info.discr
        self.cl_discr_info = cl_discr_info

        self.profile = profile

        import pyopencl as cl
        # Imported so that the ``cl.array`` submodule is available below.
        import pyopencl.array  # noqa

        # 4-component vector type corresponding to the scalar dtype.
        dtype4 = cl.array.vec.types[np.dtype(dtype), 4]

        ldis = discr.ldis

        # Build options are queried for the first device of the context only.
        from pyopencl.characterize import get_fast_inaccurate_build_options
        build_options = get_fast_inaccurate_build_options(context.devices[0])

        # {{{ volume kernel

        # Domain: n and m range over [0, Np), k over [0, K).
        # The trailing backslash inside the (non-raw) instruction string is
        # consumed by Python, so the rhsp assignment reaches loopy as a
        # single line.
        # Only DrDsDt is declared explicitly (with the 4-vector type); the
        # "..." entry leaves the remaining arguments to loopy.
        import loopy as lp
        volume_kernel = lp.make_kernel(
            [
                "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
            ],
            """
                <> du_drst = sum(m, DrDsDt[n,m]*u[k,m])
                <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m])
                <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m])
                <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m])

                rhsu[k,n] = - dot(drst_dx[k],dp_drst)
                rhsv[k,n] = - dot(drst_dy[k],dp_drst)
                rhsw[k,n] = - dot(drst_dz[k],dp_drst)
                rhsp[k,n] = - (dot(drst_dx[k], du_drst) + dot(drst_dy[k], dv_drst) \
                    + dot(drst_dz[k], dw_drst))
                """, [
                lp.GlobalArg("DrDsDt", dtype4, shape="Np, Np", order="F"),
                "...",
            ],
            name="dg_volume",
            assumptions="K>=1",
            defines=dict(Np=discr.ldis.Np),
            options=dict(no_numpy=True, cl_build_options=build_options))

        def transform_vol(knl):
            """Tag the n/k inames and add prefetches for all volume inputs."""
            # n is tagged "l.0" and k "g.0" (loopy's local/group axis tags).
            knl = lp.tag_inames(knl, dict(n="l.0", k="g.0"))
            #knl = lp.change_arg_to_image(knl, "DrDsDt")

            # knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
            # Prefetch the k-th row of each field, then the geometric
            # factors and the DrDsDt matrix as whole arrays.
            for name in ["u", "v", "w", "p"]:
                knl = lp.add_prefetch(knl, "%s[k,:]" % name)
            for name in ["drst_dx", "drst_dy", "drst_dz"]:
                knl = lp.add_prefetch(knl, "%s" % name)
            knl = lp.add_prefetch(knl, "DrDsDt")
            return knl

        self.volume_kernel = transform_vol(volume_kernel)

        # NOTE(review): operation-count estimate scaled by discr.K; the
        # derivation of the individual constants is not shown here -- confirm
        # before changing the kernel.
        self.volume_flops = discr.K * ((
            4  # num components
            * 3 * discr.ldis.Np**2 * 2) +
                                       ((3 * 2 - 1) * discr.ldis.Np * 6) + 2)

        # Memory-traffic estimate: item size of dtype times discr.K times a
        # per-k term made of the three labelled contributions below.
        self.volume_bytes = np.dtype(dtype).itemsize * discr.K * (
            (
                4  # num components
                * 2  # load, store
                * discr.ldis.Np) +
            # matrix
            3 * discr.ldis.Np**2 +
            # geometric factors
            6 * 4)

        # }}}

        # {{{ surface kernel

        # Product of ldis.Nfaces and ldis.Nfp (presumably the number of face
        # nodes per element -- confirm against the discretization class).
        NfpNfaces = ldis.Nfaces * ldis.Nfp

        # Domain: m and mp range over [0, NfpNfaces), n over [0, Np),
        # k over [0, K).  The flux temporaries are indexed by m and then
        # reduced over mp into the rhs* arrays, which are updated in place.
        surface_kernel = lp.make_kernel(
            "{[m,mp,n,k]: 0<= m,mp < NfpNfaces and 0<= n < Np and 0<= k < K }",
            """
                    <> idP = vmapP[k,m]
                    <> idM = vmapM[k,m]

                    <> du = u[[idP]]-u[[idM]]
                    <> dv = v[[idP]]-v[[idM]]
                    <> dw = w[[idP]]-w[[idM]]
                    <> dp = bc[k,m]*p[[idP]] - p[[idM]]

                    <> dQ = 0.5*Fscale[k,m]* \
                            (dp - nx[k,m]*du - ny[k,m]*dv - nz[k,m]*dw)

                    <> fluxu[m] = -nx[k,m]*dQ
                    <> fluxv[m] = -ny[k,m]*dQ
                    <> fluxw[m] = -nz[k,m]*dQ
                    <> fluxp[m] =          dQ

                    # reduction here
                    rhsu[k,n] = rhsu[k,n] + sum(mp, LIFT[n,mp]*fluxu[mp])
                    rhsv[k,n] = rhsv[k,n] + sum(mp, LIFT[n,mp]*fluxv[mp])
                    rhsw[k,n] = rhsw[k,n] + sum(mp, LIFT[n,mp]*fluxw[mp])
                    rhsp[k,n] = rhsp[k,n] + sum(mp, LIFT[n,mp]*fluxp[mp])
                    """, [
                lp.GlobalArg("u,v,w,p", dtype, shape="K, Np", order="C"),
                lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="F"),
                "...",
            ],
            name="dg_surface",
            assumptions="K>=1",
            defines=dict(Np=ldis.Np, Nfp=ldis.Nfp, NfpNfaces=NfpNfaces),
            options=dict(no_numpy=True, cl_build_options=build_options))

        def transform_surface_kernel(knl):
            """Tag inames, split the reduction iname and add prefetches."""
            #print knl
            # Both n and m are tagged "l.0"; k is tagged "g.0".
            knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
            # Split mp by 4; the inner part carries the "unr" tag.
            knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
            knl = lp.add_prefetch(knl, "LIFT")
            for name in ["nx", "ny", "nz", "Fscale", "bc"]:
                knl = lp.add_prefetch(knl, name)
            knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
            return knl

        self.surface_kernel = transform_surface_kernel(surface_kernel)

        # NOTE(review): operation-count estimate for the surface kernel; the
        # constant 15 is presumably the per-face-node flux cost -- confirm.
        self.surface_flops = (
            discr.K * (NfpNfaces * 15 + 4 * discr.ldis.Np * NfpNfaces * 2))