Example #1
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv  # reference version (pre-optimization) for auto_test_vs_ref below

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n"), rtmps, ("jj", "ii")),
                ("sknl", sflux_insn, ("i", "n"), stmps, ("ii", "jj")),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv  # use the kernel snapshot taken at the requested opt_level

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
Example #2
    def __call__(self, ary):
        from meshmode.dof_array import DOFArray
        if not isinstance(ary, DOFArray):
            raise TypeError("non-array passed to discretization connection")

        actx = ary.array_context

        @memoize_in(actx, (
            DirectDiscretizationConnection,
            "resample_by_mat_knl",
            self.is_surjective,
        ))
        def mat_knl():
            if self.is_surjective:
                domains = [
                    """
                        {[iel, idof, j]:
                        0<=iel<nelements and
                        0<=idof<n_to_nodes and
                        0<=j<n_from_nodes}
                        """,
                ]

                instructions = """
                result[to_element_indices[iel], idof] \
                        = sum(j, resample_mat[idof, j] \
                        * ary[from_element_indices[iel], j])
                        """
            else:
                domains = [
                    """
                        {[iel_init, idof_init]:
                        0<=iel_init<nelements_result and
                        0<=idof_init<n_to_nodes}
                        """,
                    """
                        {[iel, idof, j]:
                        0<=iel<nelements and
                        0<=idof<n_to_nodes and
                        0<=j<n_from_nodes}
                        """,
                ]

                instructions = """
                result[iel_init, idof_init] = 0 {id=init}
                ... gbarrier {id=barrier, dep=init}
                result[to_element_indices[iel], idof] \
                        = sum(j, resample_mat[idof, j] \
                        * ary[from_element_indices[iel], j]) {dep=barrier}
                        """
            knl = make_loopy_program(
                domains,
                instructions, [
                    lp.GlobalArg("result",
                                 None,
                                 shape="nelements_result, n_to_nodes",
                                 offset=lp.auto),
                    lp.GlobalArg("ary",
                                 None,
                                 shape="nelements_vec, n_from_nodes",
                                 offset=lp.auto),
                    lp.ValueArg("nelements_result", np.int32),
                    lp.ValueArg("nelements_vec", np.int32),
                    lp.ValueArg("n_from_nodes", np.int32),
                    "...",
                ],
                name="resample_by_mat")

            return knl

        @memoize_in(actx, (DirectDiscretizationConnection,
                           "resample_by_picking_knl", self.is_surjective))
        def pick_knl():
            if self.is_surjective:
                domains = [
                    """{[iel, idof]:
                        0<=iel<nelements and
                        0<=idof<n_to_nodes}"""
                ]
                instructions = """
                result[to_element_indices[iel], idof] \
                    = ary[from_element_indices[iel], pick_list[idof]]
                """
            else:
                domains = [
                    """
                        {[iel_init, idof_init]:
                        0<=iel_init<nelements_result and
                        0<=idof_init<n_to_nodes}
                        """, """
                        {[iel, idof]:
                        0<=iel<nelements and
                        0<=idof<n_to_nodes}
                        """
                ]
                instructions = """
                result[iel_init, idof_init] = 0 {id=init}
                ... gbarrier {id=barrier, dep=init}
                result[to_element_indices[iel], idof] \
                    = ary[from_element_indices[iel], pick_list[idof]] {dep=barrier}
                """
            knl = make_loopy_program(
                domains,
                instructions, [
                    lp.GlobalArg("result",
                                 None,
                                 shape="nelements_result, n_to_nodes",
                                 offset=lp.auto),
                    lp.GlobalArg("ary",
                                 None,
                                 shape="nelements_vec, n_from_nodes",
                                 offset=lp.auto),
                    lp.ValueArg("nelements_result", np.int32),
                    lp.ValueArg("nelements_vec", np.int32),
                    lp.ValueArg("n_from_nodes", np.int32),
                    "...",
                ],
                name="resample_by_picking")

            return knl

        if ary.shape != (len(self.from_discr.groups), ):
            raise ValueError("invalid shape of incoming resampling data")

        group_idx_to_result = []

        for i_tgrp, (tgrp,
                     cgrp) in enumerate(zip(self.to_discr.groups,
                                            self.groups)):

            kernels = []  # get kernels for each batch; to be fused eventually
            kwargs = {}  # kwargs to the fused kernel
            for i_batch, batch in enumerate(cgrp.batches):
                if batch.from_element_indices.size == 0:
                    continue

                point_pick_indices = self._resample_point_pick_indices(
                    actx, i_tgrp, i_batch)

                if point_pick_indices is None:
                    knl = mat_knl()
                    knl = lp.rename_argument(knl, "resample_mat",
                                             f"resample_mat_{i_batch}")
                    kwargs[f"resample_mat_{i_batch}"] = (self._resample_matrix(
                        actx, i_tgrp, i_batch))
                else:
                    knl = pick_knl()
                    knl = lp.rename_argument(knl, "pick_list",
                                             f"pick_list_{i_batch}")
                    kwargs[f"pick_list_{i_batch}"] = point_pick_indices

                # {{{ enforce different namespaces for the kernels

                for iname in knl.all_inames():
                    knl = lp.rename_iname(knl, iname, f"{iname}_{i_batch}")

                knl = lp.rename_argument(knl, "ary", f"ary_{i_batch}")
                knl = lp.rename_argument(knl, "from_element_indices",
                                         f"from_element_indices_{i_batch}")
                knl = lp.rename_argument(knl, "to_element_indices",
                                         f"to_element_indices_{i_batch}")
                knl = lp.rename_argument(knl, "nelements",
                                         f"nelements_{i_batch}")

                # }}}

                kwargs[f"ary_{i_batch}"] = ary[batch.from_group_index]
                kwargs[f"from_element_indices_{i_batch}"] = (
                    batch.from_element_indices)
                kwargs[f"to_element_indices_{i_batch}"] = (
                    batch.to_element_indices)

                kernels.append(knl)

            fused_knl = lp.fuse_kernels(kernels)
            # order of operations doesn't matter
            fused_knl = lp.add_nosync(fused_knl,
                                      "global",
                                      "writes:result",
                                      "writes:result",
                                      bidirectional=True,
                                      force=True)

            result_dict = actx.call_loopy(fused_knl,
                                          nelements_result=tgrp.nelements,
                                          n_to_nodes=tgrp.nunit_dofs,
                                          **kwargs)

            group_idx_to_result.append(result_dict["result"])

        return DOFArray.from_list(actx, group_idx_to_result)
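
A detail worth noting in the code above is the memoize_in decorator: the generated loopy programs are cached on the array context under a (class, name, is_surjective) key, so each kernel is built once per context rather than on every resampling call. Below is a stripped-down sketch of that caching idiom, using a throwaway container object in place of the array context; the names are made up for illustration.

from pytools import memoize_in
import loopy as lp

class FakeContext:
    """Placeholder standing in for the array context; memoize_in only
    needs an object it can attach its cache to."""

ctx_like = FakeContext()

@memoize_in(ctx_like, ("resample_demo", "double_knl"))
def double_knl():
    # Built on the first call only; later calls return the cached program.
    return lp.make_kernel(
        "{[i]: 0 <= i < n}",
        "out[i] = 2 * x[i]",
        name="double")

assert double_knl() is double_knl()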
Example #3
def test_write_block_matrix_fusion(ctx_factory):
    """
    A slightly more complicated fusion test, where all
    sub-kernels write into the same global matrix, but
    in well-defined separate blocks. This test makes sure
    the data-flow specification is preserved during fusion for
    matrix-assembly-like programs. (A minimal sketch of the
    data_flow mechanism follows this example.)
    """

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    def init_global_mat_prg():
        return lp.make_kernel(
            ["{[idof]: 0 <= idof < n}", "{[jdof]: 0 <= jdof < m}"],
            """
                result[idof, jdof]  = 0 {id=init}
            """,
            [
                lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
                lp.ValueArg("n, m", np.int32),
                "...",
            ],
            options=lp.Options(return_dict=True),
            default_offset=lp.auto,
            name="init_a_global_matrix",
        )

    def write_into_mat_prg():
        return lp.make_kernel(
            ["{[idof]: 0 <= idof < ndofs}", "{[jdof]: 0 <= jdof < mdofs}"],
            """
                result[offset_i + idof, offset_j + jdof] = mat[idof, jdof]
            """,
            [
                lp.GlobalArg("result", None, shape="n, m", offset=lp.auto),
                lp.ValueArg("n, m", np.int32),
                lp.GlobalArg("mat", None, shape="ndofs, mdofs",
                             offset=lp.auto),
                lp.ValueArg("offset_i", np.int32),
                lp.ValueArg("offset_j", np.int32),
                "...",
            ],
            options=lp.Options(return_dict=True),
            default_offset=lp.auto,
            name="write_into_global_matrix",
        )

    # Construct a 10x10 block-diagonal matrix with two
    # random 5x5 blocks on the diagonal and zeros elsewhere
    n = 10
    block_n = 5
    mat1 = np.random.randn(block_n, block_n)
    mat2 = np.random.randn(block_n, block_n)
    answer = np.block([[mat1, np.zeros((block_n, block_n))],
                       [np.zeros((block_n, block_n)), mat2]])
    kwargs = {"n": n, "m": n}

    # Do some renaming of individual programs before fusion
    kernels = [init_global_mat_prg()]
    for idx, (offset, mat) in enumerate([(0, mat1), (block_n, mat2)]):
        knl = lp.rename_argument(write_into_mat_prg(), "mat", f"mat_{idx}")
        kwargs[f"mat_{idx}"] = mat

        for iname in knl.default_entrypoint.all_inames():
            knl = lp.rename_iname(knl, iname, f"{iname}_{idx}")

        knl = lp.rename_argument(knl, "ndofs", f"ndofs_{idx}")
        knl = lp.rename_argument(knl, "mdofs", f"mdofs_{idx}")
        kwargs[f"ndofs_{idx}"] = block_n
        kwargs[f"mdofs_{idx}"] = block_n

        knl = lp.rename_argument(knl, "offset_i", f"offset_i_{idx}")
        knl = lp.rename_argument(knl, "offset_j", f"offset_j_{idx}")
        kwargs[f"offset_i_{idx}"] = offset
        kwargs[f"offset_j_{idx}"] = offset

        kernels.append(knl)

    fused_knl = lp.fuse_kernels(
        kernels,
        data_flow=[("result", 0, 1), ("result", 1, 2)],
    )
    fused_knl = lp.add_nosync(fused_knl,
                              "global",
                              "writes:result",
                              "writes:result",
                              bidirectional=True,
                              force=True)
    evt, result = fused_knl(queue, **kwargs)
    result = result["result"]
    np.testing.assert_allclose(result, answer)
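
As promised in the docstring, here is a minimal sketch of the data_flow mechanism used above: each (name, from_index, to_index) tuple tells fuse_kernels that data in the named array flows from kernel from_index to kernel to_index, so the fused program gets an ordering edge between the corresponding accesses instead of leaving them unordered. The toy kernels below are made up for illustration.

import loopy as lp

# "out" is zeroed by kernel 0 and then overwritten by kernel 1; the
# ("out", 0, 1) tuple records that ordering for the fuser.
knl_init = lp.make_kernel(
    "{[i]: 0 <= i < 10}",
    "out[i] = 0",
    name="init_out")
knl_fill = lp.make_kernel(
    "{[j]: 0 <= j < 10}",
    "out[j] = j + 100",
    name="fill_out")

fused = lp.fuse_kernels([knl_init, knl_fill], data_flow=[("out", 0, 1)])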
Example #4
def test_map_domain_transform_map_validity_and_errors(ctx_factory):

    # {{{ Make kernel

    knl = lp.make_kernel(
        [
            "[nx,nt] -> {[x, y, z, t]: 0 <= x,y,z < nx and 0 <= t < nt}",
            "[m] -> {[j]: 0 <= j < m}",
        ],
        """
        a[y,x,t,z] = b[y,x,t,z]  {id=stmta}
        for j
            <>temp = j  {dep=stmta}
        end
        """,
        lang_version=(2018, 2),
    )
    knl = lp.add_and_infer_dtypes(knl, {"b": np.float32})
    ref_knl = knl

    # }}}

    # {{{ Make sure map_domain *succeeds* when map includes 2 of 4 dims in one
    # domain.

    # {{{ Apply domain change mapping that splits t and renames y; (similar to
    # split_iname test above, but doesn't hurt to test this slightly different
    # scenario)

    knl_map_dom = ref_knl

    # Create map_domain mapping that only includes t and y
    # (x and z should be unaffected)
    import islpy as isl
    transform_map = isl.BasicMap(
        "[nx,nt] -> {[t, y] -> [t_outer, t_inner, y_new]: "
        "0 <= t_inner < 16 and "
        "16*t_outer + t_inner = t and "
        "0 <= 16*t_outer + t_inner < nt and "
        "y = y_new"
        "}")

    # Call map_domain to transform kernel; this should *not* produce an error
    knl_map_dom = lp.map_domain(knl_map_dom, transform_map)

    # Prioritize loops
    desired_prio = "x, t_outer, t_inner, z, y_new"

    # Use constrain_loop_nesting if it's available
    cln_attr = getattr(lp, "constrain_loop_nesting", None)
    if cln_attr is not None:
        knl_map_dom = lp.constrain_loop_nesting(  # noqa pylint:disable=no-member
            knl_map_dom, desired_prio)
    else:
        knl_map_dom = lp.prioritize_loops(knl_map_dom, desired_prio)

    # Get a linearization
    proc_knl_map_dom = lp.preprocess_kernel(knl_map_dom)
    lin_knl_map_dom = lp.get_one_linearized_kernel(
        proc_knl_map_dom["loopy_kernel"], proc_knl_map_dom.callables_table)

    # }}}

    # {{{ Use split_iname and rename_iname, and make sure we get the same result

    knl_split_iname = ref_knl
    knl_split_iname = lp.split_iname(knl_split_iname, "t", 16)
    knl_split_iname = lp.rename_iname(knl_split_iname, "y", "y_new")
    try:
        # Use constrain_loop_nesting if it's available
        knl_split_iname = lp.constrain_loop_nesting(knl_split_iname,
                                                    desired_prio)
    except AttributeError:
        knl_split_iname = lp.prioritize_loops(knl_split_iname, desired_prio)
    proc_knl_split_iname = lp.preprocess_kernel(knl_split_iname)
    lin_knl_split_iname = lp.get_one_linearized_kernel(
        proc_knl_split_iname["loopy_kernel"],
        proc_knl_split_iname.callables_table)

    for d_map_domain, d_split_iname in zip(
            knl_map_dom["loopy_kernel"].domains,
            knl_split_iname["loopy_kernel"].domains):
        d_map_domain_aligned = _ensure_dim_names_match_and_align(
            d_map_domain, d_split_iname)
        assert d_map_domain_aligned == d_split_iname

    for litem_map_domain, litem_split_iname in zip(
            lin_knl_map_dom.linearization, lin_knl_split_iname.linearization):
        assert litem_map_domain == litem_split_iname

    # Can't easily compare instructions because equivalent subscript
    # expressions may have different orders

    lp.auto_test_vs_ref(proc_knl_split_iname,
                        ctx_factory(),
                        proc_knl_map_dom,
                        parameters={
                            "nx": 32,
                            "nt": 32,
                            "m": 32
                        })

    # }}}

    # }}}

    # {{{ Make sure we error on a map that is not bijective

    # Not bijective
    transform_map = isl.BasicMap(
        "[nx,nt] -> {[t, y, rogue] -> [t_new, y_new]: "
        "y = y_new and t = t_new"
        "}")

    from loopy.diagnostic import LoopyError
    knl = ref_knl
    try:
        knl = lp.map_domain(knl, transform_map)
        raise AssertionError()
    except LoopyError as err:
        assert "map must be bijective" in str(err)

    # }}}

    # {{{ Make sure there's an error if transform map does not apply to
    # exactly one domain.

    test_maps = [
        # Map where some inames match exactly one domain but there's also a
        # rogue dim
        isl.BasicMap("[nx,nt] -> {[t, y, rogue] -> [t_new, y_new, rogue_new]: "
                     "y = y_new and t = t_new and rogue = rogue_new"
                     "}"),
        # Map where all inames match exactly one domain but there's also a
        # rogue dim
        isl.BasicMap("[nx,nt] -> {[t, y, x, z, rogue] -> "
                     "[t_new, y_new, x_new, z_new, rogue_new]: "
                     "y = y_new and t = t_new and x = x_new and z = z_new "
                     "and rogue = rogue_new"
                     "}"),
        # Map where no inames match any domain
        isl.BasicMap("[nx,nt] -> {[rogue] -> [rogue_new]: "
                     "rogue = rogue_new"
                     "}"),
    ]

    for transform_map in test_maps:
        try:
            knl = lp.map_domain(knl, transform_map)
            raise AssertionError()
        except LoopyError as err:
            assert ("was not applicable to any domain. "
                    "Transform map must be applicable to exactly one domain."
                    in str(err))

    # }}}

    # {{{ Make sure there's an error if we try to map inames in priorities

    knl = ref_knl
    knl = lp.prioritize_loops(knl, "y, z")
    knl = lp.prioritize_loops(knl, "x, z")
    try:
        transform_map = isl.BasicMap("[nx,nt] -> {[t, y] -> [t_new, y_new]: "
                                     "y = y_new and t = t_new }")
        knl = lp.map_domain(knl, transform_map)
        raise AssertionError()
    except ValueError as err:
        assert ("Loop priority ('y', 'z') contains iname(s) "
                "transformed by map" in str(err))

    # }}}

    # {{{ Make sure we error when stmt.within_inames contains at least one but
    # not all mapped inames

    # {{{ Make potentially problematic kernel

    knl = lp.make_kernel(
        [
            "[n, m] -> { [i, j]: 0 <= i < n and 0 <= j < m }",
            "[ell] -> { [k]: 0 <= k < ell }",
        ],
        """
        for i
            <>t0 = i  {id=stmt0}
            for j
                <>t1 = j  {id=stmt1, dep=stmt0}
            end
            <>t2 = i + 1  {id=stmt2, dep=stmt1}
        end
        for k
           <>t3 = k  {id=stmt3, dep=stmt2}
        end
        """,
        lang_version=(2018, 2),
    )

    # }}}

    # This should fail:
    try:
        transform_map = isl.BasicMap("[n, m] -> {[i, j] -> [i_new, j_new]: "
                                     "i_new = i + j and j_new = 2 + i }")
        knl = lp.map_domain(knl, transform_map)
        raise AssertionError()
    except LoopyError as err:
        assert ("Statements must be within all or none of the mapped inames"
                in str(err))

    # This should succeed:
    transform_map = isl.BasicMap("[n, m] -> {[i] -> [i_new]: i_new = i + 2 }")
    knl = lp.map_domain(knl, transform_map)
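
One stylistic note on the negative tests above: the try/raise AssertionError()/except pattern can be written more idiomatically with pytest.raises when the suite runs under pytest, which also checks the error message via match. A hedged sketch of the bijectivity check in that form, reusing ref_knl and the non-bijective map from above:

import islpy as isl
import pytest
import loopy as lp
from loopy.diagnostic import LoopyError

def check_non_bijective_map_rejected(ref_knl):
    # Same non-bijective map as above; map_domain is expected to reject it.
    non_bijective = isl.BasicMap(
        "[nx,nt] -> {[t, y, rogue] -> [t_new, y_new]: "
        "y = y_new and t = t_new}")
    with pytest.raises(LoopyError, match="map must be bijective"):
        lp.map_domain(ref_knl, non_bijective)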