Example #1
def kernel(self, target: TargetBase, typed: bool = True) -> LoopKernel:
    """Build and return loop kernel."""
    domains = self.kernel_domains()
    body = '\n'.join(self.kernel_isns())
    data = self.kernel_data()
    knl = make_kernel(domains, body, data, target=target)
    knl = make_reduction_inames_unique(knl)
    knl.name = self.__class__.__name__
    if typed:
        dtypes = self.kernel_dtypes()
        knl = add_and_infer_dtypes(knl, dtypes)
    return knl
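
The helpers kernel_domains(), kernel_isns(), kernel_data() and kernel_dtypes() supply the pieces that make_kernel consumes. A minimal standalone sketch of the same pipeline, with a hypothetical domain and instruction standing in for those helpers:

import numpy as np
import loopy as lp

# Hypothetical stand-ins for kernel_domains()/kernel_isns()/kernel_dtypes():
knl = lp.make_kernel(
    "{[i, j]: 0<=i,j<n}",
    "out[i] = sum(j, a[i, j])",
    assumptions="n>=1")
knl = lp.make_reduction_inames_unique(knl)
knl = lp.add_and_infer_dtypes(knl, {"a": np.float64})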
Example #2
def test_double_sum_made_unique(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 20

    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        [
            "a = sum((i,j), i*j)",
            "b = sum(i, sum(j, i*j))",
        ],
        assumptions="n>=1")

    knl = lp.make_reduction_inames_unique(knl)
    print(knl)

    evt, (a, b) = knl(queue, n=n)

    ref = sum(i * j for i in range(n) for j in range(n))
    assert a.get() == ref
    assert b.get() == ref
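
As a follow-up (hedged: it assumes, as is the case here, that all dtypes are inferable from the integer inames), the renaming is visible in the generated device code, where the second sum no longer reuses the inames i and j:

# Continues the test above; lp.generate_code_v2 is loopy's public code
# generation entry point.
print(lp.generate_code_v2(knl).device_code())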
Example #3
def fixup_utoprim(params, pflag, G, P):
    sh = G.shapes
    s = G.slices

    if params['debug']:
        nbad_utop = np.sum(pflag.get()[s.bulk] != 0)
        print("Fixing {} bad cells".format(nbad_utop))

    # Make sure we are not using ill defined physical corner regions
    # TODO can this be forgotten?  U_to_P only updates the bulk, and bounds should not touch physical corners
    #zero_corners(params, G, pflag)

    sum = cl_array.zeros(params['queue'], sh.grid_primitives, dtype=np.float64)
    wsum = cl_array.zeros(params['queue'], sh.grid_scalar, dtype=np.float64)

    global knl_fixup_utoprim_sums, knl_fixup_utoprim_fix
    if knl_fixup_utoprim_sums is None:
        # TODO these should really be combined and the check on wsum inlined
        # That's gonna be a project
        code_sums = add_ghosts("""
        # TODO if statements here to speed up evaluation?
        w(l, m, n) := not(pflag[i+l,j+m,k+n]) / (abs(l) + abs(m) + abs(n) + 1)
        wsum[i, j, k] = reduce(sum, (l,m,n), w(l,m,n))
        sum[p, i, j, k] = reduce(sum, (l,m,n), w(l,m,n) * P[p, i+l, j+m, k+n])
        """)
        code_fixup = add_ghosts("""
        P[p, i, j, k] = if(pflag[i, j, k] == 0, P[p, i, j, k], sum[p, i, j, k] / wsum[i, j, k])
        """)
        knl_fixup_utoprim_sums = lp.make_kernel(sh.isl_grid_primitives_fixup, code_sums,
                                                [*primsArrayArgs("P", "sum"), *scalarArrayArgs("wsum"),
                                                 *scalarArrayArgs("pflag", dtype=np.int32)],
                                                assumptions=sh.assume_grid)
        knl_fixup_utoprim_sums = spec_prims_kernel(knl_fixup_utoprim_sums, sh.bulk_primitives, ng=G.NG)
        # Roll our own optimization here as this is the only convolution kernel we got
        knl_fixup_utoprim_sums = lp.split_iname(knl_fixup_utoprim_sums, "k", 8, outer_tag="g.0", inner_tag="l.0")
        knl_fixup_utoprim_sums = lp.split_iname(knl_fixup_utoprim_sums, "j", 8, outer_tag="g.1", inner_tag="l.1")
        knl_fixup_utoprim_sums = lp.split_iname(knl_fixup_utoprim_sums, "i", 8, outer_tag="g.2", inner_tag="l.2")
        knl_fixup_utoprim_sums = lp.make_reduction_inames_unique(knl_fixup_utoprim_sums)

        # TODO these are some feisty prefetches. Leaving them for later
        # knl_fixup_utoprim_sums = lp.tag_inames(knl_fixup_utoprim_sums, "p:unr")
        # knl_fixup_utoprim_sums = lp.add_prefetch(knl_fixup_utoprim_sums, "pflag", "i_inner,j_inner,k_inner",
        #                                          default_tag="l.auto")
        # knl_fixup_utoprim_sums = lp.add_prefetch(knl_fixup_utoprim_sums, "P", "i_inner,j_inner,k_inner,l,m,n",
        #                                          default_tag="l.auto")

        # TODO The prefetches on this are not working either, look at that
        knl_fixup_utoprim_fix = lp.make_kernel(sh.isl_grid_primitives, code_fixup,
                                                [*primsArrayArgs("P", "sum"), *scalarArrayArgs("wsum"),
                                                 *scalarArrayArgs("pflag", dtype=np.int32)],
                                               assumptions=sh.assume_grid)
        knl_fixup_utoprim_fix = tune_prims_kernel(knl_fixup_utoprim_fix, shape=sh.bulk_primitives, ng=G.NG)
        print("Compiled fixup_utoprim")

    evt, _ = knl_fixup_utoprim_sums(params['queue'], P=P, pflag=pflag, sum=sum, wsum=wsum)
    evt.wait()
    if params['debug']:
        if np.any(wsum.get()[s.bulk] < 1.e-10):
            # TODO don't die on this when we hit prod
            raise ValueError("fixup_utoprim found no usable neighbors!")
    evt, _ = knl_fixup_utoprim_fix(params['queue'], P=P, pflag=pflag, sum=sum, wsum=wsum)

    if params['debug']:
        # TODO count what we actually fixed; this placeholder is always zero,
        # so the warning below can never fire yet
        nleft_utop = nbad_utop - nbad_utop
        if nleft_utop > 0:
            print("Cells STILL BAD after fixup_utoprim: {}".format(nleft_utop))

    # Reset the pflag, because we tried our best and that's what counts
    # TODO necessary? See above about new copy
    #pflag.fill(0)

    return P
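
The optimization pattern in knl_fixup_utoprim_sums (block the spatial loops, tag the blocks for the GPU, then rename the shared reduction inames) can be isolated in a smaller sketch; the kernel below is a hypothetical stand-in for the convolution-style code_sums above, not the fixup kernel itself:

import numpy as np
import loopy as lp

# Hypothetical stand-in: two instructions whose reductions both run over
# (l, m), mirroring the wsum/sum pair in code_sums.
knl = lp.make_kernel(
    "{[i, j, l, m]: 0<=i,j<n and 0<=l,m<=2}",
    """
    wsum[i, j] = sum((l, m), w[i+l, j+m])
    out[i, j] = sum((l, m), w[i+l, j+m] * a[i+l, j+m])
    """,
    assumptions="n>=1")

# Same hand-rolled blocking as above, then make the reduction inames unique
# so the two sums no longer share (l, m).
knl = lp.split_iname(knl, "j", 8, outer_tag="g.0", inner_tag="l.0")
knl = lp.split_iname(knl, "i", 8, outer_tag="g.1", inner_tag="l.1")
knl = lp.make_reduction_inames_unique(knl)
knl = lp.add_and_infer_dtypes(knl, {"w": np.float64, "a": np.float64})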
Example #4
def generate_loopy(
    result: Union[Array, DictOfNamedArrays, Dict[str, Array]],
    target: Optional[LoopyTarget] = None,
    options: Optional[lp.Options] = None,
    *,
    cl_device: Optional["pyopencl.Device"] = None,
    array_tag_t_to_not_propagate: FrozenSet[Type[Tag]] = frozenset(
        [ImplStored, Named, PrefixNamed]),
    axis_tag_t_to_not_propagate: FrozenSet[Type[Tag]] = frozenset(),
) -> BoundProgram:
    r"""Code generation entry point.

    :param result: Outputs of the computation.
    :param target: Code generation target.
    :param options: Code generation options for the kernel.
    :returns: A :class:`pytato.target.BoundProgram` wrapping the generated
        :mod:`loopy` program.

    If *result* is a :class:`dict` or a :class:`pytato.DictOfNamedArrays` and
    *options* is not supplied, then the Loopy option
    :attr:`~loopy.Options.return_dict` will be set to *True*. If it is supplied,
    :attr:`~loopy.Options.return_dict` must already be set to *True*.

    .. note::

        :mod:`pytato` metadata :math:`\mapsto` :mod:`loopy` metadata semantics:

        - Inames that index over an :class:`~pytato.array.Array`'s axis in the
          allocation instruction are tagged with the corresponding
          :class:`~pytato.array.Axis`'s tags. The caller may choose to not
          propagate axis tags of type *axis_tag_t_to_not_propagate*.
        - :attr:`pytato.Array.tags` of inputs/outputs in *outputs*
          would be copied over to the tags of the corresponding
          :class:`loopy.ArrayArg`. The caller may choose to not
          propagate array tags of type *array_tag_t_to_not_propagate*.
        - Arrays tagged with :class:`pytato.tags.ImplStored` would have their
          tags copied over to the tags of corresponding
          :class:`loopy.TemporaryVariable`. The caller may choose to not
          propagate array tags of type *array_tag_t_to_not_propagate*.
    """

    result_is_dict = isinstance(result, (dict, DictOfNamedArrays))
    orig_outputs: DictOfNamedArrays = normalize_outputs(result)
    del result

    if target is None:
        target = LoopyPyOpenCLTarget(device=cl_device)
    else:
        if cl_device is not None:
            raise TypeError("may not pass both 'target' and 'cl_device'")

    preproc_result = preprocess(orig_outputs, target)
    outputs = preproc_result.outputs
    compute_order = preproc_result.compute_order

    if options is None:
        options = lp.Options(return_dict=result_is_dict)
    elif isinstance(options, dict):
        from warnings import warn
        warn(
            "Passing a dict for options is deprecated and will stop working in "
            "2022. Pass an actual loopy.Options object instead.",
            DeprecationWarning,
            stacklevel=2)
        options = lp.Options(**options)

    if options.return_dict != result_is_dict:
        raise ValueError("options.result_is_dict is expected to match "
                         "whether the returned value is a dictionary")

    state = get_initial_codegen_state(target, options)

    from pytato.transform import InputGatherer
    ing = InputGatherer()

    state.var_name_gen.add_names({
        input_expr.name
        for name in compute_order for input_expr in ing(outputs[name].expr)
        if isinstance(input_expr, (Placeholder, SizeParam, DataWrapper))
        if input_expr.name is not None
    })

    state.var_name_gen.add_names(outputs)

    cg_mapper = CodeGenMapper(array_tag_t_to_not_propagate,
                              axis_tag_t_to_not_propagate)

    # Generate code for outputs.
    for name in compute_order:
        expr = outputs[name].expr
        insn_id = add_store(name, expr, cg_mapper(expr, state), state,
                            cg_mapper)
        # replace "expr" with the created stored variable
        state.results[expr] = StoredResult(name, expr.ndim,
                                           frozenset([insn_id]))

    # Why call make_reduction_inames_unique?
    # Consider pt.generate_loopy(pt.sum(x) + pt.sum(x)): the generated program
    # would be a single instruction with rhs `_pt_subst() + _pt_subst()`.
    # Because the result of pt.sum(x) is cached, the same InlinedResult
    # instance is emitted for both invocations, so both inlined reductions
    # would share their reduction inames; renaming them here avoids that
    # collision.
    program = lp.make_reduction_inames_unique(state.program)

    return target.bind_program(program=program,
                               bound_arguments=preproc_result.bound_arguments)
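
The comment about reduction iname collisions near the end can be made concrete. A minimal sketch, assuming pytato's make_placeholder, sum and generate_loopy entry points (names as used in the comment above, not taken from this file):

import numpy as np
import pytato as pt

# pt.sum(x) appears twice; the code generator caches the mapped result, so
# without make_reduction_inames_unique both inlined reductions would end up
# sharing their reduction inames.
x = pt.make_placeholder(name="x", shape=(10,), dtype=np.float64)
prg = pt.generate_loopy(pt.sum(x) + pt.sum(x))  # a BoundProgram
print(prg.program)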