示例#1
0
def add_dependency(kernel, insn_match, dependency):
    """Make every instruction selected by *insn_match* depend on *dependency*.

    :arg insn_match: any instruction id match understood by
        :func:`loopy.match.parse_match`.
    :arg dependency: the ID of an instruction that exists in *kernel*.
    :returns: the result of mapping the matched instructions of *kernel*
        to copies whose ``depends_on`` additionally contains *dependency*.
    :raises LoopyError: if *dependency* is not an instruction ID of *kernel*.
    """

    if dependency not in kernel.id_to_insn:
        raise LoopyError(
            "cannot add dependency on non-existent instruction ID '%s'" %
            dependency)

    extra_dep = frozenset([dependency])

    def with_extra_dep(insn):
        # An instruction without any recorded dependencies carries None.
        existing = insn.depends_on
        merged = extra_dep if existing is None else existing | extra_dep
        return insn.copy(depends_on=merged)

    return map_instructions(kernel, insn_match, with_extra_dep)
示例#2
0
文件: loop.py 项目: shwina/loopy
def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for the loop entered at *sched_index* by unrolling it.

    The loop body (the schedule starting at ``sched_index + 1``) is built once
    per iteration, each time with the loop's iname fixed to
    ``lower_bound + i``, and the per-iteration results are merged.

    :arg codegen_state: the current code generation state; its ``kernel``
        supplies the schedule and the iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the schedule item
        whose ``iname`` is to be unrolled.
    :returns: the merged code generation result of all unrolled iterations.
    :raises LoopyError: if the static maximum of the loop length is not a
        constant.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname has to be interpolated here; otherwise the user sees a
        # literal '%s' in the message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, but keep the original
        # message and chain the cause so the underlying failure is visible.
        raise type(e)(
                "while finding lower bound of '%s': %s" % (iname, e)) from e

    result = [
            build_loop_nest(
                codegen_state.fix(iname, lower_bound_aff + i),
                sched_index+1)
            for i in range(length)]

    return merge_codegen_results(codegen_state, result)
示例#3
0
文件: check.py 项目: shwina/loopy
def _is_racing_iname_tag(tv, tag):
    """Return whether the iname tag *tag* counts as racing for the temporary
    variable *tv*.

    Only instances of ``ConcurrentTag`` can be racing. Which concurrent tags
    are exempt depends on ``tv.scope``:

    * ``PRIVATE``: local-index and group-index tags are exempt.
    * ``LOCAL``: group-index tags are exempt.
    * ``GLOBAL``: no tag is exempt.

    :raises LoopyError: if ``tv.scope`` is still ``auto``.
    :raises ValueError: if ``tv.scope`` has any other value.
    """
    from loopy.kernel.data import (temp_var_scope,
            LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto)

    if tv.scope == temp_var_scope.PRIVATE:
        return (
                isinstance(tag, ConcurrentTag)
                and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag)))

    elif tv.scope == temp_var_scope.LOCAL:
        return (
                isinstance(tag, ConcurrentTag)
                and not isinstance(tag, GroupIndexTag))

    elif tv.scope == temp_var_scope.GLOBAL:
        return isinstance(tag, ConcurrentTag)

    elif tv.scope == auto:
        # Note the trailing space: the two literals are concatenated.
        raise LoopyError("scope of temp var '%s' has not yet been "
                "determined" % tv.name)

    else:
        raise ValueError("unexpected value of temp_var.scope for "
                "temporary variable '%s'" % tv.name)
    def map_call(self, expr, return_tuple=False):
        """Infer the result dtype(s) of the function call *expr*.

        :arg return_tuple: if true, the single list entry returned is the
            whole ``result_dtypes`` of the mangled function; otherwise it is
            its one and only result dtype.
        :returns: a list with one entry, or an empty list if the dtype of
            at least one call parameter could not be determined.
        :raises LoopyError: if *return_tuple* is false and the function does
            not have exactly one result.
        :raises RuntimeError: if the function cannot be resolved for the
            given argument dtypes.
        """
        from pymbolic.primitives import Variable

        identifier = expr.function
        if isinstance(identifier, Variable):
            identifier = identifier.name

        if identifier in ["indexof", "indexof_vec"]:
            return [self.kernel.index_dtype]

        # Each recursion yields either an empty result (dtype unknown) or a
        # result holding exactly one dtype.
        collected = []
        for par in expr.parameters:
            par_result = self.rec(par)
            if par_result:
                par_dtype, = par_result
            else:
                par_dtype = None
            collected.append(par_dtype)

        arg_dtypes = tuple(collected)
        if None in arg_dtypes:
            return []

        mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)

        if mangle_result is not None:
            if return_tuple:
                return [mangle_result.result_dtypes]

            if len(mangle_result.result_dtypes) != 1:
                raise LoopyError("functions with more or fewer than one "
                        "return value may only be used in direct assignments")

            return [mangle_result.result_dtypes[0]]

        raise RuntimeError("unable to resolve "
                "function '%s' with %d given arguments"
                % (identifier, len(arg_dtypes)))
示例#5
0
    def map_reduction(self, expr, return_tuple=False):
        """Infer the result dtype(s) of the reduction *expr*.

        The dtype of the reduced expression is obtained by recursion and
        handed to ``expr.operation.result_dtypes`` (as *None* if the
        recursion produced nothing).

        :arg return_tuple: if true, the single list entry returned is the
            complete result of ``result_dtypes``; otherwise it is its one
            and only element.
        :returns: a list with one entry, or an empty list if
            ``result_dtypes`` returned *None*.
        :raises LoopyError: if *return_tuple* is false and the reduction does
            not have exactly one result.
        """
        inner = self.rec(expr.expr)

        if inner:
            arg_dtype, = inner
        else:
            arg_dtype = None

        result = expr.operation.result_dtypes(self.kernel, arg_dtype,
                                              expr.inames)

        if result is None:
            return []

        if return_tuple:
            return [result]

        if len(result) != 1:
            raise LoopyError(
                "reductions with more or fewer than one "
                "return value may only be used in direct assignments")

        return [result[0]]
示例#6
0
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    """Create the arguments for a kernel run from *impl_arg_info*.

    Value arguments are taken from *parameters* (and cast to the argument's
    dtype if needed). Array, image and constant arguments are allocated on
    *queue* and filled via ``fill_rand``; a copy of the filled storage is
    kept as the "pre-run" state.

    :arg kernel: the kernel; used to look up argument dtypes and the set of
        written variables.
    :arg impl_arg_info: iterable of implemented-argument descriptions.
    :arg queue: the command queue on which arrays are allocated.
    :arg parameters: mapping from value-argument name to value; also used
        to evaluate shapes and strides.
    :returns: a tuple ``(ref_args, ref_arg_data)``. *ref_args* maps argument
        names to values, arrays or images. *ref_arg_data* is a list holding
        *None* for each value argument that is not an offset and a
        ``TestArgInfo`` for each array, image or constant argument.
    :raises LoopyError: if an array has no fully known shape or no known
        dtype, if an image is written to, or if the argument class is not
        understood.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            # NOTE(review): offset arguments are skipped without appending to
            # ref_arg_data, while make_args zips impl_arg_info with
            # ref_arg_data -- confirm the two stay aligned.
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            # Plain Python scalars have no dtype attribute.
            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \
                or arg.arg_class is ConstantArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError(
                    "array '%s' needs known shape to use automatic "
                    "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                storage_array = ary = cl_array.empty(queue,
                                                     shape,
                                                     dtype,
                                                     order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                # Element count of the flat backing buffer: offset of the
                # last element plus one. Zero-stride axes are counted as if
                # they had stride 1.
                alloc_size = sum(astrd * (alen - 1) if astrd != 0 else alen - 1
                                 for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                                     "known. Perhaps you want to use "
                                     "loopy.add_dtypes "
                                     "or loopy.infer_argument_dtypes?" %
                                     arg.name)

                # Strides are in elements; the strided views below take
                # them in bytes.
                itemsize = dtype.itemsize
                numpy_strides = [itemsize * s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                                 "automatic testing")

            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                    queue.context, ary.get())
            else:
                # Snapshot of the filled storage, taken before any kernel
                # gets to modify it.
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(pre_run_storage_array, shape,
                                                  numpy_strides)
                ref_args[arg.name] = ary

            ref_arg_data.append(
                TestArgInfo(name=arg.name,
                            ref_array=ary,
                            ref_storage_array=storage_array,
                            ref_pre_run_array=pre_run_ary,
                            ref_pre_run_storage_array=pre_run_storage_array,
                            ref_shape=shape,
                            ref_strides=strides,
                            ref_alloc_size=alloc_size,
                            ref_numpy_strides=numpy_strides,
                            needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            # NOTE(review): nothing is appended to ref_arg_data here either;
            # see the alignment note on offset arguments above.
            pass

        else:
            raise LoopyError("arg type %s not understood" % type(arg))

    return ref_args, ref_arg_data
示例#7
0
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    """Create the arguments for a kernel run, reusing the data recorded in
    *ref_arg_data*.

    Value arguments are taken from *parameters* (and cast to the argument's
    dtype if needed). Images are rebuilt from the recorded pre-run array.
    Array and constant arguments get a new device array with this kernel's
    shape and strides whose contents are copied from the recorded pre-run
    storage.

    :arg kernel: the kernel; used to look up argument dtypes and the set of
        written variables.
    :arg impl_arg_info: iterable of implemented-argument descriptions.
    :arg queue: the command queue on which arrays are allocated.
    :arg ref_arg_data: per-argument records, iterated in lockstep with
        *impl_arg_info*. For array and constant arguments the record is
        updated in place with the ``test_*`` attributes.
    :arg parameters: mapping from value-argument name to value; also used
        to evaluate shapes and strides.
    :returns: a dict mapping argument names to values, arrays or images.
    :raises NotImplementedError: if an image argument is written to.
    :raises LoopyError: if the argument class is not understood.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            # Plain Python scalars have no dtype attribute.
            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                                          "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is ArrayArg or\
                arg.arg_class is ConstantArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            # Strides are in elements; the strided views below take them
            # in bytes.
            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize * s for s in strides]

            # Element count of the flat backing buffer: offset of the last
            # element plus one. Zero-stride axes are counted as if they had
            # stride 1.
            alloc_size = sum(astrd * (alen - 1) if astrd != 0 else alen - 1
                             for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                                        arg_desc.ref_shape,
                                        arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            # The two layouts may hold different numbers of elements; copy
            # only what fits in both.
            common_len = min(len(host_ref_flat_array),
                             len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
示例#8
0
文件: check.py 项目: yueyedeai/loopy
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    """Check that each non-boostable instruction in a stretch of the schedule
    uses exactly the available group and local hardware axes.

    With *sched_index* left at *None*, the whole schedule is walked with no
    hardware axes available, and every ``CallKernel`` item encountered is
    checked by a recursive call. With *sched_index* given, it must point at a
    ``CallKernel`` item; the available axes are then taken from the grid
    sizes of that block, and the items up to the matching
    ``ReturnFromKernel`` are checked.

    :returns: the schedule index one past the end of the checked stretch.
    :raises LoopyError: if an instruction's axes differ from the available
        ones, or if an auto local tag is encountered.
    :raises TypeError: if a schedule item is of an unexpected type.
    """
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is not None:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = {ax for ax, _ in enumerate(group_size)}
        local_axes = {ax for ax, _ in enumerate(local_size)}

        pos = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        # Stop in front of the ReturnFromKernel item.
        stop = past_end_i - 1
    else:
        group_axes = set()
        local_axes = set()

        pos = 0
        stop = past_end_i = len(kernel.schedule)

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
                        GroupIndexTag)

    def axes_to_str(axes):
        return ",".join(str(ax) for ax in axes)

    while pos < stop:
        sched_item = kernel.schedule[pos]

        if isinstance(sched_item, CallKernel):
            # The recursive call reports where its block ends.
            pos = _check_for_unused_hw_axes_in_kernel_chunk(kernel, pos)
            continue

        if isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            pos += 1

            if insn.boostable:
                continue

            group_axes_used = set()
            local_axes_used = set()

            for iname in kernel.insn_inames(insn):
                ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
                gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
                altags = kernel.iname_tags_of_type(
                        iname, AutoLocalIndexTagBase, max_num=1)

                if ltags:
                    ltag, = ltags
                    local_axes_used.add(ltag.axis)
                elif gtags:
                    gtag, = gtags
                    group_axes_used.add(gtag.axis)
                elif altags:
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                raise LoopyError("instruction '%s' does not use all group hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            axes_to_str(group_axes),
                            axes_to_str(group_axes_used)))
            if local_axes != local_axes_used:
                raise LoopyError("instruction '%s' does not use all local hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            axes_to_str(local_axes),
                            axes_to_str(local_axes_used)))
            continue

        if isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            pos += 1
            continue

        raise TypeError(
                "schedule item not understood: %s" % type(sched_item).__name__)

    return past_end_i
示例#9
0
文件: check.py 项目: yueyedeai/loopy
def check_loop_priority_inames_known(kernel):
    """Verify that every iname named in ``kernel.loop_priority`` is one of
    ``kernel.all_inames()``.

    :raises LoopyError: on the first iname that is not known to *kernel*.
    """
    # Loop-invariant: look the set of inames up once, not once per iname.
    known_inames = kernel.all_inames()

    for prio in kernel.loop_priority:
        for iname in prio:
            if iname not in known_inames:
                raise LoopyError("unknown iname '%s' in loop priorities" % iname)
示例#10
0
    def __init__(self,
                 assignees,
                 expression,
                 id=None,
                 depends_on=None,
                 depends_on_is_final=None,
                 groups=None,
                 conflicts_with_groups=None,
                 no_sync_with=None,
                 within_inames_is_final=None,
                 within_inames=None,
                 boostable=None,
                 boostable_into=None,
                 tags=None,
                 temp_var_types=None,
                 priority=0,
                 predicates=frozenset(),
                 insn_deps=None,
                 insn_deps_is_final=None,
                 forced_iname_deps=None,
                 forced_iname_deps_is_final=None):
        """Initialize the instruction.

        :arg assignees: a tuple of lvalues (variables, subscripts or linear
            subscripts), or a string that parses to such a tuple.
        :arg expression: a function call or reduction, a string that parses
            to one, or *None*.
        :arg temp_var_types: one entry per assignee; defaults to a tuple of
            *None* of matching length.

        All remaining arguments are forwarded unchanged to the base class
        constructor.

        :raises LoopyError: if *expression* is neither *None* nor a call or
            reduction, if *assignees* is not a tuple, or if an assignee is
            not a valid lvalue.
        """

        super(CallInstruction, self).__init__(
            id=id,
            depends_on=depends_on,
            depends_on_is_final=depends_on_is_final,
            groups=groups,
            conflicts_with_groups=conflicts_with_groups,
            no_sync_with=no_sync_with,
            within_inames_is_final=within_inames_is_final,
            within_inames=within_inames,
            boostable=boostable,
            boostable_into=boostable_into,
            priority=priority,
            predicates=predicates,
            tags=tags,
            insn_deps=insn_deps,
            insn_deps_is_final=insn_deps_is_final,
            forced_iname_deps=forced_iname_deps,
            forced_iname_deps_is_final=forced_iname_deps_is_final)

        # Parse string inputs before validating them. Validating first would
        # reject every string expression, since a str is never a Call or
        # Reduction.
        from loopy.symbolic import parse
        if isinstance(assignees, str):
            assignees = parse(assignees)
        if isinstance(expression, str):
            expression = parse(expression)

        from pymbolic.primitives import Call
        from loopy.symbolic import Reduction
        if not isinstance(expression,
                          (Call, Reduction)) and expression is not None:
            raise LoopyError("'expression' argument to CallInstruction "
                             "must be a function call")

        if not isinstance(assignees, tuple):
            raise LoopyError("'assignees' argument to CallInstruction "
                             "must be a tuple or a string parseable to a tuple"
                             "--got '%s'" % type(assignees).__name__)

        from pymbolic.primitives import Variable, Subscript
        from loopy.symbolic import LinearSubscript
        for assignee in assignees:
            if not isinstance(assignee,
                              (Variable, Subscript, LinearSubscript)):
                raise LoopyError("invalid lvalue '%s'" % assignee)

        self.assignees = assignees
        self.expression = expression

        if temp_var_types is None:
            self.temp_var_types = (None, ) * len(self.assignees)
        else:
            self.temp_var_types = temp_var_types
示例#11
0
def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
    """Pull factors shared by all increments of *var_name* out of the
    incrementing instructions and into the places where *var_name* is read.

    Every assignment to *var_name* has its right-hand side viewed as a sum of
    products; the summand equal to the left-hand side is ignored. Factors
    that occur in every remaining summand of every assignment with a
    unifiable index key are removed from those assignments and multiplied
    onto each use of *var_name* in the other assignments.

    :arg kernel: a :class:`LoopKernel`. Substitution rules are expanded
        first.
    :arg var_name: name of a temporary variable or argument of *kernel*.
    :arg vary_by_axes: the (0-based) axis indices, or axis names, of
        *var_name* whose subscripts form the index key by which common
        factors are tracked separately. May also be given as a
        comma-separated string.
    :returns: a copy of *kernel* with rewritten instructions.
    :raises NameError: if *var_name* is neither a temporary nor an argument.
    :raises LoopyError: if an axis name is unknown or an axis index is out
        of bounds, if *var_name* is written by something other than an
        assignment, if a right-hand side depends on *var_name* other than
        through the left-hand side term, or if no common factors exist.
    """
    assert isinstance(kernel, LoopKernel)
    # FIXME: Does not understand subst rules for now
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    if var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    elif var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    else:
        raise NameError("array '%s' was not found" % var_name)

    # {{{ check/normalize vary_by_axes

    if isinstance(vary_by_axes, str):
        vary_by_axes = vary_by_axes.split(",")

    from loopy.kernel.array import ArrayBase
    if isinstance(var_descr, ArrayBase):
        if var_descr.dim_names is not None:
            name_to_index = {
                name: idx
                for idx, name in enumerate(var_descr.dim_names)
            }
        else:
            name_to_index = {}

        def map_ax_name_to_index(ax):
            if isinstance(ax, str):
                try:
                    return name_to_index[ax]
                except KeyError:
                    raise LoopyError("axis name '%s' not understood " % ax)
            else:
                return ax

        vary_by_axes = [map_ax_name_to_index(ax) for ax in vary_by_axes]

        # Axis indices are 0-based (they index into the subscript tuple
        # below), so num_user_axes() itself is already out of bounds.
        if (vary_by_axes
                and (min(vary_by_axes) < 0
                     or max(vary_by_axes) >= var_descr.num_user_axes())):
            raise LoopyError("vary_by_axes refers to out-of-bounds axis index")

    # }}}

    from pymbolic.mapper.substitutor import make_subst_func
    from pymbolic.primitives import (Sum, Product, is_zero, flattened_sum,
                                     flattened_product, Subscript, Variable)
    from loopy.symbolic import (get_dependencies, SubstitutionMapper,
                                UnidirectionalUnifier)

    # {{{ common factor key list maintenance

    # list of (index_key, common factors found)
    common_factors = []

    def find_unifiable_cf_index(index_key):
        """Return ``(position, unification)`` of the first entry of
        *common_factors* whose key unifies with *index_key*, or
        ``(None, None)`` if there is none."""
        for i, (key, _val) in enumerate(common_factors):
            unif = UnidirectionalUnifier(
                lhs_mapping_candidates=get_dependencies(key))

            unif_result = unif(key, index_key)

            if unif_result:
                assert len(unif_result) == 1
                return i, unif_result[0]

        return None, None

    def extract_index_key(access_expr):
        """Return the subscripts of *access_expr* along *vary_by_axes*."""
        if isinstance(access_expr, Variable):
            return ()

        elif isinstance(access_expr, Subscript):
            index = access_expr.index_tuple
            return tuple(index[ax] for ax in vary_by_axes)
        else:
            raise ValueError("unexpected type of access_expr")

    def is_assignee(insn):
        return var_name in insn.assignee_var_names()

    def iterate_as(cls, expr):
        """Yield the children of *expr* if it is a *cls*, else *expr*."""
        if isinstance(expr, cls):
            yield from expr.children
        else:
            yield expr

    # }}}

    # {{{ find common factors

    from loopy.kernel.data import Assignment

    for insn in kernel.instructions:
        if not is_assignee(insn):
            continue

        if not isinstance(insn, Assignment):
            raise LoopyError("'%s' modified by non-single-assignment" %
                             var_name)

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # {{{ doesn't exist yet

            assert unif_result is None

            my_common_factors = None

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                         "in RHS of instruction '%s'" %
                                         (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                if my_common_factors is None:
                    my_common_factors = product_parts
                else:
                    my_common_factors = my_common_factors & product_parts

            if my_common_factors is not None:
                common_factors.append((index_key, my_common_factors))

            # }}}
        else:
            # {{{ match, filter existing common factors

            _, my_common_factors = common_factors[cf_index]

            unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                         "in RHS of instruction '%s'" %
                                         (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                my_common_factors = {
                    cf
                    for cf in my_common_factors
                    if unif_subst_map(cf) in product_parts
                }

            common_factors[cf_index] = (index_key, my_common_factors)

            # }}}

    # }}}

    # Drop index keys for which nothing is shared.
    common_factors = [(ik, cf) for ik, cf in common_factors if cf]

    if not common_factors:
        raise LoopyError("no common factors found")

    # {{{ remove common factors

    new_insns = []

    for insn in kernel.instructions:
        if not isinstance(insn, Assignment) or not is_assignee(insn):
            new_insns.append(insn)
            continue

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            new_insns.append(insn)
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            new_insns.append(insn)
            continue

        _, my_common_factors = common_factors[cf_index]

        unif_subst_map = SubstitutionMapper(make_subst_func(unif_result.lmap))

        mapped_my_common_factors = {
            unif_subst_map(cf)
            for cf in my_common_factors
        }

        new_sum_terms = []

        for term in iterate_as(Sum, rhs):
            if term == lhs:
                new_sum_terms.append(term)
                continue

            new_sum_terms.append(
                flattened_product([
                    part for part in iterate_as(Product, term)
                    if part not in mapped_my_common_factors
                ]))

        new_insns.append(insn.copy(expression=flattened_sum(new_sum_terms)))

    # }}}

    # {{{ substitute common factors into usage sites

    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # No common factors were removed for this index key (its
            # increments were left untouched above), so the use must stay
            # untouched as well.
            return expr

        unif_subst_map = SubstitutionMapper(make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        return flattened_product(
            [unif_subst_map(cf) for cf in my_common_factors] + [expr])

    insns = new_insns
    new_insns = []

    subm = SubstitutionMapper(find_substitution)

    for insn in insns:
        if not isinstance(insn, Assignment) or is_assignee(insn):
            new_insns.append(insn)
            continue

        new_insns.append(insn.with_transformed_expressions(subm))

    # }}}

    return kernel.copy(instructions=new_insns)
示例#12
0
        def gen_decls(name_suffix, shape, strides, unvec_shape, unvec_strides,
                      stride_arg_axes, dtype, user_index):
            """Recursively yield :class:`ImplementedDataInfo` records for
            this array.

            Each level of recursion handles the user axis numbered
            ``len(user_index)`` according to its dim tag. Once all user axes
            are handled (or their number is unknown), the record for the
            array itself is yielded, followed by one for its offset (if the
            offset is ``lp.auto``) and then by those for its stride
            arguments.

            :arg name_suffix: string appended to ``self.name`` to form the
                name of the implemented array; grows by ``_s<i>`` for each
                separate-array axis.
            :arg shape: shape tuple accumulated so far; extended only by
                fixed-stride axes.
            :arg strides: strides tuple accumulated so far; extended only by
                fixed-stride axes.
            :arg unvec_shape: shape tuple
                that accounts for :class:`loopy.kernel.array.VectorArrayDimTag`
                in a scalar manner
            :arg unvec_strides: strides tuple
                that accounts for :class:`loopy.kernel.array.VectorArrayDimTag`
                in a scalar manner
            :arg stride_arg_axes: a tuple of triples
                *(user_axis, impl_axis, unvec_impl_axis)*, one for each axis
                whose stride becomes a separate argument
            :arg dtype: the element type, or *None* to use ``self.dtype``;
                replaced by a vector dtype when a vector axis is handled.
            :arg user_index: A tuple representing a (user-facing)
                multi-dimensional subscript. This is filled in with
                concrete integers when known (such as for separate-array
                dim tags), and with *None* where the index won't be
                known until run time.
            """

            if dtype is None:
                dtype = self.dtype

            # The axis to be handled at this level of the recursion.
            user_axis = len(user_index)

            num_user_axes = self.num_user_axes(require_answer=False)

            if num_user_axes is None or user_axis >= num_user_axes:
                # {{{ recursion base case

                full_name = self.name + name_suffix

                stride_args = []
                # Mutable copies, so that the placeholder entries can be
                # replaced by the stride variables below.
                strides = list(strides)
                unvec_strides = list(unvec_strides)

                # generate stride arguments, yielded later to keep array first
                for stride_user_axis, stride_impl_axis, stride_unvec_impl_axis \
                        in stride_arg_axes:
                    stride_name = full_name + "_stride%d" % stride_user_axis

                    from pymbolic import var
                    strides[stride_impl_axis] = \
                            unvec_strides[stride_unvec_impl_axis] = \
                            var(stride_name)

                    stride_args.append(
                        ImplementedDataInfo(
                            target=target,
                            name=stride_name,
                            dtype=index_dtype,
                            arg_class=ValueArg,
                            stride_for_name_and_axis=(full_name,
                                                      stride_impl_axis),
                            is_written=False))

                yield ImplementedDataInfo(target=target,
                                          name=full_name,
                                          base_name=self.name,
                                          arg_class=type(self),
                                          dtype=dtype,
                                          shape=shape,
                                          strides=tuple(strides),
                                          unvec_shape=unvec_shape,
                                          unvec_strides=tuple(unvec_strides),
                                          allows_offset=bool(self.offset),
                                          is_written=is_written)

                import loopy as lp

                if self.offset is lp.auto:
                    offset_name = full_name + "_offset"
                    yield ImplementedDataInfo(target=target,
                                              name=offset_name,
                                              dtype=index_dtype,
                                              arg_class=ValueArg,
                                              offset_for_name=full_name,
                                              is_written=False)

                yield from stride_args

                # }}}

                return

            dim_tag = self.dim_tags[user_axis]

            if isinstance(dim_tag, FixedStrideArrayDimTag):
                if array_shape is None:
                    new_shape_axis = None
                else:
                    new_shape_axis = array_shape[user_axis]

                import loopy as lp
                if dim_tag.stride is lp.auto:
                    # Record at which positions of strides/unvec_strides the
                    # entry appended below will land.
                    new_stride_arg_axes = stride_arg_axes \
                            + ((user_axis, len(strides), len(unvec_strides)),)

                    # repaired above when final array name is known
                    # (and stride argument is created)
                    new_stride_axis = None
                else:
                    new_stride_arg_axes = stride_arg_axes
                    new_stride_axis = dim_tag.stride

                yield from gen_decls(name_suffix, shape + (new_shape_axis, ),
                                     strides + (new_stride_axis, ),
                                     unvec_shape + (new_shape_axis, ),
                                     unvec_strides + (new_stride_axis, ),
                                     new_stride_arg_axes, dtype,
                                     user_index + (None, ))

            elif isinstance(dim_tag, SeparateArrayArrayDimTag):
                shape_i = array_shape[user_axis]
                if not is_integer(shape_i):
                    raise LoopyError("shape of '%s' has non-constant "
                                     "integer axis %d (0-based)" %
                                     (self.name, user_axis))

                # One set of declarations per index along this axis; shapes
                # and strides are passed on unchanged.
                for i in range(shape_i):
                    yield from gen_decls(name_suffix + "_s%d" % i, shape,
                                         strides, unvec_shape, unvec_strides,
                                         stride_arg_axes, dtype,
                                         user_index + (i, ))

            elif isinstance(dim_tag, VectorArrayDimTag):
                shape_i = array_shape[user_axis]
                if not is_integer(shape_i):
                    raise LoopyError("shape of '%s' has non-constant "
                                     "integer axis %d (0-based)" %
                                     (self.name, user_axis))

                # The vector axis is folded into the dtype and only shows up
                # in the unvec_* tuples.
                yield from gen_decls(
                    name_suffix,
                    shape,
                    strides,
                    unvec_shape + (shape_i, ),
                    # vectors always have stride 1
                    unvec_strides + (1, ),
                    stride_arg_axes,
                    target.vector_dtype(dtype, shape_i),
                    user_index + (None, ))

            else:
                raise LoopyError(
                    "unsupported array dim implementation tag '%s' "
                    "in array '%s'" % (dim_tag, self.name))
示例#13
0
文件: data.py 项目: mmmika/loopy
    def __init__(self, name, dtype=None, shape=(), address_space=None,
            dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
            base_indices=None, storage_shape=None,
            base_storage=None, initializer=None, read_only=False,
            _base_storage_access_may_be_aliasing=False, **kwargs):
        """Validate and normalize the arguments, then pass them on to
        :meth:`ArrayBase.__init__`.

        :arg name: name of the temporary variable; used in error messages
            and interned before being passed on
        :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`. If *None*
            or :class:`loopy.auto` and an *initializer* is given, the dtype
            of the initializer is used.
        :arg shape: :class:`loopy.auto` or a shape tuple. If
            :class:`loopy.auto` and an *initializer* is given, the shape of
            the initializer is used.
        :arg address_space: if *None*, replaced by :class:`loopy.auto`
        :arg order: if *None*, replaced by ``"C"``
        :arg base_indices: :class:`loopy.auto` or a tuple of base indices.
            If *None*, defaults to one zero per entry of *shape*. If *shape*
            is :class:`loopy.auto`, the number of axes is unknown and *None*
            is passed on unchanged.
        :arg initializer: *None* or a :class:`numpy.ndarray`. Requires
            ``offset == 0`` and ``read_only=True``, and may not be combined
            with *base_storage*.
        :arg scope: deprecated keyword spelling of *address_space*; only
            one of the two may be given

        :raises ValueError: if both *scope* and *address_space* are given
        :raises LoopyError: if the arguments are otherwise inconsistent
        """

        # {{{ unify address_space / (deprecated) scope

        scope = kwargs.pop("scope", None)
        if scope is not None:
            warn("Passing 'scope' is deprecated. Use 'address_space' instead.",
                    DeprecationWarning, stacklevel=2)

            if address_space is not None:
                raise ValueError("only one of 'scope' and 'address_space' "
                        "may be specified")
            else:
                address_space = scope

        del scope

        # }}}

        # An unspecified address space means "let loopy decide". (A former
        # follow-up check for address_space being None could never trigger
        # after this replacement and has been dropped.)
        if address_space is None:
            address_space = auto

        # {{{ reconcile dtype/shape with the initializer

        if initializer is None:
            pass
        elif isinstance(initializer, np.ndarray):
            if offset != 0:
                raise LoopyError(
                        "temporary variable '%s': "
                        "offset must be 0 if initializer specified"
                        % name)

            from loopy.types import NumpyType, to_loopy_type
            if dtype is auto or dtype is None:
                dtype = NumpyType(initializer.dtype)
            elif to_loopy_type(dtype) != to_loopy_type(initializer.dtype):
                raise LoopyError(
                        "temporary variable '%s': "
                        "dtype of initializer does not match "
                        "dtype of array."
                        % name)

            if shape is auto:
                shape = initializer.shape

        else:
            raise LoopyError(
                    "temporary variable '%s': "
                    "initializer must be None or a numpy array"
                    % name)

        # }}}

        if order is None:
            order = "C"

        # With shape still being 'auto', the number of axes is not known
        # yet, and len(shape) would raise a TypeError. Only build the
        # all-zeros default once an actual shape tuple is available.
        if base_indices is None and shape is not auto:
            base_indices = (0,) * len(shape)

        # {{{ consistency checks

        if not read_only and initializer is not None:
            raise LoopyError(
                    "temporary variable '%s': "
                    "read-write variables with initializer "
                    "are not currently supported "
                    "(did you mean to set read_only=True?)"
                    % name)

        if base_storage is not None and initializer is not None:
            raise LoopyError(
                    "temporary variable '%s': "
                    "base_storage and initializer are "
                    "mutually exclusive"
                    % name)

        if base_storage is None and _base_storage_access_may_be_aliasing:
            raise LoopyError(
                    "temporary variable '%s': "
                    "_base_storage_access_may_be_aliasing option, but no "
                    "base_storage given!"
                    % name)

        # }}}

        ArrayBase.__init__(self, name=intern(name),
                dtype=dtype, shape=shape, strides=strides,
                dim_tags=dim_tags, offset=offset, dim_names=dim_names,
                order=order,
                base_indices=base_indices,
                address_space=address_space,
                storage_shape=storage_shape,
                base_storage=base_storage,
                initializer=initializer,
                read_only=read_only,
                _base_storage_access_may_be_aliasing=(
                    _base_storage_access_may_be_aliasing),
                **kwargs)
示例#14
0
 def map_local_hw_index(self, expr, enclosing_prec, type_context):
     """Reject local hardware axis indices, which this mapper cannot emit.

     :raises LoopyError: always
     """
     # The message previously named "group" axes although this method
     # handles *local* hardware indices.
     raise LoopyError("plain C does not have local hw axes")
示例#15
0
    def map_call(self, expr, enclosing_prec, type_context):
        """Generate code for the function call *expr*.

        The built-ins ``indexof`` and ``indexof_vec`` are resolved directly
        to index expressions taken from the access information of their
        (single, subscript) argument. Every other function is looked up
        through ``self.kernel.mangle_function``, recorded in
        ``self.codegen_state.seen_functions`` and emitted as the string
        ``target_name(arg, ...)``.

        :raises LoopyError: on misuse of ``indexof``/``indexof_vec`` or if
            the mangled function does not have exactly one result
        :raises RuntimeError: if no mangler recognizes the function
        """
        from pymbolic.mapper.stringifier import PREC_NONE
        from pymbolic.primitives import Subscript, Variable

        identifier = expr.function

        # {{{ implement indexof, indexof_vec

        if identifier.name in ("indexof", "indexof_vec"):
            func_name = identifier.name

            if len(expr.parameters) != 1:
                raise LoopyError("%s takes exactly one argument" % func_name)

            (subscript,) = expr.parameters
            if not isinstance(subscript, Subscript):
                raise LoopyError(
                    "argument to %s must be a subscript" % func_name)

            ary = self.find_array(subscript)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            def eval_with_var_subst(index_expr):
                return evaluate(index_expr, self.codegen_state.var_subst_map)

            access_info = get_access_info(
                self.kernel.target, ary, subscript.index,
                eval_with_var_subst,
                self.codegen_state.vectorization_info)

            from loopy.kernel.data import ImageArg
            if isinstance(ary, ImageArg):
                raise LoopyError("%s does not support images" % func_name)

            if func_name == "indexof":
                return access_info.subscripts[0]

            if func_name != "indexof_vec":
                raise RuntimeError("should not get here")

            from loopy.kernel.array import VectorArrayDimTag

            # If several axes carry a vector tag, the last one is used.
            vec_axes = [
                iaxis for iaxis, dim_tag in enumerate(ary.dim_tags)
                if isinstance(dim_tag, VectorArrayDimTag)
            ]

            if not vec_axes:
                return access_info.subscripts[0]

            return (access_info.subscripts[0] * ary.shape[vec_axes[-1]]
                    + access_info.vector_index)

        # }}}

        if isinstance(identifier, Variable):
            identifier = identifier.name

        par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

        mangle_result = self.kernel.mangle_function(
            identifier, par_dtypes,
            ast_builder=self.codegen_state.ast_builder)

        if mangle_result is None:
            raise RuntimeError(
                "function '%s' unknown--"
                "maybe you need to register a function mangler?" % identifier)

        if len(mangle_result.result_dtypes) != 1:
            raise LoopyError(
                "functions with more or fewer than one return value "
                "may not be used in an expression")

        str_parameters = []

        if mangle_result.arg_dtypes is not None:
            # The mangler knows the C signature: generate each argument in
            # the type context of its declared parameter type.
            for par, tgt_dtype in zip(expr.parameters,
                                      mangle_result.arg_dtypes):
                tgt_context = dtype_to_type_context(
                    self.kernel.target, tgt_dtype)
                str_parameters.append(
                    self.rec(par, PREC_NONE, tgt_context, tgt_dtype))

        else:
            # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
            # propagate the type context here. But for many others, it does
            # not. Using the inferred type as a stopgap for now.
            for par, par_dtype in zip(expr.parameters, par_dtypes):
                par_context = dtype_to_type_context(
                    self.kernel.target, par_dtype)
                str_parameters.append(
                    self.rec(par, PREC_NONE, type_context=par_context))

            from warnings import warn
            warn(
                "Calling function '%s' with unknown C signature--"
                "return CallMangleInfo.arg_dtypes" % identifier, LoopyWarning)

        from loopy.codegen import SeenFunction
        seen_function = SeenFunction(
            identifier, mangle_result.target_name,
            mangle_result.arg_dtypes or par_dtypes)
        self.codegen_state.seen_functions.add(seen_function)

        return "%s(%s)" % (mangle_result.target_name,
                           ", ".join(str_parameters))
示例#16
0
def precompute(
        kernel,
        subst_use,
        sweep_inames=[],
        within=None,
        storage_axes=None,
        temporary_name=None,
        precompute_inames=None,
        precompute_outer_inames=None,
        storage_axis_to_tag={},

        # "None" is a valid value here, distinct from the default.
        default_tag=_not_provided,
        dtype=None,
        fetch_bounding_box=False,
        temporary_address_space=None,
        compute_insn_id=None,
        **kwargs):
    """Precompute the expression described in the substitution rule determined by
    *subst_use* and store it in a temporary array. A precomputation needs two
    things to operate, a list of *sweep_inames* (order irrelevant) and an
    ordered list of *storage_axes* (whose order will describe the axis ordering
    of the temporary array).

    :arg subst_use: Describes what to prefetch.

        The following objects may be given for *subst_use*:

        * The name of the substitution rule.

        * The tagged name ("name$tag") of the substitution rule.

        * A list of invocations of the substitution rule.
          This list of invocations, when swept across *sweep_inames*, then serves
          to define the footprint of the precomputation.

          Invocations may be tagged ("name$tag") to filter out a subset of the
          usage sites of the substitution rule. (Namely those usage sites that
          use the same tagged name.)

          Invocations may be given as a string or as a
          :class:`pymbolic.primitives.Expression` object.

          If only one invocation is to be given, then the only entry of the list
          may be given directly.

    If the list of invocations generating the footprint is not given,
    all (tag-matching, if desired) usage sites of the substitution rule
    are used to determine the footprint.

    The following cases can arise for each sweep axis:

    * The axis is an iname that occurs within arguments specified at
      usage sites of the substitution rule. This case is assumed covered
      by the storage axes provided for the argument.

    * The axis is an iname that occurs within the *value* of the rule, but not
      within its arguments. A new, dedicated storage axis is allocated for
      such an axis.

    :arg sweep_inames: A :class:`list` of inames to be swept.
        May also equivalently be a comma-separated string.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg storage_axes: A :class:`list` of inames and/or rule argument
        names/indices to be used as storage axes.
        May also equivalently be a comma-separated string.
    :arg temporary_name:
        The temporary variable name to use for storing the precomputed data.
        If it does not exist, it will be created. If it does exist, its properties
        (such as size, type) are checked (and updated, if possible) to match
        its use.
    :arg precompute_inames:
        A tuple of inames to be used to carry out the precomputation.
        If the specified inames do not already exist, they will be
        created. If they do already exist, their loop domain is verified
        against the one required for this precomputation. This tuple may
        be shorter than the (provided or automatically found) *storage_axes*
        tuple, in which case names will be automatically created.
        May also equivalently be a comma-separated string.

    :arg precompute_outer_inames: A :class:`frozenset` of inames within which
        the compute instruction is nested. If *None*, make an educated guess.
        May also be specified as a comma-separated string.

    :arg default_tag: The :ref:`iname tag <iname-tags>` to be applied to the
        inames created to perform the precomputation. The current default will
        make them local axes and automatically split them to fit the work
        group size, but this default will disappear in favor of simply leaving them
        untagged in 2019. For 2018, a warning will be issued if no *default_tag* is
        specified.

    :arg compute_insn_id: The ID of the instruction generated to perform the
        precomputation.

    If `storage_axes` is not specified, it defaults to the arrangement
    `<direct sweep axes><arguments>` with the direct sweep axes being the
    slower-varying indices.

    Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
    eliminated.
    """

    # {{{ unify temporary_address_space / temporary_scope

    temporary_scope = kwargs.pop("temporary_scope", None)

    from loopy.kernel.data import AddressSpace
    if temporary_scope is not None:
        from warnings import warn
        warn(
            "temporary_scope is deprecated. Use temporary_address_space instead",
            DeprecationWarning,
            stacklevel=2)

        if temporary_address_space is not None:
            raise LoopyError(
                "may not specify both temporary_address_space and "
                "temporary_scope")

        temporary_address_space = temporary_scope

    del temporary_scope

    # }}}

    if kwargs:
        raise TypeError("unrecognized keyword arguments: %s" %
                        ", ".join(kwargs.keys()))

    # {{{ check, standardize arguments

    if isinstance(sweep_inames, str):
        sweep_inames = [iname.strip() for iname in sweep_inames.split(",")]

    for iname in sweep_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname" % iname)

    sweep_inames = list(sweep_inames)
    sweep_inames_set = frozenset(sweep_inames)

    if isinstance(storage_axes, str):
        storage_axes = [ax.strip() for ax in storage_axes.split(",")]

    if isinstance(precompute_inames, str):
        precompute_inames = [
            iname.strip() for iname in precompute_inames.split(",")
        ]

    if isinstance(precompute_outer_inames, str):
        precompute_outer_inames = frozenset(
            iname.strip() for iname in precompute_outer_inames.split(","))

    if isinstance(subst_use, str):
        subst_use = [subst_use]

    footprint_generators = None

    subst_name = None
    subst_tag = None

    from pymbolic.primitives import Variable, Call
    from loopy.symbolic import parse, TaggedVariable

    for use in subst_use:
        if isinstance(use, str):
            use = parse(use)

        if isinstance(use, Call):
            if footprint_generators is None:
                footprint_generators = []

            footprint_generators.append(use)
            subst_name_as_expr = use.function
        else:
            subst_name_as_expr = use

        if isinstance(subst_name_as_expr, TaggedVariable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = subst_name_as_expr.tag
        elif isinstance(subst_name_as_expr, Variable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = None
        else:
            raise ValueError("unexpected type of subst_name")

        if (subst_name, subst_tag) == (None, None):
            subst_name, subst_tag = new_subst_name, new_subst_tag
        else:
            if (subst_name, subst_tag) != (new_subst_name, new_subst_tag):
                raise ValueError("not all uses in subst_use agree "
                                 "on rule name and tag")

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    try:
        subst = kernel.substitutions[subst_name]
    except KeyError:
        raise LoopyError("substitution rule '%s' not found" % subst_name)

    c_subst_name = subst_name.replace(".", "_")

    # {{{ handle default_tag

    from loopy.transform.data import _not_provided \
            as transform_data_not_provided

    if default_tag is _not_provided or default_tag is transform_data_not_provided:
        # no need to warn for scalar precomputes
        if sweep_inames:
            from warnings import warn
            warn(
                "Not specifying default_tag is deprecated, and default_tag "
                "will become mandatory in 2019.x. "
                "Pass 'default_tag=\"l.auto\" to match the current default, "
                "or Pass 'default_tag=None to leave the loops untagged, which "
                "is the recommended behavior.",
                DeprecationWarning,
                stacklevel=(

                    # In this case, we came here through add_prefetch. Increase
                    # the stacklevel.
                    3 if default_tag is transform_data_not_provided else 2))

        default_tag = "l.auto"

    from loopy.kernel.data import parse_tag
    default_tag = parse_tag(default_tag)

    # }}}

    # }}}

    # {{{ process invocations in footprint generators, start access_descriptors

    if footprint_generators:
        from pymbolic.primitives import Variable, Call

        access_descriptors = []

        for fpg in footprint_generators:
            if isinstance(fpg, Variable):
                args = ()
            elif isinstance(fpg, Call):
                args = fpg.parameters
            else:
                raise ValueError("footprint generator must "
                                 "be substitution rule invocation")

            access_descriptors.append(
                RuleAccessDescriptor(identifier=access_descriptor_id(
                    args, None),
                                     args=args))

    # }}}

    # {{{ gather up invocations in kernel code, finish access_descriptors

    if not footprint_generators:
        rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
        invg = RuleInvocationGatherer(rule_mapping_context, kernel, subst_name,
                                      subst_tag, within)
        del rule_mapping_context

        import loopy as lp
        for insn in kernel.instructions:
            if isinstance(insn, lp.MultiAssignmentBase):
                for assignee in insn.assignees:
                    invg(assignee, kernel, insn)
                invg(insn.expression, kernel, insn)

        access_descriptors = invg.access_descriptors
        if not access_descriptors:
            raise RuntimeError("no invocations of '%s' found" % subst_name)

    # }}}

    # {{{ find inames used in arguments

    expanding_usage_arg_deps = set()

    for accdesc in access_descriptors:
        for arg in accdesc.args:
            expanding_usage_arg_deps.update(
                get_dependencies(arg) & kernel.all_inames())

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    # {{{ use given / find new storage_axes

    # extra axes made necessary because they don't occur in the arguments
    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)

    from loopy.symbolic import SubstitutionRuleExpander
    submap = SubstitutionRuleExpander(kernel.substitutions)

    value_inames = (get_dependencies(submap(subst.expression)) -
                    frozenset(subst.arguments)) & kernel.all_inames()
    if value_inames - expanding_usage_arg_deps < extra_storage_axes:
        raise RuntimeError("unreferenced sweep inames specified: " +
                           ", ".join(extra_storage_axes - value_inames -
                                     expanding_usage_arg_deps))

    new_iname_to_tag = {}

    if storage_axes is None:
        storage_axes = []

        # Add sweep_inames (in given--rather than arbitrary--order) to
        # storage_axes *if* they are part of extra_storage_axes.
        for iname in sweep_inames:
            if iname in extra_storage_axes:
                extra_storage_axes.remove(iname)
                storage_axes.append(iname)

        if extra_storage_axes:
            if (precompute_inames is not None
                    and len(storage_axes) < len(precompute_inames)):
                raise LoopyError(
                    "must specify a sufficient number of "
                    "storage_axes to uniquely determine the meaning "
                    "of the given precompute_inames. (%d storage_axes "
                    "needed)" % len(precompute_inames))
            storage_axes.extend(sorted(extra_storage_axes))

        storage_axes.extend(range(len(subst.arguments)))

    del extra_storage_axes

    prior_storage_axis_name_dict = {}

    storage_axis_names = []
    storage_axis_sources = []  # number for arg#, or iname

    # {{{ check for pre-existing precompute_inames

    if precompute_inames is not None:
        preexisting_precompute_inames = (set(precompute_inames)
                                         & kernel.all_inames())
    else:
        preexisting_precompute_inames = set()

    # }}}

    for i, saxis in enumerate(storage_axes):
        tag_lookup_saxis = saxis

        if saxis in subst.arguments:
            saxis = subst.arguments.index(saxis)

        storage_axis_sources.append(saxis)

        if isinstance(saxis, int):
            # argument index
            name = old_name = subst.arguments[saxis]
        else:
            old_name = saxis
            name = "%s_%s" % (c_subst_name, old_name)

        if (precompute_inames is not None and i < len(precompute_inames)
                and precompute_inames[i]):
            name = precompute_inames[i]
            tag_lookup_saxis = name
            if (name not in preexisting_precompute_inames
                    and var_name_gen.is_name_conflicting(name)):
                raise RuntimeError("new storage axis name '%s' "
                                   "conflicts with existing name" % name)
        else:
            name = var_name_gen(name)

        storage_axis_names.append(name)
        if name not in preexisting_precompute_inames:
            new_iname_to_tag[name] = storage_axis_to_tag.get(
                tag_lookup_saxis, default_tag)

        prior_storage_axis_name_dict[name] = old_name

    del storage_axis_to_tag
    del storage_axes
    del precompute_inames

    # }}}

    # {{{ fill out access_descriptors[...].storage_axis_exprs

    access_descriptors = [
        accdesc.copy(storage_axis_exprs=storage_axis_exprs(
            storage_axis_sources, accdesc.args))
        for accdesc in access_descriptors
    ]

    # }}}

    expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
    assert expanding_inames <= kernel.all_inames()

    if storage_axis_names:
        # {{{ find domain to be changed

        change_inames = expanding_inames | preexisting_precompute_inames

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, change_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in sweep_inames_set:
                if kernel.get_home_domain_index(
                        iname) != domch.leaf_domain_index:
                    raise RuntimeError(
                        "sweep iname '%s' is not 'at home' in the "
                        "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
                               access_descriptors, len(storage_axis_names))

        non1_storage_axis_names = []
        for i, saxis in enumerate(storage_axis_names):
            if abm.non1_storage_axis_flags[i]:
                non1_storage_axis_names.append(saxis)
            else:
                del new_iname_to_tag[saxis]

                if saxis in preexisting_precompute_inames:
                    raise LoopyError(
                        "precompute axis %d (1-based) was "
                        "eliminated as "
                        "having length 1 but also mapped to existing "
                        "iname '%s'" % (i + 1, saxis))

        mod_domain = domch.domain

        # {{{ modify the domain, taking into account preexisting inames

        # inames may already exist in mod_domain, add them primed to start
        primed_non1_saxis_names = [
            iname + "'" for iname in non1_storage_axis_names
        ]

        mod_domain = abm.augment_domain_with_sweep(
            domch.domain,
            primed_non1_saxis_names,
            boxify_sweep=fetch_bounding_box)

        check_domain = mod_domain

        for i, saxis in enumerate(non1_storage_axis_names):
            var_dict = mod_domain.get_var_dict(isl.dim_type.set)

            if saxis in preexisting_precompute_inames:
                # add equality constraint between existing and new variable

                dt, dim_idx = var_dict[saxis]
                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                  dim_idx)

                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                    dim_idx)

                mod_domain = mod_domain.add_constraint(
                    isl.Constraint.equality_from_aff(new_var_aff - saxis_aff))

                # project out the new one
                mod_domain = mod_domain.project_out(dt, dim_idx, 1)

            else:
                # remove the prime from the new variable
                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

        def add_assumptions(d):
            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
            assumptions, domain = isl.align_two(assumption_non_param, d)
            return assumptions & domain

        # {{{ check that we got the desired domain

        check_domain = add_assumptions(
            check_domain.project_out_except(primed_non1_saxis_names,
                                            [isl.dim_type.set]))

        mod_check_domain = add_assumptions(mod_domain)

        # re-add the prime from the new variable
        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

        for saxis in non1_storage_axis_names:
            dt, dim_idx = var_dict[saxis]
            mod_check_domain = mod_check_domain.set_dim_name(
                dt, dim_idx, saxis + "'")

        mod_check_domain = mod_check_domain.project_out_except(
            primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("domain of preexisting inames does not match "
                             "domain needed for precompute")

        # }}}

        # {{{ check that we didn't shrink the original domain

        # project out the new names from the modified domain
        orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
        mod_check_domain = add_assumptions(
            mod_domain.project_out_except(orig_domain_inames,
                                          [isl.dim_type.set]))

        check_domain = add_assumptions(domch.domain)

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError(
                "original domain got shrunk by applying the precompute")

        # }}}

        # }}}

        new_kernel_domains = domch.get_domains_with(mod_domain)

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        non1_storage_axis_names = []
        abm = NoOpArrayToBufferMap()

    kernel = kernel.copy(domains=new_kernel_domains)

    # {{{ set up compute insn

    if temporary_name is None:
        temporary_name = var_name_gen(based_on=c_subst_name)

    assignee = var(temporary_name)

    if non1_storage_axis_names:
        assignee = assignee[tuple(
            var(iname) for iname in non1_storage_axis_names)]

    # {{{ process substitutions on compute instruction

    storage_axis_subst_dict = {}

    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
        if arg_name in non1_storage_axis_names:
            arg = var(arg_name)
        else:
            arg = 0

        storage_axis_subst_dict[prior_storage_axis_name_dict.get(
            arg_name, arg_name)] = arg + bi

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())

    from loopy.match import parse_stack_match
    expr_subst_map = RuleAwareSubstitutionMapper(
        rule_mapping_context,
        make_subst_func(storage_axis_subst_dict),
        within=parse_stack_match(None))

    compute_expression = expr_subst_map(subst.expression, kernel, None)

    # }}}

    from loopy.kernel.data import Assignment
    if compute_insn_id is None:
        compute_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name)

    compute_insn = Assignment(
        id=compute_insn_id,
        assignee=assignee,
        expression=compute_expression,
        # within_inames determined below
    )
    compute_dep_id = compute_insn_id
    added_compute_insns = [compute_insn]

    if temporary_address_space == AddressSpace.GLOBAL:
        barrier_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name + "_barrier")
        from loopy.kernel.instruction import BarrierInstruction
        barrier_insn = BarrierInstruction(id=barrier_insn_id,
                                          depends_on=frozenset(
                                              [compute_insn_id]),
                                          synchronization_kind="global",
                                          mem_kind="global")
        compute_dep_id = barrier_insn_id

        added_compute_insns.append(barrier_insn)

    # }}}

    # {{{ substitute rule into expressions in kernel (if within footprint)

    from loopy.symbolic import SubstitutionRuleExpander
    expander = SubstitutionRuleExpander(kernel.substitutions)

    invr = RuleInvocationReplacer(rule_mapping_context,
                                  subst_name,
                                  subst_tag,
                                  within,
                                  access_descriptors,
                                  abm,
                                  storage_axis_names,
                                  storage_axis_sources,
                                  non1_storage_axis_names,
                                  temporary_name,
                                  compute_insn_id,
                                  compute_dep_id,
                                  compute_read_variables=get_dependencies(
                                      expander(compute_expression)))

    kernel = invr.map_kernel(kernel)
    kernel = kernel.copy(instructions=added_compute_insns +
                         kernel.instructions)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # }}}

    # {{{ add dependencies to compute insn

    kernel = kernel.copy(instructions=[
        insn.copy(depends_on=frozenset(invr.compute_insn_depends_on)) if insn.
        id == compute_insn_id else insn for insn in kernel.instructions
    ])

    # }}}

    # {{{ propagate storage iname subst to dependencies of compute instructions

    from loopy.kernel.tools import find_recursive_dependencies
    compute_deps = find_recursive_dependencies(kernel,
                                               frozenset([compute_insn_id]))

    # FIXME: Need to verify that there are no outside dependencies
    # on compute_deps

    prior_storage_axis_names = frozenset(storage_axis_subst_dict)

    new_insns = []
    for insn in kernel.instructions:
        if (insn.id in compute_deps
                and insn.within_inames & prior_storage_axis_names):
            insn = (insn.with_transformed_expressions(
                lambda expr: expr_subst_map(expr, kernel, insn)).copy(
                    within_inames=frozenset(
                        storage_axis_subst_dict.get(iname, var(iname)).name
                        for iname in insn.within_inames)))

            new_insns.append(insn)
        else:
            new_insns.append(insn)

    kernel = kernel.copy(instructions=new_insns)

    # }}}

    # {{{ determine inames for compute insn

    if precompute_outer_inames is None:
        from loopy.kernel.tools import guess_iname_deps_based_on_var_use
        precompute_outer_inames = (
            frozenset(non1_storage_axis_names)
            | frozenset((expanding_usage_arg_deps | value_inames) -
                        sweep_inames_set)
            | guess_iname_deps_based_on_var_use(kernel, compute_insn))
    else:
        if not isinstance(precompute_outer_inames, frozenset):
            raise TypeError("precompute_outer_inames must be a frozenset")

        precompute_outer_inames = precompute_outer_inames \
                | frozenset(non1_storage_axis_names)

    kernel = kernel.copy(instructions=[
        insn.copy(within_inames=precompute_outer_inames) if insn.id ==
        compute_insn_id else insn for insn in kernel.instructions
    ])

    # }}}

    # {{{ set up temp variable

    import loopy as lp
    if dtype is not None:
        dtype = np.dtype(dtype)

    if temporary_address_space is None:
        temporary_address_space = lp.auto

    new_temp_shape = tuple(abm.non1_storage_shape)

    new_temporary_variables = kernel.temporary_variables.copy()
    if temporary_name not in new_temporary_variables:
        temp_var = lp.TemporaryVariable(
            name=temporary_name,
            dtype=dtype,
            base_indices=(0, ) * len(new_temp_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_address_space,
            dim_names=tuple(non1_storage_axis_names))

    else:
        temp_var = new_temporary_variables[temporary_name]

        # {{{ check and adapt existing temporary

        if temp_var.dtype is lp.auto:
            pass
        elif temp_var.dtype is not lp.auto and dtype is lp.auto:
            dtype = temp_var.dtype
        elif temp_var.dtype is not lp.auto and dtype is not lp.auto:
            if temp_var.dtype != dtype:
                raise LoopyError("Existing and new dtype of temporary '%s' "
                                 "do not match (existing: %s, new: %s)" %
                                 (temporary_name, temp_var.dtype, dtype))

        temp_var = temp_var.copy(dtype=dtype)

        if len(temp_var.shape) != len(new_temp_shape):
            raise LoopyError(
                "Existing and new temporary '%s' do not "
                "have matching number of dimensions ('%d' vs. '%d') " %
                (temporary_name, len(temp_var.shape), len(new_temp_shape)))

        if temp_var.base_indices != (0, ) * len(new_temp_shape):
            raise LoopyError(
                "Existing and new temporary '%s' do not "
                "have matching number of dimensions ('%d' vs. '%d') " %
                (temporary_name, len(temp_var.shape), len(new_temp_shape)))

        new_temp_shape = tuple(
            max(i, ex_i) for i, ex_i in zip(new_temp_shape, temp_var.shape))

        temp_var = temp_var.copy(shape=new_temp_shape)

        if temporary_address_space == temp_var.address_space:
            pass
        elif temporary_address_space is lp.auto:
            temporary_address_space = temp_var.address_space
        elif temp_var.address_space is lp.auto:
            pass
        else:
            raise LoopyError("Existing and new temporary '%s' do not "
                             "have matching scopes (existing: %s, new: %s)" %
                             (temporary_name,
                              AddressSpace.stringify(temp_var.address_space),
                              AddressSpace.stringify(temporary_address_space)))

        temp_var = temp_var.copy(address_space=temporary_address_space)

        # }}}

    new_temporary_variables[temporary_name] = temp_var

    kernel = kernel.copy(temporary_variables=new_temporary_variables)

    # }}}

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type

    if filter_iname_tags_by_type(new_iname_to_tag.values(),
                                 AutoFitLocalIndexTag):
        from loopy.kernel.tools import assign_automatic_axes
        kernel = assign_automatic_axes(kernel)

    return kernel
示例#17
0
文件: __init__.py 项目: shigh/loopy
    def emit_multiple_assignment(self, codegen_state, insn):
        """Emit code for a function call whose results go to several assignees.

        The callee named by ``insn.expression.function`` is resolved through
        ``codegen_state.kernel.mangle_function``. The call parameters are
        converted to the mangled argument dtypes. Every assignee after the
        first is appended to the argument list wrapped in ``&(...)``, and the
        value of the call is assigned to the first assignee.

        :arg codegen_state: supplies the kernel, the expression-to-code
            mapper and the set of seen functions (which is added to here).
        :arg insn: the instruction to emit; its expression is read for
            ``function`` and ``parameters``.
        :returns: a :class:`cgen.ExpressionStatement` if the mangled function
            has no result dtypes, otherwise a :class:`cgen.Assign`.
        :raises RuntimeError: if ``mangle_function`` returns *None*.
        :raises LoopyError: if the inferred type of an assignee after the
            first differs from the corresponding mangled result dtype.
        """
        ecm = codegen_state.expression_to_code_mapper

        from pymbolic import var
        from pymbolic.primitives import Variable
        from pymbolic.mapper.stringifier import PREC_NONE

        func_id = insn.expression.function
        parameters = insn.expression.parameters

        if isinstance(func_id, Variable):
            func_id = func_id.name

        assignee_var_descriptors = [
            codegen_state.kernel.get_var_descriptor(a)
            for a in insn.assignee_var_names()
        ]

        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)

        mangle_result = codegen_state.kernel.mangle_function(
            func_id, par_dtypes)
        if mangle_result is None:
            raise RuntimeError(
                "function '%s' unknown--"
                "maybe you need to register a function mangler?" % func_id)

        assert mangle_result.arg_dtypes is not None

        from loopy.expression import dtype_to_type_context

        # Each parameter is generated in the type context of the dtype the
        # mangled function expects, not the dtype inferred for the parameter.
        c_parameters = [
            ecm(par, PREC_NONE, dtype_to_type_context(self.target, tgt_dtype),
                tgt_dtype).expr
            for par, tgt_dtype in zip(parameters, mangle_result.arg_dtypes)
        ]

        from loopy.codegen import SeenFunction
        codegen_state.seen_functions.add(
            SeenFunction(func_id, mangle_result.target_name,
                         mangle_result.arg_dtypes))

        # The first assignee receives the call's value (handled below); the
        # remaining ones are passed by address. Numbering starts at 2 so that
        # the reported position is the assignee's true 1-based position on
        # the left-hand side.
        for lhs_position, (a, tgt_dtype) in enumerate(
                zip(insn.assignees[1:], mangle_result.result_dtypes[1:]),
                start=2):
            if tgt_dtype != ecm.infer_type(a):
                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
                                 "side of instruction '%s'" %
                                 (lhs_position, insn.id))
            c_parameters.append(
                # TODO Yuck: The "where-at function": &(...)
                var("&")(ecm(a, PREC_NONE,
                             dtype_to_type_context(self.target, tgt_dtype),
                             tgt_dtype).expr))

        result = var(mangle_result.target_name)(*c_parameters)

        # In case of no assignees, we are done
        if len(mangle_result.result_dtypes) == 0:
            from cgen import ExpressionStatement
            return ExpressionStatement(
                CExpression(self.get_c_expression_to_code_mapper(), result))

        result = ecm.wrap_in_typecast(mangle_result.result_dtypes[0],
                                      assignee_var_descriptors[0].dtype,
                                      result)

        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)

        from cgen import Assign
        return Assign(
            lhs_code,
            CExpression(self.get_c_expression_to_code_mapper(), result))
示例#18
0
def get_access_info(target, ary, index, eval_expr, vectorization_info):
    """
    Turn a subscript into *ary* into an :class:`AccessInfo`.

    :arg target: handed to ``ary.vector_size``
    :arg ary: an object of type :class:`ArrayBase`
    :arg index: a tuple of indices representing a subscript into ary
        (anything that is not a tuple is wrapped into a one-element tuple)
    :arg eval_expr: callable used to evaluate indices that have to be
        integer constants
    :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`,
        or *None*.
    """

    import loopy as lp
    from pymbolic import var

    if not isinstance(index, tuple):
        index = (index, )

    # May grow "_s<N>" suffixes below; the helpers read it lazily.
    array_name = ary.name

    def require_constant_int(axis, expr):
        # Evaluate *expr* and insist that the outcome is an integer.
        from pymbolic.mapper.evaluator import UnknownVariableError
        try:
            value = eval_expr(expr)
        except UnknownVariableError as e:
            raise LoopyError(
                "When trying to index the array '%s' along axis "
                "%d (tagged '%s'), the index was not a compile-time "
                "constant (but it has to be in order for code to be "
                "generated). You likely want to unroll the iname(s) '%s'." %
                (ary.name, axis, ary.dim_tags[axis], str(e)))

        if is_integer(value):
            return value

        raise LoopyError("subscript '%s[%s]' has non-constant "
                         "index for separate-array axis %d (0-based)" %
                         (ary.name, index, axis))

    def offset_applied(sub):
        # Add the array's offset (if it has one) to the subscript *sub*.
        offset = ary.offset
        if not offset:
            return sub
        if offset is lp.auto:
            return var(array_name + "_offset") + sub
        if isinstance(offset, str):
            return var(offset) + sub
        # anything else is taken to be an expression
        return offset + sub

    if ary.dim_tags is None:
        if len(index) != 1:
            raise LoopyError(
                "Array '%s' has no known axis implementation "
                "tags and therefore only supports one-dimensional "
                "indexing. (Did you mean 'shape=loopy.auto' instead of "
                "'shape=None'?)" % ary.name)

        return AccessInfo(array_name=array_name,
                          subscripts=(offset_applied(index[0]), ),
                          vector_index=None)

    if len(ary.dim_tags) != len(index):
        raise LoopyError("subscript to '%s[%s]' has the wrong "
                         "number of indices (got: %d, expected: %d)" %
                         (ary.name, index, len(index), len(ary.dim_tags)))

    num_target_axes = ary.num_target_axes()

    vector_index = None
    subscripts = [0] * num_target_axes

    vector_size = ary.vector_size(target)

    # {{{ first pass: separate-array axes determine the array name

    for axis, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
        if not isinstance(dim_tag, SeparateArrayArrayDimTag):
            continue

        sep_index = require_constant_int(axis, idx)
        array_name += "_s%d" % sep_index

    # }}}

    # {{{ second pass: everything else

    for axis, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
        if isinstance(dim_tag, FixedStrideArrayDimTag):
            stride = dim_tag.stride

            if is_integer(stride):
                if dim_tag.stride % vector_size != 0:
                    raise LoopyError(
                        "array '%s' has axis %d stride of "
                        "%d, which is not divisible by the size of the "
                        "vector (%d)" %
                        (ary.name, axis, dim_tag.stride, vector_size))

            elif stride is lp.auto:
                stride = var(array_name + "_stride%d" % axis)

            subscripts[dim_tag.target_axis] += (stride // vector_size) * idx
            continue

        if isinstance(dim_tag, SeparateArrayArrayDimTag):
            # dealt with in the first pass
            continue

        if not isinstance(dim_tag, VectorArrayDimTag):
            raise LoopyError("unsupported array dim implementation tag '%s' "
                             "in array '%s'" % (dim_tag, ary.name))

        from pymbolic.primitives import Variable
        indexed_by_vec_iname = (
            vectorization_info is not None
            and isinstance(idx, Variable)
            and idx.name == vectorization_info.iname)

        if indexed_by_vec_iname:
            # Nothing to do: leaving vector_index unset results
            # in the whole vector being returned.
            continue

        idx = require_constant_int(axis, idx)

        assert vector_index is None
        vector_index = idx

    # }}}

    if ary.offset:
        if num_target_axes > 1:
            raise NotImplementedError("offsets for multiple image axes")

        subscripts[0] = offset_applied(subscripts[0])

    return AccessInfo(array_name=array_name,
                      vector_index=vector_index,
                      subscripts=subscripts)
示例#19
0
def privatize_temporaries_with_inames(kernel,
                                      privatizing_inames,
                                      only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg privatizing_inames: a set of iname names, or a comma-separated
        string of iname names.
    :arg only_var_names: *None* (consider all temporaries), a collection of
        temporary names, or a comma-separated string of temporary names.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if two writers of one temporary require different
        sets of new axes, or if an instruction touches a privatized
        temporary without being inside the required iname(s).

    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(s.strip()
                                       for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(s.strip()
                                   for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    # temporary name -> set of inames for which an axis has to be added
    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in kernel.temporary_variables.values():
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = writer_insn.within_inames & privatizing_inames

            # Inames already occurring in the write do not need a new axis.
            referenced_priv_axis_inames = (
                priv_axis_inames
                & writer_insn.write_dependency_names())

            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames

            if not new_priv_axis_inames:
                # NOTE(review): this stops looking at any remaining writers
                # of this temporary -- confirm that 'continue' is not what
                # was intended.
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError(
                        "instruction '%s' requires adding "
                        "indices for privatizing var '%s' on iname(s) '%s', "
                        "but previous instructions required inames '%s'" %
                        (writer_insn_id, tv.name,
                         ", ".join(new_priv_axis_inames), ", ".join(
                             var_to_new_priv_axis_iname[tv.name])))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    iname_to_lbound = {}
    for priv_axis_inames in var_to_new_priv_axis_iname.values():
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=False))
            iname_to_lbound[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff)

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in var_to_new_priv_axis_iname.items():
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname]
                            for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        # New axes are appended after the existing ones; an axis whose iname
        # carries a VectorizeTag gets the "vec" dim tag, all others get "c".
        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
            shape=shape + extra_shape,
            # Forget what you knew about data layout,
            # create from scratch.
            dim_tags=dim_tags,
            dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = {
        var_name: tuple(var(iname) for iname in inames)
        for var_name, inames in var_to_new_priv_axis_iname.items()
    }

    new_insns = []

    for insn in kernel.instructions:
        # A fresh inserter per instruction, so that seen_priv_axis_inames
        # only reflects what this instruction touched.
        eiii = ExtraInameIndexInserter(var_to_extra_iname, iname_to_lbound)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                "Kernel '%s': Instruction '%s': touched variable that "
                "(for privatization, e.g. as performed for ILP) "
                "required iname(s) '%s', but that the instruction was not "
                "previously within the iname(s). To remedy this, first promote "
                "the instruction into the iname." %
                (kernel.name, insn.id,
                 ", ".join(eiii.seen_priv_axis_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(temporary_variables=new_temp_vars,
                       instructions=new_insns)
示例#20
0
def _parse_array_dim_tag(tag, default_target_axis, nesting_levels):
    """Parse one array axis implementation tag.

    :arg tag: an :class:`ArrayDimImplementationTag` instance (returned
        unchanged) or a string. A trailing ``"?"`` marks the tag as
        optional. Recognized strings are ``"sep"``, ``"vec"``,
        ``"stride:auto"``, ``"stride:<expression>"``, ``"c"``/``"C"``,
        ``"f"``/``"F"``, and an empty remainder together with an explicit
        nesting level. A nesting-level prefix, a target-axis suffix and a
        padding specification are split off using ``NESTING_LEVEL_RE``,
        ``TARGET_AXIS_RE`` and ``PADDED_STRIDE_TAG_RE``.
    :arg default_target_axis: target axis used if the tag does not name one.
    :arg nesting_levels: a mapping from target axis to the nesting levels
        already used on that axis; consulted to pick a nesting level for
        ``"c"`` (smallest used level minus one) and ``"f"`` (largest used
        level plus one), with 0 used if none are recorded.
    :returns: a tuple ``(has_explicit_nesting_level, is_optional,
        parsed_tag)``.
    :raises TypeError: if *tag* is neither a tag object nor a string.
    :raises LoopyError: if the tag is not understood, or if ``"c"``/``"f"``
        order is combined with an explicit nesting level.
    """
    if isinstance(tag, ArrayDimImplementationTag):
        return False, False, tag

    if not isinstance(tag, str):
        raise TypeError("arg dimension implementation tag must be "
                        "string or tag object")

    tag = tag.strip()
    is_optional = False
    if tag.endswith("?"):
        tag = tag[:-1]
        is_optional = True

    # Kept for error messages, since 'tag' is consumed piecewise below.
    orig_tag = tag

    if tag == "sep":
        return False, is_optional, SeparateArrayArrayDimTag()
    elif tag == "vec":
        return False, is_optional, VectorArrayDimTag()

    nesting_level_match = NESTING_LEVEL_RE.match(tag)

    if nesting_level_match is not None:
        nesting_level = int(nesting_level_match.group(1))
        tag = nesting_level_match.group(2)
        if tag is None:
            tag = ""
    else:
        nesting_level = None

    has_explicit_nesting_level = nesting_level is not None

    target_axis_match = TARGET_AXIS_RE.search(tag)

    if target_axis_match is not None:
        target_axis = int(target_axis_match.group(1))
        tag = tag[:target_axis_match.start()]
    else:
        target_axis = default_target_axis

    ta_nesting_levels = nesting_levels.get(target_axis, [])

    if tag.startswith("stride:"):
        fixed_stride_descr = tag[7:]
        if fixed_stride_descr.strip() == "auto":
            import loopy as lp
            return (has_explicit_nesting_level, is_optional,
                    FixedStrideArrayDimTag(lp.auto,
                                           target_axis,
                                           layout_nesting_level=nesting_level))
        else:
            from loopy.symbolic import parse
            return (has_explicit_nesting_level, is_optional,
                    FixedStrideArrayDimTag(parse(fixed_stride_descr),
                                           target_axis,
                                           layout_nesting_level=nesting_level))

    else:
        padded_stride_match = PADDED_STRIDE_TAG_RE.match(tag)
        if padded_stride_match is not None:
            tag = padded_stride_match.group(1)

            from loopy.symbolic import parse
            pad_to = parse(padded_stride_match.group(2))
        else:
            pad_to = None

        if tag in ["c", "C"]:
            if nesting_level is not None:
                raise LoopyError(
                    "may not specify 'C' array order with explicit "
                    "layout nesting level")

            if ta_nesting_levels:
                nesting_level = min(ta_nesting_levels) - 1
            else:
                nesting_level = 0

        elif tag in ["f", "F"]:
            if nesting_level is not None:
                raise LoopyError(
                    "may not specify 'F' array order with explicit "
                    "layout nesting level")

            if ta_nesting_levels:
                nesting_level = max(ta_nesting_levels) + 1
            else:
                nesting_level = 0

        elif tag == "":
            # Only a bare nesting level (plus optional target axis/padding)
            # was given; that is fine as long as the level is present.
            if nesting_level is None:
                raise LoopyError("invalid dim tag: '%s'" % orig_tag)

        else:
            raise LoopyError("invalid dim tag: '%s'" % orig_tag)

        return (has_explicit_nesting_level, is_optional,
                ComputedStrideArrayDimTag(nesting_level,
                                          pad_to=pad_to,
                                          target_axis=target_axis))
示例#21
0
    def __init__(self,
                 id,
                 depends_on,
                 depends_on_is_final,
                 groups,
                 conflicts_with_groups,
                 no_sync_with,
                 within_inames_is_final,
                 within_inames,
                 priority,
                 boostable,
                 boostable_into,
                 predicates,
                 tags,
                 insn_deps=None,
                 insn_deps_is_final=None,
                 forced_iname_deps=None,
                 forced_iname_deps_is_final=None):
        """Normalize the arguments and store them on the record.

        Arguments passed as *None* are replaced by defaults: an empty
        :class:`frozenset` for *depends_on*, *groups*,
        *conflicts_with_groups*, *no_sync_with*, *within_inames*,
        *predicates* and *tags*; *False* for the ``*_is_final`` flags; and 0
        for *priority*. *boostable* and *boostable_into* are stored as given.

        :arg depends_on: a :class:`frozenset` of instruction IDs, or a
            comma-separated string of them.
        :arg predicates: an iterable of expressions or strings; strings are
            parsed, and a leading ``"!"`` (deprecated) is turned into a
            logical negation.
        :arg tags: any iterable; converted to a :class:`frozenset`.
        :arg insn_deps: deprecated alias of *depends_on*.
        :arg insn_deps_is_final: deprecated alias of *depends_on_is_final*.
        :arg forced_iname_deps: deprecated alias of *within_inames*.
        :arg forced_iname_deps_is_final: deprecated alias of
            *within_inames_is_final*.
        :raises LoopyError: if both a deprecated alias and its replacement
            are given, or if *depends_on_is_final* is true while *depends_on*
            is not a :class:`frozenset`.
        """

        # {{{ backwards compatibility goop

        if depends_on is not None and insn_deps is not None:
            raise LoopyError("may not specify both insn_deps and depends_on")
        elif insn_deps is not None:
            warn("insn_deps is deprecated, use depends_on",
                 DeprecationWarning,
                 stacklevel=2)

            depends_on = insn_deps
            depends_on_is_final = insn_deps_is_final

        if forced_iname_deps is not None and within_inames is not None:
            raise LoopyError("may not specify both forced_iname_deps "
                             "and within_inames")
        elif forced_iname_deps is not None:
            warn("forced_iname_deps is deprecated, use within_inames",
                 DeprecationWarning,
                 stacklevel=2)

            within_inames = forced_iname_deps
            within_inames_is_final = forced_iname_deps_is_final

        if predicates is None:
            predicates = frozenset()

        new_predicates = set()
        for pred in predicates:
            if isinstance(pred, str):
                from pymbolic.primitives import LogicalNot
                from loopy.symbolic import parse
                if pred.startswith("!"):
                    # Same category and stacklevel as the other deprecation
                    # warnings issued above.
                    warn("predicates starting with '!' are deprecated. "
                         "Simply use 'not' instead",
                         DeprecationWarning,
                         stacklevel=2)
                    pred = LogicalNot(parse(pred[1:]))
                else:
                    pred = parse(pred)

            new_predicates.add(pred)

        predicates = frozenset(new_predicates)
        del new_predicates

        # }}}

        if depends_on is None:
            depends_on = frozenset()

        if groups is None:
            groups = frozenset()

        if conflicts_with_groups is None:
            conflicts_with_groups = frozenset()

        if no_sync_with is None:
            no_sync_with = frozenset()

        if within_inames is None:
            within_inames = frozenset()

        if within_inames_is_final is None:
            within_inames_is_final = False

        if isinstance(depends_on, str):
            # Comma-separated list of IDs; empty entries are dropped.
            depends_on = frozenset(s.strip() for s in depends_on.split(",")
                                   if s.strip())

        if depends_on_is_final is None:
            depends_on_is_final = False

        if depends_on_is_final and not isinstance(depends_on, frozenset):
            raise LoopyError("Setting depends_on_is_final to True requires "
                             "actually specifying depends_on")

        if tags is None:
            tags = frozenset()

        if priority is None:
            priority = 0

        if not isinstance(tags, frozenset):
            # was previously allowed to be tuple
            tags = frozenset(tags)

        # Periodically reenable these and run the tests to ensure all
        # performance-relevant identifiers are interned.
        #
        # from loopy.tools import is_interned
        # assert is_interned(id)
        # assert all(is_interned(dep) for dep in depends_on)
        # assert all(is_interned(grp) for grp in groups)
        # assert all(is_interned(grp) for grp in conflicts_with_groups)
        # assert all(is_interned(iname) for iname in within_inames)
        # assert all(is_interned(pred) for pred in predicates)

        assert isinstance(within_inames, frozenset)
        assert isinstance(depends_on, frozenset) or depends_on is None
        assert isinstance(groups, frozenset)
        assert isinstance(conflicts_with_groups, frozenset)

        ImmutableRecord.__init__(self,
                                 id=id,
                                 depends_on=depends_on,
                                 depends_on_is_final=depends_on_is_final,
                                 no_sync_with=no_sync_with,
                                 groups=groups,
                                 conflicts_with_groups=conflicts_with_groups,
                                 within_inames_is_final=within_inames_is_final,
                                 within_inames=within_inames,
                                 priority=priority,
                                 boostable=boostable,
                                 boostable_into=boostable_into,
                                 predicates=predicates,
                                 tags=tags)
示例#22
0
def parse_array_dim_tags(dim_tags,
                         n_axes=None,
                         use_increasing_target_axes=False,
                         dim_names=None):
    """Parse user-supplied array dimension tags into a list of tag objects.

    :arg dim_tags: a comma-separated string, an iterable of individual tag
        specifications, or a dictionary mapping entries of *dim_names* to
        tag specifications.
    :arg n_axes: the number of array axes, if known. Must equal
        ``len(dim_names)`` when *dim_tags* is a dictionary. Also used to
        decide whether tags reported as optional by
        ``_parse_array_dim_tag`` are dropped.
    :arg use_increasing_target_axes: if true, the default target axis handed
        to ``_parse_array_dim_tag`` grows by one for every tag that is kept.
    :arg dim_names: axis names; only consulted when *dim_tags* is a
        dictionary.
    :returns: a list of parsed tags. Explicit layout nesting levels are
        shifted so that the lowest level on each target axis is zero.
    :raises LoopyError: if a dictionary is passed without *dim_names*, names
        an unknown axis or leaves an axis untagged; if C/F tags are mixed
        with explicit nesting levels on one target axis; or if nesting
        levels on a target axis are repeated or not contiguous.
    """
    if isinstance(dim_tags, str):
        dim_tags = dim_tags.split(",")

    if isinstance(dim_tags, dict):
        tags_by_name = dim_tags

        if dim_names is None:
            raise LoopyError("dim_tags may only be given as a dictionary if "
                             "dim_names is available")

        assert n_axes == len(dim_names)

        # Re-order the dictionary entries into axis order.
        dim_tags = [None] * n_axes
        for axis_name, axis_tag in tags_by_name.items():
            try:
                axis_idx = dim_names.index(axis_name)
            except ValueError:
                raise LoopyError("'%s' does not name an array axis" % axis_name)

            dim_tags[axis_idx] = axis_tag

        untagged_axes = [
            axis_idx for axis_idx, axis_tag in enumerate(dim_tags)
            if axis_tag is None]
        if untagged_axes:
            raise LoopyError(
                "array axis tag for axis %d (1-based) was not "
                "set by passed dictionary" % (untagged_axes[0] + 1))

    next_default_target_axis = 0

    parsed_tags = []

    # target axis -> list of layout nesting levels already in use
    levels_by_target_axis = {}

    # target axis -> whether its tags spell out nesting levels explicitly
    explicit_by_target_axis = {}

    for iaxis, raw_tag in enumerate(dim_tags):
        has_explicit_level, is_optional, tag = _parse_array_dim_tag(
            raw_tag, next_default_target_axis, levels_by_target_axis)

        # Drop an optional tag if the tags kept so far plus the ones still
        # to be processed (this one included) exceed the known axis count.
        n_remaining = len(dim_tags) - iaxis
        if (is_optional and n_axes is not None
                and len(parsed_tags) + n_remaining > n_axes):
            continue

        if isinstance(tag, _StrideArrayDimTagBase):
            target_axis = tag.target_axis

            # {{{ check for C/F mixed with explicit layout nesting level specs

            # The first tag seen on a target axis fixes the convention.
            established = explicit_by_target_axis.setdefault(
                target_axis, has_explicit_level)
            if has_explicit_level != established:
                raise LoopyError(
                    "may not mix C/F dim_tag specifications with "
                    "explicit specification of layout nesting levels")

            # }}}

            level = tag.layout_nesting_level
            if level is not None:
                used_levels = levels_by_target_axis.setdefault(target_axis, [])
                if level in used_levels:
                    raise LoopyError("layout nesting level %d is not unique"
                                     " in target axis %d" % (level, target_axis))

                used_levels.append(level)

        parsed_tags.append(tag)

        if use_increasing_target_axes:
            next_default_target_axis += 1

    # {{{ check contiguity of nesting levels

    for target_axis, used_levels in levels_by_target_axis.items():
        lowest = min(used_levels)
        expected = list(range(lowest, lowest + len(used_levels)))
        if sorted(used_levels) != expected:
            raise LoopyError(
                "layout nesting levels '%s' "
                "for target axis %d not contiguous" %
                (",".join(str(nl) for nl in used_levels), target_axis))

        # Shift this axis' levels so that they start at zero.
        shift = -lowest
        for itag, tag in enumerate(parsed_tags):
            if (isinstance(tag, _StrideArrayDimTagBase)
                    and tag.target_axis == target_axis
                    and tag.layout_nesting_level is not None):
                parsed_tags[itag] = tag.copy(
                    layout_nesting_level=tag.layout_nesting_level + shift)

    # }}}

    return parsed_tags
示例#23
0
文件: check.py 项目: yueyedeai/loopy
    def map_subscript(self, expr):
        """Check that an array subscript stays within the declared shape.

        The array is looked up among the kernel's arguments and temporary
        variables. Nothing is checked if the array is not found there, has
        no shape, if the subscript or shape refer to variables that are not
        part of ``self.domain``, or if the access range cannot be
        determined.

        :raises LoopyError: if the number of indices does not match the
            shape, or if the access range cannot be shown to lie inside
            the shape.
        """
        WalkMapper.map_subscript(self, expr)

        from pymbolic.primitives import Variable
        assert isinstance(expr.aggregate, Variable)

        ary_name = expr.aggregate.name
        if ary_name in self.kernel.arg_dict:
            shape = self.kernel.arg_dict[ary_name].shape
        elif ary_name in self.kernel.temporary_variables:
            shape = self.kernel.temporary_variables[ary_name].shape
        else:
            shape = None

        if shape is None:
            # Unknown array or unknown shape: nothing to compare against.
            return

        indices = expr.index
        if not isinstance(indices, tuple):
            indices = (indices,)

        from loopy.symbolic import (get_dependencies, get_access_range,
                UnableToDetermineAccessRange)

        known_vars = set(self.domain.get_var_dict())
        shape_deps = set().union(*(
            get_dependencies(axis_len)
            for axis_len in shape
            if axis_len is not None))

        # Both the indices and the shape must be expressible in terms of
        # the variables of the domain for the check to be possible.
        if (not get_dependencies(indices) <= known_vars
                or not shape_deps <= known_vars):
            return

        if len(indices) != len(shape):
            raise LoopyError("subscript to '%s' in '%s' has the wrong "
                    "number of indices (got: %d, expected: %d)" % (
                        ary_name, expr,
                        len(indices), len(shape)))

        try:
            access_range = get_access_range(self.domain, indices,
                    self.kernel.assumptions)
        except UnableToDetermineAccessRange:
            # Likely: index was non-affine, nothing we can do.
            return

        # Build the set of in-bounds index tuples, one slab per axis whose
        # length is known.
        shape_domain = isl.BasicSet.universe(access_range.get_space())
        for idim in range(len(indices)):
            axis_len = shape[idim]
            if axis_len is None:
                continue

            from loopy.isl_helpers import make_slab
            axis_slab = make_slab(
                    shape_domain.get_space(), (dim_type.in_, idim),
                    0, axis_len)

            shape_domain = shape_domain.intersect(axis_slab)

        if not access_range.is_subset(shape_domain):
            raise LoopyError("'%s' in instruction '%s' "
                    "accesses out-of-bounds array element (could not"
                    " establish '%s' is a subset of '%s')."
                    % (expr, self.insn_id, access_range, shape_domain))
示例#24
0
def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
                                       shape, dim_tags):
    """Replace computed-stride dim tags by fixed-stride ones where possible.

    Terminology:

    - user axes are user-facing--what the user actually uses for indexing.

    - target axes are implementation facing. Normal in-memory arrays have one.
      3D images have three.

    :arg name: name of the array, used in error messages only.
    :arg num_user_axes: not used by this function.
    :arg num_target_axes: number of target axes to process.
    :arg shape: the array shape, *None*, or :class:`loopy.auto`.
    :arg dim_tags: sequence of parsed array dim tags, one per user axis.
    :returns: a copy of *dim_tags* in which every
        :class:`ComputedStrideArrayDimTag` has been replaced by a
        :class:`FixedStrideArrayDimTag`, or *None* if that requires a
        shape and none is known.
    :raises LoopyError: if more than one axis is vector-tagged, if an
        unknown kind of dim tag is encountered, or if a computed stride
        is nested outside of an axis whose stride is not known.
    :raises TypeError: if the length of the vector axis is not an integer.
    """

    import loopy as lp

    # {{{ pick apart arg dim tags into computed, fixed and vec

    vector_dim = None

    # a mapping from target axes to {layout_nesting_level: dim_tag_index}
    target_axis_to_nesting_level_map = {}

    for i, dim_tag in enumerate(dim_tags):
        if isinstance(dim_tag, VectorArrayDimTag):
            if vector_dim is not None:
                raise LoopyError("arg '%s' may only have one vector-tagged "
                                 "argument dimension" % name)

            vector_dim = i

        elif isinstance(dim_tag, _StrideArrayDimTagBase):
            if dim_tag.layout_nesting_level is None:
                continue

            nl_map = target_axis_to_nesting_level_map \
                    .setdefault(dim_tag.target_axis, {})
            assert dim_tag.layout_nesting_level not in nl_map
            nl_map[dim_tag.layout_nesting_level] = i

        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
            pass

        else:
            raise LoopyError("invalid array dim tag")

    # }}}

    # {{{ convert computed to fixed stride dim tags

    new_dim_tags = dim_tags[:]

    for target_axis in range(num_target_axes):
        # The innermost stride is 1, unless there is a vector axis, in
        # which case it is the (padded) vector length.
        if vector_dim is None:
            stride_so_far = 1
        else:
            if shape is None or shape is lp.auto:
                # unable to normalize without known shape
                return None

            if not is_integer(shape[vector_dim]):
                raise TypeError(
                    "shape along vector axis %d of array '%s' "
                    "must be an integer, not an expression ('%s')" %
                    (vector_dim, name, shape[vector_dim]))

            stride_so_far = shape[vector_dim]
            # FIXME: OpenCL-specific
            if stride_so_far == 3:
                stride_so_far = 4

        nesting_level_map = target_axis_to_nesting_level_map.get(
            target_axis, {})
        nl_keys = sorted(nesting_level_map.keys())

        if not nl_keys:
            continue

        # Walk the axes from the lowest nesting level upwards, accumulating
        # the stride as we go.
        for key in nl_keys:
            dim_tag_index = nesting_level_map[key]
            dim_tag = dim_tags[dim_tag_index]

            if isinstance(dim_tag, ComputedStrideArrayDimTag):
                if stride_so_far is None:
                    raise LoopyError(
                        "unable to determine fixed stride "
                        "for axis %d because it is nested outside of "
                        "an 'auto' stride axis" % dim_tag_index)

                new_dim_tags[dim_tag_index] = FixedStrideArrayDimTag(
                    stride_so_far,
                    target_axis=dim_tag.target_axis,
                    layout_nesting_level=dim_tag.layout_nesting_level)

                if shape is None or shape is lp.auto:
                    # unable to normalize without known shape
                    return None

                shape_axis = shape[dim_tag_index]
                if shape_axis is None:
                    stride_so_far = None
                else:
                    stride_so_far *= shape_axis

                # Padding only makes sense for a known stride; an unknown
                # one is reported by the check above if it is ever needed.
                if dim_tag.pad_to is not None and stride_so_far is not None:
                    from pytools import div_ceil
                    # Round the accumulated stride up to the next multiple
                    # of pad_to.
                    stride_so_far = (div_ceil(stride_so_far, dim_tag.pad_to) *
                                     dim_tag.pad_to)

            elif isinstance(dim_tag, FixedStrideArrayDimTag):
                stride_so_far = dim_tag.stride

                if stride_so_far is lp.auto:
                    stride_so_far = None

            else:
                raise TypeError("internal error in dim_tag conversion")

    # }}}

    return new_dim_tags
示例#25
0
文件: check.py 项目: yueyedeai/loopy
def check_implemented_domains(kernel, implemented_domains, code=None):
    """Verify that each instruction is executed on exactly its desired domain.

    :arg kernel: the kernel whose instructions are being checked.
    :arg implemented_domains: a mapping from instruction ID to a non-empty
        sequence of domains on which the instruction is executed. Their
        union is compared against the domain obtained from the kernel.
    :arg code: if not *None*, printed (highlighted) before a mismatch is
        reported.
    :returns: *True* if no mismatch was found.
    :raises LoopyError: on the first instruction whose implemented and
        desired domains differ. The message includes sample points and
        gists for each direction of the difference.
    """
    from islpy import dim_type

    from islpy import align_two

    prev_idomains = None
    prev_insn_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]

        assert idomains

        insn_inames = kernel.insn_inames(insn)

        # {{{ if we've checked the same thing before, no need to check it again

        same_as_previous = (
                prev_idomains is not None
                and prev_insn_inames is not None
                and idomains == prev_idomains
                and insn_inames == prev_insn_inames)
        if same_as_previous:
            continue

        prev_idomains = idomains
        prev_insn_inames = insn_inames

        # }}}

        # {{{ implemented domain: union of all pieces, under assumptions

        impl_domain = idomains[0]
        for further_domain in idomains[1:]:
            impl_domain = impl_domain | further_domain

        assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
        assumptions, impl_domain = align_two(
                assumption_non_param, impl_domain)
        impl_domain = impl_domain & assumptions
        impl_domain = impl_domain.project_out_except(
                insn_inames, [dim_type.set])

        # }}}

        from loopy.kernel.instruction import BarrierInstruction
        from loopy.kernel.data import LocalIndexTag
        is_barrier = isinstance(insn, BarrierInstruction)

        if is_barrier:
            # project out local-id-mapped inames, solves #94 on gitlab
            non_lid_inames = frozenset(
                iname for iname in insn_inames
                if not kernel.iname_tags_of_type(iname, LocalIndexTag))
            impl_domain = impl_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        # {{{ desired domain: taken from the kernel, under assumptions

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        assumptions, insn_domain = align_two(assumption_non_param, insn_domain)

        desired_domain = insn_domain & assumptions
        desired_domain = desired_domain.project_out_except(
                insn_inames, [dim_type.set])
        desired_domain = desired_domain.project_out_except(
                insn_parameters, [dim_type.param])

        # }}}

        if is_barrier:
            # project out local-id-mapped inames, solves #94 on gitlab
            desired_domain = desired_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        impl_domain = impl_domain.project_out_except(
                insn_parameters, [dim_type.param])
        impl_domain, desired_domain = align_two(impl_domain, desired_domain)

        if not (impl_domain != desired_domain):
            continue

        # {{{ mismatch: assemble a diagnostic and bail out

        i_minus_d = impl_domain - desired_domain
        d_minus_i = desired_domain - impl_domain

        parameter_inames = {
                insn_domain.get_dim_name(dim_type.param, iparam)
                for iparam in range(impl_domain.dim(dim_type.param))}

        directions = [
                ("implemented", "desired", i_minus_d,
                    desired_domain.gist(impl_domain)),
                ("desired", "implemented", d_minus_i,
                    impl_domain.gist(desired_domain))]

        report = []
        for bigger, smaller, diff_set, gist_domain in directions:
            if diff_set.is_empty():
                continue

            diff_set = diff_set.coalesce()
            pt = diff_set.sample_point()
            assert not pt.is_void()

            #pt_set = isl.Set.from_point(pt)
            #report.append("point implemented: %s" % (pt_set <= impl_domain))
            #report.append("point desired: %s" % (pt_set <= desired_domain))

            iname_to_dim = pt.get_space().get_var_dict()
            coordinates = []
            for iname in kernel.insn_inames(insn) | parameter_inames:
                dtype_, pos = iname_to_dim[iname]
                coordinates.append("%s=%d" % (
                    iname, pt.get_coordinate_val(dtype_, pos).to_python()))

            report.append(
                    "sample point in %s but not %s: %s" % (
                        bigger, smaller, ", ".join(coordinates)))
            report.append(
                    "gist of constraints in %s but not %s: %s" % (
                        smaller, bigger, gist_domain))

        if code is not None:
            separator = 79*"-"
            print(separator)
            print("CODE:")
            print(separator)
            from loopy.target.execution import get_highlighted_code
            print(get_highlighted_code(code))
            print(separator)

        raise LoopyError("sanity check failed--implemented and desired "
                "domain for instruction '%s' do not match\n\n"
                "implemented: %s\n\n"
                "desired:%s\n\n%s"
                % (insn_id, impl_domain, desired_domain, "\n".join(report)))

        # }}}

    # placate the assert at the call site
    return True
示例#26
0
    def __init__(self,
                 name,
                 dtype=None,
                 shape=None,
                 dim_tags=None,
                 offset=0,
                 dim_names=None,
                 strides=None,
                 order=None,
                 for_atomic=False,
                 target=None,
                 alignment=None,
                 **kwargs):
        """
        All of the following (except *name*) are optional.
        Specify either strides or shape.

        :arg name: When passed to :class:`loopy.make_kernel`, this may contain
            multiple names separated by commas, in which case multiple arguments,
            each with identical properties, are created for each name.

        :arg dtype: Anything accepted by :func:`loopy.types.to_loopy_type`,
            or *None* if unspecified. Passing :class:`loopy.auto` is
            deprecated and treated like *None*.

        :arg shape: May be any of the things specified under :attr:`shape`,
            or a string which can be parsed into the previous form.

        :arg dim_tags: A comma-separated list of tags as understood by
            :func:`loopy.kernel.array.parse_array_dim_tags`.
            May not be combined with explicit *strides*.

        :arg dim_names: A tuple of unique strings naming the axes, or *None*.
            Passing a non-tuple is deprecated.

        :arg strides: May be one of the following:

            * None

            * :class:`loopy.auto`. The strides will be determined by *order*
              and the access footprint.

            * a tuple like :attr:`numpy.ndarray.shape`.

              Each entry of the tuple is also allowed to be a :mod:`pymbolic`
              expression involving kernel parameters, or a string that can
              be parsed to such an expression.

            * A (potentially comma-separated) string which can be parsed
              into the previous form.

        :arg order: "F" or "C" for C (row major) or Fortran
            (column major). Defaults to the *default_order* argument
            passed to :func:`loopy.make_kernel`.
        :arg for_atomic:
            Whether the array is declared for atomic access, and, if necessary,
            using atomic-capable data types.
        :arg target: passed on to :func:`loopy.types.to_loopy_type`.
        :arg offset: (See :attr:`offset`)
        :arg alignment: memory alignment in bytes

        :raises TypeError: if an extra keyword argument is not listed in
            ``allowed_extra_kwargs``, or if both *strides* and *dim_tags*
            are given.
        :raises LoopyError: if *dim_names* are not unique strings, if the
            number of axes implied by the arguments is contradictory, or
            if the target axes are non-contiguous or unsupported in number.
        """

        for kwarg_name in kwargs:
            if kwarg_name not in self.allowed_extra_kwargs:
                raise TypeError("invalid kwarg: %s" % kwarg_name)

        import loopy as lp

        from loopy.types import to_loopy_type
        dtype = to_loopy_type(dtype,
                              allow_auto=True,
                              allow_none=True,
                              for_atomic=for_atomic,
                              target=target)

        if dtype is lp.auto:
            from warnings import warn
            warn(
                "Argument/temporary data type for '%s' should be None if "
                "unspecified, not auto. This usage will be disallowed in 2018."
                % name,
                DeprecationWarning,
                stacklevel=2)

            dtype = None

        strides_known = strides is not None and strides is not lp.auto
        shape_known = shape is not None and shape is not lp.auto

        if strides_known:
            strides = _parse_shape_or_strides(strides)

        if shape_known:
            shape = _parse_shape_or_strides(shape)

        # {{{ check dim_names

        if dim_names is not None:
            if len(dim_names) != len(set(dim_names)):
                raise LoopyError("dim_names are not unique")

            for n in dim_names:
                if not isinstance(n, str):
                    raise LoopyError("found non-string '%s' in dim_names" %
                                     type(n).__name__)

        # }}}

        # {{{ convert strides to dim_tags (Note: strides override order)

        if dim_tags is not None and strides_known:
            raise TypeError("may not specify both strides and dim_tags")

        if dim_tags is None and strides_known:
            dim_tags = [FixedStrideArrayDimTag(s) for s in strides]
            strides = None

        # }}}

        if dim_tags is not None:
            dim_tags = parse_array_dim_tags(
                dim_tags,
                n_axes=(len(shape) if shape_known else None),
                use_increasing_target_axes=self.max_target_axes > 1,
                dim_names=dim_names)

        # {{{ determine number of user axes

        # shape, dim_tags and dim_names each imply an axis count; all that
        # are available must agree.
        num_user_axes = None
        if shape_known:
            num_user_axes = len(shape)
        for dim_iterable in [dim_tags, dim_names]:
            if dim_iterable is not None:
                new_num_user_axes = len(dim_iterable)

                if num_user_axes is None:
                    num_user_axes = new_num_user_axes
                else:
                    if new_num_user_axes != num_user_axes:
                        raise LoopyError(
                            "contradictory values for number of "
                            "dimensions of array '%s' from shape, strides, "
                            "dim_tags, or dim_names" % name)

                del new_num_user_axes

        # }}}

        # {{{ convert order to dim_tags

        if order is None and self.max_target_axes > 1:
            # FIXME: Hackety hack. ImageArgs need to generate dim_tags even
            # if no order is specified. Plus they don't care that much.
            order = "C"

        if dim_tags is None and num_user_axes is not None and order is not None:
            dim_tags = parse_array_dim_tags(
                num_user_axes * [order],
                n_axes=num_user_axes,
                use_increasing_target_axes=self.max_target_axes > 1,
                dim_names=dim_names)
            order = None

        # }}}

        if dim_tags is not None:
            # {{{ find number of target axes

            target_axes = set()
            for dim_tag in dim_tags:
                if isinstance(dim_tag, _StrideArrayDimTagBase):
                    target_axes.add(dim_tag.target_axis)

            # NOTE: Use the *name* argument in the messages below:
            # attributes of *self* have not been set yet at this point.
            if target_axes != set(range(len(target_axes))):
                raise LoopyError("target axes for variable '%s' are non-"
                                 "contiguous" % name)

            num_target_axes = len(target_axes)
            del target_axes

            # }}}

            if not (self.min_target_axes <= num_target_axes <=
                    self.max_target_axes):
                raise LoopyError(
                    "%s only supports between %d and %d target axes "
                    "('%s' has %d)" %
                    (type(self).__name__, self.min_target_axes,
                     self.max_target_axes, name, num_target_axes))

            new_dim_tags = convert_computed_to_fixed_dim_tags(
                name, num_user_axes, num_target_axes, shape, dim_tags)

            if new_dim_tags is not None:
                # successfully normalized
                dim_tags = new_dim_tags
                del new_dim_tags

        if dim_tags is not None:
            # for hashability
            dim_tags = tuple(dim_tags)
            order = None

        if strides is not None:
            # Preserve strides if we weren't able to process them yet.
            # That only happens if they're set to loopy.auto (and 'guessed'
            # in loopy.kernel.creation).

            kwargs["strides"] = strides

        if dim_names is not None and not isinstance(dim_names, tuple):
            from warnings import warn
            warn("dim_names is not a tuple when calling ArrayBase constructor",
                 DeprecationWarning,
                 stacklevel=2)

        ImmutableRecord.__init__(self,
                                 name=name,
                                 dtype=dtype,
                                 shape=shape,
                                 dim_tags=dim_tags,
                                 offset=offset,
                                 dim_names=dim_names,
                                 order=order,
                                 alignment=alignment,
                                 for_atomic=for_atomic,
                                 **kwargs)
示例#27
0
def auto_test_vs_ref(ref_knl,
                     ctx,
                     test_knl=None,
                     op_count=None,
                     op_label=None,
                     parameters=None,
                     print_ref_code=False,
                     print_code=True,
                     warmup_rounds=2,
                     dump_binary=False,
                     fills_entire_output=None,
                     do_check=True,
                     check_result=None,
                     max_test_kernel_count=1,
                     quiet=False,
                     blacklist_ref_vendors=None):
    """Compare results of *ref_knl* to the kernels generated by
    scheduling *test_knl*.

    :arg ref_knl: the kernel providing the reference results.
    :arg ctx: the OpenCL context in which the test kernels are run.
    :arg test_knl: the kernel to test. If *None*, *ref_knl* is used and
        result checking is turned off.
    :arg op_count: a list of operation counts, used together with
        *op_label* to print rates. Defaults to an empty list.
    :arg op_label: a list of labels matching *op_count*. Defaults to an
        empty list.
    :arg parameters: a dictionary passed on to the argument-generation
        helpers. Defaults to an empty dictionary.
    :arg print_ref_code: whether to print the reference code
        (unless *quiet*).
    :arg print_code: whether to print the test code (unless *quiet*).
    :arg warmup_rounds: number of untimed runs per test kernel. Results are
        checked during warmup.
    :arg dump_binary: whether to print the program binary (unless *quiet*).
    :arg fills_entire_output: deprecated.
    :arg do_check: whether to compare results against the reference.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :arg quiet: suppress printed output.
    :arg blacklist_ref_vendors: passed on to the helper enumerating
        devices for the reference run. Defaults to an empty list.
    :returns: a dictionary with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall`` and ``timing_rounds``
        for the last test kernel that was run, plus ``ref_elapsed_event``
        and ``ref_elapsed_wall`` if *do_check* is true.
    :raises LoopyError: if the argument lists of the two kernels disagree
        or no device could be found for the reference computation.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    """

    import pyopencl as cl
    from warnings import warn

    # Avoid mutable default arguments: create fresh containers per call.
    if op_count is None:
        op_count = []
    if op_label is None:
        op_label = []
    if parameters is None:
        parameters = {}
    if blacklist_ref_vendors is None:
        blacklist_ref_vendors = []

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                         "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated",
             DeprecationWarning,
             stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
        isinstance(arg, ImageArg) for arg in ref_knl.args)

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors,
                                                  need_ref_image_support):

        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
            ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used for the reference.
        for sched_knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = sched_knl
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # Remember the failure and try the next device.
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev, 75 * "-",
                    traceback.format_exc(), 75 * "-"
                ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END -
                                    ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                         "reference computation.\n"
                         "These errors were encountered:\n" +
                         "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED, KernelState.LINEARIZED
    ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel, kernel_info.implemented_data_info, queue,
                         ref_arg_data, parameters)

        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        # Log under the name of the kernel under test, not a variable left
        # over from the reference run.
        logger.info("%s: run warmup" % (kernel.name))

        for _warmup_round in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                        arg_desc.ref_storage_array.get(),
                        shape=arg_desc.ref_shape,
                        strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                        arg_desc.test_storage_array.get(),
                        shape=arg_desc.test_shape,
                        strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        timing_rounds = max(warmup_rounds, 1)

        # Repeat with four times as many rounds until an attempt takes at
        # least 0.3 s of wall time.
        while True:
            # Start each attempt with a fresh event list so that the
            # event-based time only covers the rounds it is divided by.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _timing_round in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START -
                                         1e-9 * evt_start.profile.START) /
                                        timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl)

        if not quiet:

            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                  "(%d rounds)%s" %
                  (format_float_or_none(elapsed_event),
                   format_float_or_none(elapsed_event_marker),
                   format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                    ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
示例#28
0
文件: data.py 项目: shwina/loopy
def alias_temporaries(knl, names, base_name_prefix=None,
        synchronize_for_exclusive_use=True):
    """Make all temporaries listed in *names* share a single piece of
    backing storage.

    :arg names: the names of the temporaries to alias. Positions are looked
        up via ``names.index``, so this needs to be a sequence such as a
        :class:`list`.
    :arg base_name_prefix: an identifier used as the prefix from which the
        (unique) name of the common storage area is generated. Defaults to
        ``"temp_storage"``.
    :arg synchronize_for_exclusive_use: A :class:`bool`. If ``True``, each
        instruction that refers to one of the temporaries is additionally
        placed into a group specific to that temporary and marked as
        conflicting with the groups of all the other temporaries being
        aliased. This is intended to ensure that the live ranges (i.e. the
        regions of code where each of the temporaries is used) do not
        overlap, which allows two (or more) temporaries to share the same
        storage space as long as their live ranges do not need to be
        concurrent.
    :returns: a copy of *knl* with updated instructions and temporary
        variables.
    :raises LoopyError: if (with *synchronize_for_exclusive_use*) an
        instruction refers to more than one of the temporaries, or if one
        of the temporaries already has a base storage assigned.

    .. versionchanged:: 2016.3

        Added *synchronize_for_exclusive_use* flag.
        ``synchronize_for_exclusive_use=True`` was the previous default
        behavior.
    """
    # One group name per temporary, in the order given by *names*.
    make_group_name = knl.get_group_name_generator()
    group_names = [make_group_name("tmpgrp_"+name) for name in names]

    storage_prefix = (
            "temp_storage" if base_name_prefix is None else base_name_prefix)
    base_name = knl.get_var_name_generator()(storage_prefix)

    aliased_names = set(names)

    if not synchronize_for_exclusive_use:
        new_insns = knl.instructions
    else:
        new_insns = []
        for insn in knl.instructions:
            touched = insn.dependency_names() & aliased_names

            if len(touched) > 1:
                raise LoopyError("Instruction {insn} refers to multiple of the "
                        "temporaries being aliased, namely '{temps}'. Cannot alias."
                        .format(
                            insn=insn.id,
                            temps=", ".join(touched)))

            updated_insn = insn
            if touched:
                (temp_name,) = touched
                position = names.index(temp_name)

                own_group = frozenset([group_names[position]])
                # Groups of every *other* temporary sharing the storage.
                rival_groups = (
                        frozenset(group_names[:position])
                        | frozenset(group_names[position+1:]))

                updated_insn = insn.copy(
                        groups=insn.groups | own_group,
                        conflicts_with_groups=(
                            insn.conflicts_with_groups | rival_groups))

            new_insns.append(updated_insn)

    def backed_by_common_storage(tv):
        # Temporaries that are not being aliased pass through untouched.
        if tv.name not in aliased_names:
            return tv

        if tv.base_storage is not None:
            raise LoopyError("temporary variable '{tv}' already has "
                    "a defined storage array -- cannot alias"
                    .format(tv=tv.name))

        return tv.copy(base_storage=base_name)

    new_temporary_variables = {
            tv.name: backed_by_common_storage(tv)
            for tv in six.itervalues(knl.temporary_variables)}

    return knl.copy(
            instructions=new_insns,
            temporary_variables=new_temporary_variables)
示例#29
0
    def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary):
        """
        Determine the hardware-parallel axes that the save slot of
        *temporary* needs in addition to the temporary's own dimensions.

        The result is used for determining the amount of global storage
        needed for saving and restoring the temporary across kernel calls,
        due to hardware parallel inames (the inferred axes get prefixed to
        the dimensions of the temporary).

        For local temporaries, inames that are tagged hw-local do not
        contribute to the global storage shape.

        :arg temporary: the temporary variable; its ``name`` and ``scope``
            attributes are consulted.
        :returns: a tuple ``(hw_sizes, hw_tags)``. *hw_sizes* consists of the
            group sizes followed by the local sizes obtained from
            ``get_grid_sizes_for_insn_ids_as_exprs`` for the instructions
            accessing the temporary, and *hw_tags* is a tuple of the group
            tags followed by the local tags, each sorted by axis. Both are
            empty if no instruction reads or writes the temporary.
        :raises LoopyError: if an iname of an accessing instruction carries a
            parallel tag that is neither a group nor a local tag, or if the
            accessing instructions do not all have the same group/local tags.
        """
        kernel = self.kernel

        readers = kernel.reader_map()[temporary.name]
        writers = kernel.writer_map()[temporary.name]
        accessor_insn_ids = frozenset(readers | writers)

        def _by_axis(tags):
            return sorted(tags, key=lambda t: t.axis)

        # Tags of the first accessing instruction visited; all remaining
        # accessing instructions have to agree with these.
        ref_group_tags = None
        ref_local_tags = None

        for insn_id in accessor_insn_ids:
            insn_group_tags = []
            insn_local_tags = []

            for iname in kernel.id_to_insn[insn_id].within_inames:
                tag = kernel.iname_to_tag.get(iname)

                if tag is not None:
                    from loopy.kernel.data import (GroupIndexTag, LocalIndexTag,
                                                   ParallelTag)

                    if isinstance(tag, GroupIndexTag):
                        insn_group_tags.append(tag)
                    elif isinstance(tag, LocalIndexTag):
                        insn_local_tags.append(tag)
                    elif isinstance(tag, ParallelTag):
                        raise LoopyError("iname '%s' is tagged with '%s' - only "
                                         "group and local tags are supported for "
                                         "auto save/reload of temporaries" %
                                         (iname, tag))

            sorted_group_tags = _by_axis(insn_group_tags)
            sorted_local_tags = _by_axis(insn_local_tags)

            if ref_group_tags is None:
                ref_group_tags = sorted_group_tags
                ref_local_tags = sorted_local_tags
                ref_insn_id = insn_id

            elif (ref_group_tags != sorted_group_tags
                    or ref_local_tags != sorted_local_tags):
                raise LoopyError(
                    "inconsistent parallel tags across instructions that access "
                    "'%s' (specifically, instruction '%s' has tags '%s' but "
                    "instruction '%s' has tags '%s')" %
                    (temporary.name, ref_insn_id,
                     ref_group_tags + ref_local_tags,
                     insn_id, insn_group_tags + insn_local_tags))

        if ref_group_tags is None:
            # The loop above never ran: no instruction accesses the temporary.
            assert ref_local_tags is None
            return (), ()

        group_sizes, local_sizes = (
            kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids))

        if temporary.scope == lp.temp_var_scope.LOCAL:
            # Elide local axes in the save slot for local temporaries.
            ref_local_tags = []
            local_sizes = ()

        # hw_dims are arranged according to the order:
        #    g.0 < g.1 < ... < l.0 < l.1 < ...
        hw_sizes = group_sizes + local_sizes
        return hw_sizes, tuple(ref_group_tags + ref_local_tags)
示例#30
0
def generate_code_v2(kernel):
    """Generate code for *kernel*.

    A kernel in the initial state is preprocessed first, and a kernel
    without a schedule is scheduled first. If caching is enabled, a
    previously generated result for the (preprocessed and scheduled) kernel
    is returned straight from the code generation cache.

    :arg kernel: the kernel to generate code for.
    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the scheduled state once
        preprocessing and scheduling have been attempted.
    :raises ValueError: if one of the kernel's arguments is neither an
        array argument nor a :class:`ValueArg`.
    """

    # {{{ bring the kernel into the scheduled state

    from loopy.kernel import kernel_state

    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # }}}

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # The kernel as of this point serves as the cache key, both for the
        # lookup here and for the store at the very end.
        input_kernel = kernel
        try:
            cached_result = code_gen_cache[input_kernel]
        except KeyError:
            pass
        else:
            logger.debug("%s: code generation cache hit" % kernel.name)
            return cached_result

    # }}}

    from loopy.type_inference import infer_unknown_types
    from loopy.check import pre_codegen_checks

    kernel = infer_unknown_types(kernel, expect_completion=True)
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()

        if not isinstance(arg, (ArrayBase, ValueArg)):
            raise ValueError("argument type not understood: '%s'" % type(arg))

        if isinstance(arg, ArrayBase):
            # An array argument may contribute several entries.
            data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))
        else:
            data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

    # Complex support is switched on as soon as any argument or temporary
    # has a complex-valued dtype. (The list is built in full on purpose, so
    # that every variable's dtype gets queried.)
    allow_complex = any([
        var.dtype.involves_complex()
        for var in (
            kernel.args + list(six.itervalues(kernel.temporary_variables)))])

    # }}}

    # These sets are shared with the code generation state and are handed
    # to the preamble generators further down.
    dtypes_seen = set()
    functions_seen = set()
    atomic_dtypes_seen = set()

    host_program_name = (
            kernel.target.host_program_name_prefix
            + kernel.name
            + kernel.target.host_program_name_suffix)

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=data_info,
            implemented_domain=isl.BasicSet.from_params(kernel.assumptions),
            implemented_predicates=frozenset(),
            seen_dtypes=dtypes_seen,
            seen_functions=functions_seen,
            seen_atomic_dtypes=atomic_dtypes_seen,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=host_program_name,
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    generated = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = generated.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, generated.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    dtypes_seen.update(arg.dtype for arg in kernel.args)
    dtypes_seen.update(
            tv.dtype for tv in six.itervalues(kernel.temporary_variables))

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=dtypes_seen,
            seen_functions=functions_seen,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=atomic_dtypes_seen)

    # Start from a copy so that the kernel's own preambles stay unmodified.
    preambles = kernel.preambles[:]

    all_preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for generate_preambles in all_preamble_generators:
        preambles.extend(generate_preambles(preamble_info))

    codegen_result = generated.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result