Example #1
def rename_argument(kernel, old_name, new_name, existing_ok=False):
    """
    .. versionadded:: 2016.2
    """

    var_name_gen = kernel.get_var_name_generator()

    if old_name not in kernel.arg_dict:
        raise LoopyError("old arg name '%s' does not exist" % old_name)

    does_exist = var_name_gen.is_name_conflicting(new_name)

    if does_exist and not existing_ok:
        raise LoopyError(
            "argument name '%s' conflicts with an existing identifier"
            "--cannot rename" % new_name)

    # {{{ instructions

    from pymbolic import var
    subst_dict = {old_name: var(new_name)}

    from loopy.symbolic import (RuleAwareSubstitutionMapper,
                                SubstitutionRuleMappingContext)
    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, var_name_gen)
    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
                                       make_subst_func(subst_dict),
                                       within=lambda kernel, insn, stack: True)

    kernel = rule_mapping_context.finish_kernel(smap.map_kernel(kernel))

    # }}}

    # {{{ args

    new_args = []
    for arg in kernel.args:
        if arg.name == old_name:
            arg = arg.copy(name=new_name)

        new_args.append(arg)

    # }}}

    # {{{ domain/assumptions

    def rename_arg_in_basic_set(dom):
        dom_var_dict = dom.get_var_dict()
        if old_name in dom_var_dict:
            dt, pos = dom_var_dict[old_name]
            dom = dom.set_dim_name(dt, pos, new_name)

        return dom

    new_domains = []
    for dom in kernel.domains:
        dom = rename_arg_in_basic_set(dom)
        new_domains.append(dom)

    new_assumptions = rename_arg_in_basic_set(kernel.assumptions)

    # }}}

    return kernel.copy(domains=new_domains,
                       args=new_args,
                       assumptions=new_assumptions)
Example #2
File: check.py Project: tj-sun/loopy
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is None:
        group_axes = set()
        local_axes = set()

        i = 0
        loop_end_i = past_end_i = len(kernel.schedule)
    else:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = set(ax for ax, length in enumerate(group_size))
        local_axes = set(ax for ax, length in enumerate(local_size))

        i = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        loop_end_i = past_end_i - 1

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag

    while i < loop_end_i:
        sched_item = kernel.schedule[i]
        if isinstance(sched_item, CallKernel):
            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)

        elif isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            i += 1

            if insn.boostable:
                continue

            group_axes_used = set()
            local_axes_used = set()

            for iname in kernel.insn_inames(insn):
                tag = kernel.iname_to_tag.get(iname)

                if isinstance(tag, LocalIndexTag):
                    local_axes_used.add(tag.axis)
                elif isinstance(tag, GroupIndexTag):
                    group_axes_used.add(tag.axis)
                elif isinstance(tag, AutoLocalIndexTagBase):
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                raise LoopyError("instruction '%s' does not use all group hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in group_axes),
                            ",".join(str(i) for i in group_axes_used)))
            if local_axes != local_axes_used:
                raise LoopyError("instruction '%s' does not use all local hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in local_axes),
                            ",".join(str(i) for i in local_axes_used)))

        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            i += 1
            continue

        else:
            raise TypeError(
                    "schedule item not understood: %s" % type(sched_item).__name__)

    return past_end_i
Example #3
def opencl_function_mangler(kernel, name, arg_dtypes):
    if not isinstance(name, str):
        return None

    # OpenCL has min(), max() for integer types
    if name in ["max", "min"] and len(arg_dtypes) == 2:
        dtype = np.find_common_type(
            [], [dtype.numpy_dtype for dtype in arg_dtypes])

        if dtype.kind == "i":
            result_dtype = NumpyType(dtype)
            return CallMangleInfo(target_name=name,
                                  result_dtypes=(result_dtype, ),
                                  arg_dtypes=2 * (result_dtype, ))

    if name == "pow" and len(arg_dtypes) == 2:
        dtype = np.find_common_type(
            [], [dtype.numpy_dtype for dtype in arg_dtypes])
        if dtype == np.float64:
            name = "powf64"
        elif dtype == np.float32:
            name = "powf32"
        else:
            raise LoopyTypeError(f"'pow' does not support type {dtype}.")

        result_dtype = NumpyType(dtype)
        return CallMangleInfo(target_name=name,
                              result_dtypes=(result_dtype, ),
                              arg_dtypes=2 * (result_dtype, ))

    if name == "dot":
        scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields[
            "s0"]
        return CallMangleInfo(target_name=name,
                              result_dtypes=(NumpyType(scalar_dtype), ),
                              arg_dtypes=(arg_dtypes[0], ) * 2)

    if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
        num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
        if len(arg_dtypes) != num_args:
            raise LoopyError("%s takes %d arguments (%d received)" %
                             (name, num_args, len(arg_dtypes)))

        dtype = np.find_common_type(
            [], [dtype.numpy_dtype for dtype in arg_dtypes])

        if dtype.kind == "c":
            raise LoopyError("%s does not support complex numbers" % name)

        result_dtype = NumpyType(dtype)
        return CallMangleInfo(target_name=name,
                              result_dtypes=(result_dtype, ),
                              arg_dtypes=(result_dtype, ) * num_args)

    if name in VECTOR_LITERAL_FUNCS:
        base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]

        if count != len(arg_dtypes):
            return None

        return CallMangleInfo(target_name="(%s%d) " % (base_tp_name, count),
                              result_dtypes=(kernel.target.vector_dtype(
                                  NumpyType(dtype), count), ),
                              arg_dtypes=(NumpyType(dtype), ) * count)

    return None
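
A user-defined mangler can follow the same pattern: inspect the function name and argument dtypes, then either return a CallMangleInfo or None to decline so other manglers get a chance. Below is a minimal, hypothetical sketch targeting OpenCL's two-argument hypot(); the function itself and the import paths are assumptions based on the loopy version shown here, and registering it with a kernel is not shown.

import numpy as np
from loopy.kernel.data import CallMangleInfo
from loopy.types import NumpyType

def my_hypot_mangler(kernel, name, arg_dtypes):
    # Decline anything we do not recognize so other manglers can handle it.
    if name != "hypot" or len(arg_dtypes) != 2:
        return None

    # Promote the argument dtypes to a common type, mirroring the
    # np.find_common_type() calls in the mangler above.
    dtype = np.find_common_type([], [dt.numpy_dtype for dt in arg_dtypes])
    if dtype.kind != "f":
        return None

    result_dtype = NumpyType(dtype)
    return CallMangleInfo(target_name="hypot",
                          result_dtypes=(result_dtype, ),
                          arg_dtypes=(result_dtype, ) * 2)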
Example #4
def generate_arg_setup(gen, kernel, implemented_data_info, options):
    import loopy as lp

    from loopy.kernel.data import KernelArgument
    from loopy.kernel.array import ArrayBase
    from loopy.symbolic import StringifyMapper
    from pymbolic import var

    gen("# {{{ set up array arguments")
    gen("")

    if not options.no_numpy:
        gen("_lpy_encountered_numpy = False")
        gen("_lpy_encountered_dev = False")
        gen("")

    args = []

    strify = StringifyMapper()

    expect_no_more_arguments = False

    for arg_idx, arg in enumerate(implemented_data_info):
        is_written = arg.base_name in kernel.get_written_variables()
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if not issubclass(arg.arg_class, KernelArgument):
            expect_no_more_arguments = True
            continue

        if expect_no_more_arguments:
            raise LoopyError("Further arguments encountered after arg info "
                             "describing a global temporary variable")

        if not issubclass(arg.arg_class, ArrayBase):
            args.append(arg.name)
            continue

        gen("# {{{ process %s" % arg.name)
        gen("")

        if not options.no_numpy:
            gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
            with Indentation(gen):
                gen("# synchronous, nothing to worry about")
                gen("%s = _lpy_cl_array.to_device("
                    "queue, %s, allocator=allocator)" % (arg.name, arg.name))
                gen("_lpy_encountered_numpy = True")
            gen("elif %s is not None:" % arg.name)
            with Indentation(gen):
                gen("_lpy_encountered_dev = True")

            gen("")

        if not options.skip_arg_checks and not is_written:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"input argument '%s' must "
                    "be supplied\")" % arg.name)
                gen("")

        if (is_written and arg.arg_class is lp.ImageArg
                and not options.skip_arg_checks):
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written image '%s' must "
                    "be supplied\")" % arg.name)
                gen("")

        if is_written and arg.shape is None and not options.skip_arg_checks:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written argument '%s' has "
                    "unknown shape and must be supplied\")" % arg.name)
                gen("")

        possibly_made_by_loopy = False

        # {{{ allocate written arrays, if needed

        if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
                and arg.shape is not None \
                and all(si is not None for si in arg.shape):

            if not isinstance(arg.dtype, NumpyType):
                raise LoopyError("do not know how to pass arg of type '%s'" %
                                 arg.dtype)

            possibly_made_by_loopy = True
            gen("_lpy_made_by_loopy = False")
            gen("")

            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                num_axes = len(arg.strides)
                for i in range(num_axes):
                    gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))

                itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                for i in range(num_axes):
                    gen("_lpy_strides_%d = %s" %
                        (i, strify(itemsize * arg.unvec_strides[i])))

                if not options.skip_arg_checks:
                    for i in range(num_axes):
                        gen("assert _lpy_strides_%d > 0, "
                            "\"'%s' has negative stride in axis %d\"" %
                            (i, arg.name, i))

                sym_strides = tuple(
                    var("_lpy_strides_%d" % i) for i in range(num_axes))
                sym_shape = tuple(
                    var("_lpy_shape_%d" % i) for i in range(num_axes))

                alloc_size_expr = (
                    sum(astrd * (alen - 1)
                        for alen, astrd in zip(sym_shape, sym_strides)) +
                    itemsize)

                gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
                gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
                    "%(dtype)s, strides=%(strides)s, "
                    "data=allocator(_lpy_alloc_size), allocator=allocator)" %
                    dict(name=arg.name,
                         shape=strify(sym_shape),
                         strides=strify(sym_strides),
                         dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype)))

                if not options.skip_arg_checks:
                    for i in range(num_axes):
                        gen("del _lpy_shape_%d" % i)
                        gen("del _lpy_strides_%d" % i)
                    gen("del _lpy_alloc_size")
                    gen("")

                gen("_lpy_made_by_loopy = True")
                gen("")

        # }}}

        # {{{ argument checking

        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
                and not options.skip_arg_checks:
            if possibly_made_by_loopy:
                gen("if not _lpy_made_by_loopy:")
            else:
                gen("if True:")

            with Indentation(gen):
                gen("if %s.dtype != %s:" %
                    (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype)))
                with Indentation(gen):
                    gen("raise TypeError(\"dtype mismatch on argument '%s' "
                        "(got: %%s, expected: %s)\" %% %s.dtype)" %
                        (arg.name, arg.dtype, arg.name))

                # {{{ generate shape checking code

                def strify_allowing_none(shape_axis):
                    if shape_axis is None:
                        return "None"
                    else:
                        return strify(shape_axis)

                def strify_tuple(t):
                    if len(t) == 0:
                        return "()"
                    else:
                        return "(%s,)" % ", ".join(
                            strify_allowing_none(sa) for sa in t)

                shape_mismatch_msg = (
                    "raise TypeError(\"shape mismatch on argument '%s' "
                    "(got: %%s, expected: %%s)\" "
                    "%% (%s.shape, %s))" %
                    (arg.name, arg.name, strify_tuple(arg.unvec_shape)))

                if kernel_arg.shape is None:
                    pass

                elif any(shape_axis is None
                         for shape_axis in kernel_arg.shape):
                    gen("if len(%s.shape) != %s:" %
                        (arg.name, len(arg.unvec_shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                    for i, shape_axis in enumerate(arg.unvec_shape):
                        if shape_axis is None:
                            continue

                        gen("if %s.shape[%d] != %s:" %
                            (arg.name, i, strify(shape_axis)))
                        with Indentation(gen):
                            gen(shape_mismatch_msg)

                else:  # not None, no Nones in tuple
                    gen("if %s.shape != %s:" %
                        (arg.name, strify(arg.unvec_shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                # }}}

                if arg.unvec_strides and kernel_arg.dim_tags:
                    itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                    sym_strides = tuple(itemsize * s_i
                                        for s_i in arg.unvec_strides)
                    gen("if %s.strides != %s:" %
                        (arg.name, strify(sym_strides)))
                    with Indentation(gen):
                        gen("raise TypeError(\"strides mismatch on "
                            "argument '%s' (got: %%s, expected: %%s)\" "
                            "%% (%s.strides, %s))" %
                            (arg.name, arg.name, strify(sym_strides)))

                if not arg.allows_offset:
                    gen("if %s.offset:" % arg.name)
                    with Indentation(gen):
                        gen("raise ValueError(\"Argument '%s' does not "
                            "allow arrays with offsets. Try passing "
                            "default_offset=loopy.auto to make_kernel()."
                            "\")" % arg.name)
                        gen("")

        # }}}

        if possibly_made_by_loopy and not options.skip_arg_checks:
            gen("del _lpy_made_by_loopy")
            gen("")

        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
            args.append("%s.base_data" % arg.name)
        else:
            args.append("%s" % arg.name)

        gen("")

        gen("# }}}")
        gen("")

    gen("# }}}")
    gen("")

    return args
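
As a quick sanity check on the allocation-size expression above (the sum of stride times extent-minus-one over all axes, plus one itemsize), here is the same arithmetic for a hypothetical contiguous float64 array of shape (4, 8):

itemsize = 8                         # float64
shape = (4, 8)
strides = (8 * itemsize, itemsize)   # row-major, in bytes
alloc_size = sum(s * (n - 1) for n, s in zip(shape, strides)) + itemsize
assert alloc_size == 4 * 8 * itemsize  # 256 bytes, i.e. the full dense array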
Example #5
File: check.py Project: tj-sun/loopy
def check_loop_priority_inames_known(kernel):
    for prio in kernel.loop_priority:
        for iname in prio:
            if iname not in kernel.all_inames():
                raise LoopyError("unknown iname '%s' in loop priorities" % iname)
Example #6
File: instruction.py Project: tj-sun/loopy
    def __init__(self,
            assignees, expression,
            id=None,
            depends_on=None,
            depends_on_is_final=None,
            groups=None,
            conflicts_with_groups=None,
            no_sync_with=None,
            within_inames_is_final=None,
            within_inames=None,
            boostable=None, boostable_into=None, tags=None,
            temp_var_types=None,
            priority=0, predicates=frozenset(),
            insn_deps=None, insn_deps_is_final=None,
            forced_iname_deps=None,
            forced_iname_deps_is_final=None):

        super(CallInstruction, self).__init__(
                id=id,
                depends_on=depends_on,
                depends_on_is_final=depends_on_is_final,
                groups=groups,
                conflicts_with_groups=conflicts_with_groups,
                no_sync_with=no_sync_with,
                within_inames_is_final=within_inames_is_final,
                within_inames=within_inames,
                boostable=boostable,
                boostable_into=boostable_into,
                priority=priority,
                predicates=predicates,
                tags=tags,
                insn_deps=insn_deps,
                insn_deps_is_final=insn_deps_is_final,
                forced_iname_deps=forced_iname_deps,
                forced_iname_deps_is_final=forced_iname_deps_is_final)

        from pymbolic.primitives import Call
        from loopy.symbolic import Reduction
        if not isinstance(expression, (Call, Reduction)) and expression is not None:
            raise LoopyError("'expression' argument to CallInstruction "
                    "must be a function call")

        from loopy.symbolic import parse
        if isinstance(assignees, str):
            assignees = parse(assignees)
        if not isinstance(assignees, tuple):
            raise LoopyError("'assignees' argument to CallInstruction "
                    "must be a tuple or a string parseable to a tuple"
                    "--got '%s'" % type(assignees).__name__)

        if isinstance(expression, str):
            expression = parse(expression)

        from pymbolic.primitives import Variable, Subscript
        from loopy.symbolic import LinearSubscript
        for assignee in assignees:
            if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
                raise LoopyError("invalid lvalue '%s'" % assignee)

        self.assignees = assignees
        self.expression = expression

        if temp_var_types is None:
            self.temp_var_types = (None,) * len(self.assignees)
        else:
            self.temp_var_types = temp_var_types
Example #7
File: fusion.py Project: shigh/loopy
def _merge_values(item_name, val_a, val_b):
    if val_a != val_b:
        raise LoopyError("inconsistent %ss in merge: %s and %s" %
                         (item_name, val_a, val_b))

    return val_a
Example #8
File: expression.py Project: mmmika/loopy
    def map_subscript(self, expr, type_context):
        def base_impl(expr, type_context):
            return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')]

        def make_var(name):
            from loopy import TaggedVariable
            if isinstance(expr.aggregate, TaggedVariable):
                return TaggedVariable(name, expr.aggregate.tag)
            else:
                return var(name)

        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            return base_impl(expr, type_context)

        ary = self.find_array(expr)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

        access_info = get_access_info(self.kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import (
                ImageArg, ArrayArg, TemporaryVariable, ConstantArg)

        if isinstance(ary, ImageArg):
            extra_axes = 0

            num_target_axes = ary.num_target_axes()
            if num_target_axes in [1, 2]:
                idx_vec_type = "float2"
                extra_axes = 2-num_target_axes
            elif num_target_axes == 3:
                idx_vec_type = "float4"
                extra_axes = 4-num_target_axes
            else:
                raise LoopyError("unsupported number (%d) of target axes in image"
                        % num_target_axes)

            idx_tuple = expr.index_tuple[::-1] + (0,) * extra_axes

            base_access = var("read_imagef")(
                    var(ary.name),
                    var("loopy_sampler"),
                    var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i')))

            if ary.dtype.numpy_dtype == np.float32:
                return base_access.attr("x")
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            elif ary.dtype.numpy_dtype == np.float64:
                return var("as_double")(base_access.attr("xy"))
            else:
                raise NotImplementedError(
                        "non-floating-point images not supported for now")

        elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)):
            if len(access_info.subscripts) == 0:
                if (
                        (isinstance(ary, (ConstantArg, ArrayArg)) or
                         (isinstance(ary, TemporaryVariable) and ary.base_storage))):
                    # unsubscripted global args are pointers
                    result = make_var(access_info.array_name)[0]

                else:
                    # unsubscripted temp vars are scalars
                    # (unless they use base_storage)
                    result = make_var(access_info.array_name)

            else:
                subscript, = access_info.subscripts
                result = make_var(access_info.array_name)[simplify_using_aff(
                    self.kernel, self.rec(subscript, 'i'))]

            if access_info.vector_index is not None:
                return self.codegen_state.ast_builder.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

        else:
            assert False
Example #9
File: expression.py Project: mmmika/loopy
    def map_call(self, expr, type_context):
        from pymbolic.primitives import Variable, Subscript

        identifier = expr.function

        # {{{ implement indexof, indexof_vec

        if identifier.name in ["indexof", "indexof_vec"]:
            if len(expr.parameters) != 1:
                raise LoopyError("%s takes exactly one argument" % identifier.name)
            arg, = expr.parameters
            if not isinstance(arg, Subscript):
                raise LoopyError(
                        "argument to %s must be a subscript" % identifier.name)

            ary = self.find_array(arg)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate
            access_info = get_access_info(self.kernel.target, ary, arg.index,
                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                    self.codegen_state.vectorization_info)

            from loopy.kernel.data import ImageArg
            if isinstance(ary, ImageArg):
                raise LoopyError("%s does not support images" % identifier.name)

            if identifier.name == "indexof":
                return access_info.subscripts[0]
            elif identifier.name == "indexof_vec":
                from loopy.kernel.array import VectorArrayDimTag
                ivec = None
                for iaxis, dim_tag in enumerate(ary.dim_tags):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        ivec = iaxis

                if ivec is None:
                    return access_info.subscripts[0]
                else:
                    return (
                        access_info.subscripts[0]*ary.shape[ivec]
                        + access_info.vector_index)

            else:
                raise RuntimeError("should not get here")

        # }}}

        if isinstance(identifier, Variable):
            identifier = identifier.name

        par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

        processed_parameters = None

        mangle_result = self.kernel.mangle_function(
                identifier, par_dtypes,
                ast_builder=self.codegen_state.ast_builder)

        if mangle_result is None:
            raise RuntimeError("function '%s' unknown--"
                    "maybe you need to register a function mangler?"
                    % identifier)

        if len(mangle_result.result_dtypes) != 1:
            raise LoopyError("functions with more or fewer than one return value "
                    "may not be used in an expression")

        if mangle_result.arg_dtypes is not None:
            processed_parameters = tuple(
                    self.rec(par,
                        dtype_to_type_context(self.kernel.target, tgt_dtype),
                        tgt_dtype)
                    for par, par_dtype, tgt_dtype in zip(
                        expr.parameters, par_dtypes, mangle_result.arg_dtypes))

        else:
            # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
            # propagate the type context here. But for many others, it does
            # not. Using the inferred type as a stopgap for now.
            processed_parameters = tuple(
                    self.rec(par,
                        type_context=dtype_to_type_context(
                            self.kernel.target, par_dtype))
                    for par, par_dtype in zip(expr.parameters, par_dtypes))

            from warnings import warn
            warn("Calling function '%s' with unknown C signature--"
                    "return CallMangleInfo.arg_dtypes"
                    % identifier, LoopyWarning)

        from loopy.codegen import SeenFunction
        self.codegen_state.seen_functions.add(
                SeenFunction(identifier,
                    mangle_result.target_name,
                    mangle_result.arg_dtypes or par_dtypes))

        return var(mangle_result.target_name)(*processed_parameters)
Example #10
    def assign_axis(recursion_axis, iname, axis=None):
        """Assign iname to local axis *axis* and start over by calling
        the surrounding function assign_automatic_axes.

        If *axis* is None, find a suitable axis automatically.
        """
        try:
            with isl.SuppressedWarnings(kernel.isl_context):
                desired_length = kernel.get_constant_iname_length(iname)
        except isl.Error:
            # Likely unbounded, automatic assignment is not
            # going to happen for this iname.
            new_iname_to_tag = kernel.iname_to_tag.copy()
            new_iname_to_tag[iname] = None
            return assign_automatic_axes(
                kernel.copy(iname_to_tag=new_iname_to_tag),
                axis=recursion_axis)

        if axis is None:
            # {{{ find a suitable axis

            shorter_possible_axes = []
            test_axis = 0
            while True:
                if test_axis >= len(local_size):
                    break
                if test_axis in assigned_local_axes:
                    test_axis += 1
                    continue

                if local_size[test_axis] < desired_length:
                    shorter_possible_axes.append(test_axis)
                    test_axis += 1
                    continue
                else:
                    axis = test_axis
                    break

            # The loop above will find an unassigned local axis
            # that has enough 'room' for the iname. In the same traversal,
            # it also finds theoretically assignable axes that are shorter,
            # in the variable shorter_possible_axes.

            if axis is None and shorter_possible_axes:
                # sort as longest first
                shorter_possible_axes.sort(key=lambda ax: -local_size[ax])
                axis = shorter_possible_axes[0]

            # }}}

        if axis is None:
            new_tag = None
        else:
            new_tag = LocalIndexTag(axis)
            if desired_length > local_size[axis]:
                from loopy import split_iname

                # Don't be tempted to switch the outer tag to unroll--this may
                # generate tons of code on some examples.

                return assign_automatic_axes(split_iname(
                    kernel,
                    iname,
                    inner_length=local_size[axis],
                    outer_tag=None,
                    inner_tag=new_tag,
                    do_tagged_check=False),
                                             axis=recursion_axis,
                                             local_size=local_size)

        if not isinstance(kernel.iname_to_tag.get(iname),
                          AutoLocalIndexTagBase):
            raise LoopyError("trying to reassign '%s'" % iname)

        new_iname_to_tag = kernel.iname_to_tag.copy()
        new_iname_to_tag[iname] = new_tag
        return assign_automatic_axes(
            kernel.copy(iname_to_tag=new_iname_to_tag),
            axis=recursion_axis,
            local_size=local_size)
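
The axis-search loop above picks the first unassigned local axis with enough room for the iname; if none is long enough, it falls back to the longest of the free-but-shorter axes. A standalone sketch of that heuristic with made-up workgroup sizes:

def pick_local_axis(local_size, assigned_local_axes, desired_length):
    # First free axis that is long enough wins; otherwise fall back to the
    # longest of the free-but-too-short axes, as in the loop above.
    shorter_possible_axes = []
    for test_axis in range(len(local_size)):
        if test_axis in assigned_local_axes:
            continue
        if local_size[test_axis] >= desired_length:
            return test_axis
        shorter_possible_axes.append(test_axis)

    if shorter_possible_axes:
        return max(shorter_possible_axes, key=lambda ax: local_size[ax])
    return None

# hypothetical workgroup shape (16, 8, 4) with axis 0 already taken
print(pick_local_axis((16, 8, 4), {0}, desired_length=6))     # -> 1
print(pick_local_axis((16, 8, 4), {0, 1}, desired_length=6))  # -> 2 (fallback)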
Example #11
File: statistics.py Project: dokempf/loopy
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
            :class:`islpy.PwQPolynomial` holding the number of such events
            per thread.

            Possible keys include ``barrier_local``, ``barrier_global``
            (if supported by the target) and ``kernel_launch``.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_map = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_map['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel,
                                ReturnFromKernel, RunInstruction)
    from operator import mul
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        if iname_list:  # (if iname_list is not empty)
            ct = (count(knl,
                        (knl.get_inames_domain(iname_list).project_out_except(
                            iname_list, [dim_type.set]))), )
            return reduce(mul, ct)
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s" %
                             type(sched_item).__name__)

    return result.dict
Example #12
def get_auto_axis_iname_ranking_by_stride(kernel, insn):
    from loopy.kernel.data import ImageArg, ValueArg

    approximate_arg_values = {}
    for arg in kernel.args:
        if isinstance(arg, ValueArg):
            if arg.approximately is not None:
                approximate_arg_values[arg.name] = arg.approximately
            else:
                raise LoopyError(
                    "No approximate arg value specified for '%s'" % arg.name)

    # {{{ find all array accesses in insn

    from loopy.symbolic import ArrayAccessFinder
    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))

    from pymbolic.primitives import Subscript

    for assignee in insn.assignees:
        if isinstance(assignee, Subscript):
            ary_acc_exprs.append(assignee)

    # }}}

    # {{{ filter array accesses to only the global ones

    global_ary_acc_exprs = []

    for aae in ary_acc_exprs:
        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)
        if arg is None:
            continue

        if isinstance(arg, ImageArg):
            continue

        global_ary_acc_exprs.append(aae)

    # }}}

    # {{{ figure out automatic-axis inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    auto_axis_inames = set(
        iname for iname in kernel.insn_inames(insn)
        if isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase))

    # }}}

    # {{{ figure out which iname should get mapped to local axis 0

    # maps inames to "aggregate stride"
    aggregate_strides = {}

    from loopy.symbolic import CoefficientCollector
    from pymbolic.primitives import Variable

    for aae in global_ary_acc_exprs:
        index_expr = aae.index
        if not isinstance(index_expr, tuple):
            index_expr = (index_expr, )

        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)

        if arg.dim_tags is None:
            from warnings import warn
            warn("Strides for '%s' are not known. Local axis assignment "
                 "is likely suboptimal." % arg.name)
            ary_strides = [1] * len(index_expr)
        else:
            ary_strides = []
            from loopy.kernel.array import FixedStrideArrayDimTag
            for dim_tag in arg.dim_tags:
                if isinstance(dim_tag, FixedStrideArrayDimTag):
                    ary_strides.append(dim_tag.stride)

        # {{{ construct iname_to_stride_expr

        iname_to_stride_expr = {}
        for iexpr_i, stride in zip(index_expr, ary_strides):
            if stride is None:
                continue
            coeffs = CoefficientCollector()(iexpr_i)
            for var, coeff in six.iteritems(coeffs):
                if (isinstance(var, Variable)
                        and var.name in auto_axis_inames):
                    # excludes '1', i.e.  the constant
                    new_stride = coeff * stride
                    old_stride = iname_to_stride_expr.get(var.name, None)
                    if old_stride is None or new_stride < old_stride:
                        iname_to_stride_expr[var.name] = new_stride

        # }}}

        from pymbolic import evaluate
        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
            stride = evaluate(stride_expr, approximate_arg_values)
            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride

    if aggregate_strides:
        very_large_stride = int(np.iinfo(np.int32).max)

        return sorted((iname for iname in kernel.insn_inames(insn)),
                      key=lambda iname:
                      (aggregate_strides.get(iname, very_large_stride), iname))
    else:
        return None
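
The ranking above aggregates, per iname, the smallest stride it contributes in each global array access, then sorts the instruction's inames by that total so the unit-stride iname ends up first (and thus becomes the candidate for local axis 0). A toy version of the final sort with made-up inames and strides:

# For an access like a[16*i + j], CoefficientCollector gives i a stride of 16
# and j a stride of 1, so after aggregation j is ranked first.
aggregate_strides = {"i": 16, "j": 1}
inames = ["i", "j", "k"]   # 'k' has no array access -> treated as very large
very_large_stride = 10**9
ranking = sorted(inames,
                 key=lambda iname: (aggregate_strides.get(iname,
                                                          very_large_stride),
                                    iname))
print(ranking)  # ['j', 'i', 'k']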
Example #13
def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
    """Return a string in the `dot <http://graphviz.org/>`_ language depicting
    dependencies among kernel instructions.
    """

    # make sure all automatically added stuff shows up
    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
    kernel = apply_single_writer_depencency_heuristic(kernel,
                                                      warn_if_used=False)

    if iname_cluster and not kernel.schedule:
        try:
            from loopy.schedule import get_one_scheduled_kernel
            kernel = get_one_scheduled_kernel(kernel)
        except RuntimeError as e:
            iname_cluster = False
            from warnings import warn
            warn("error encountered during scheduling for dep graph -- "
                 "cannot perform iname clustering: %s(%s)" %
                 (type(e).__name__, e))

    dep_graph = {}
    lines = []

    from loopy.kernel.data import MultiAssignmentBase, CInstruction

    for insn in kernel.instructions:
        if isinstance(insn, MultiAssignmentBase):
            op = "%s <- %s" % (insn.assignees, insn.expression)
            if len(op) > 200:
                op = op[:200] + "..."

        elif isinstance(insn, CInstruction):
            op = "<C instruction %s>" % insn.id
        else:
            op = "<instruction %s>" % insn.id

        if use_insn_id:
            insn_label = insn.id
            tooltip = op
        else:
            insn_label = op
            tooltip = insn.id

        lines.append("\"%s\" [label=\"%s\",shape=\"box\",tooltip=\"%s\"];" % (
            insn.id,
            repr(insn_label)[1:-1],
            repr(tooltip)[1:-1],
        ))
        for dep in insn.depends_on:
            dep_graph.setdefault(insn.id, set()).add(dep)

    # {{{ O(n^3) transitive reduction

    # first, compute transitive closure by fixed point iteration
    while True:
        changed_something = False

        for insn_1 in dep_graph:
            for insn_2 in dep_graph.get(insn_1, set()).copy():
                for insn_3 in dep_graph.get(insn_2, set()).copy():
                    if insn_3 not in dep_graph.get(insn_1, set()):
                        changed_something = True
                        dep_graph[insn_1].add(insn_3)

        if not changed_something:
            break

    for insn_1 in dep_graph:
        for insn_2 in dep_graph.get(insn_1, set()).copy():
            for insn_3 in dep_graph.get(insn_2, set()).copy():
                if insn_3 in dep_graph.get(insn_1, set()):
                    dep_graph[insn_1].remove(insn_3)

    # }}}

    for insn_1 in dep_graph:
        for insn_2 in dep_graph.get(insn_1, set()):
            lines.append("%s -> %s" % (insn_2, insn_1))

    if iname_cluster:
        from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction,
                                    Barrier, CallKernel, ReturnFromKernel)

        for sched_item in kernel.schedule:
            if isinstance(sched_item, EnterLoop):
                lines.append("subgraph cluster_%s { label=\"%s\"" %
                             (sched_item.iname, sched_item.iname))
            elif isinstance(sched_item, LeaveLoop):
                lines.append("}")
            elif isinstance(sched_item, RunInstruction):
                lines.append(sched_item.insn_id)
            elif isinstance(sched_item,
                            (CallKernel, ReturnFromKernel, Barrier)):
                pass
            else:
                raise LoopyError("schedule item not unterstood: %r" %
                                 sched_item)

    return "digraph %s {\n%s\n}" % (kernel.name, "\n".join(lines))
Example #14
def guess_var_shape(kernel, var_name):
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    armap = AccessRangeMapper(kernel, var_name)

    submap = SubstitutionRuleExpander(kernel.substitutions)

    def run_through_armap(expr):
        armap(submap(expr), kernel.insn_inames(insn))
        return expr

    try:
        for insn in kernel.instructions:
            insn.with_transformed_expressions(run_through_armap)
    except TypeError as e:
        from traceback import print_exc
        print_exc()

        raise LoopyError(
            "Failed to (automatically, as requested) find "
            "shape/strides for variable '%s'. "
            "Specifying the shape manually should get rid of this. "
            "The following error occurred: %s" % (var_name, str(e)))

    if armap.access_range is None:
        if armap.bad_subscripts:
            from loopy.symbolic import LinearSubscript
            if any(
                    isinstance(sub, LinearSubscript)
                    for sub in armap.bad_subscripts):
                raise LoopyError(
                    "cannot determine access range for '%s': "
                    "linear subscript(s) in '%s'" %
                    (var_name, ", ".join(str(i)
                                         for i in armap.bad_subscripts)))

            n_axes_in_subscripts = set(
                len(sub.index_tuple) for sub in armap.bad_subscripts)

            if len(n_axes_in_subscripts) != 1:
                raise RuntimeError("subscripts of '%s' with differing "
                                   "numbers of axes were found" % var_name)

            n_axes, = n_axes_in_subscripts

            if n_axes == 1:
                # Leave shape undetermined--we can live with that for 1D.
                shape = (None, )
            else:
                raise LoopyError(
                    "cannot determine access range for '%s': "
                    "undetermined index in subscript(s) '%s'" %
                    (var_name, ", ".join(str(i)
                                         for i in armap.bad_subscripts)))

        else:
            # no subscripts found, let's call it a scalar
            shape = ()
    else:
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        shape = []
        for i in range(armap.access_range.dim(dim_type.set)):
            try:
                shape.append(
                    pw_aff_to_expr(
                        static_max_of_pw_aff(kernel.cache_manager.dim_max(
                            armap.access_range, i) + 1,
                                             constants_only=False)))
            except Exception:
                print("While trying to find shape axis %d of "
                      "variable '%s', the following "
                      "exception occurred:" % (i, var_name),
                      file=sys.stderr)
                print("*** ADVICE: You may need to manually specify the "
                      "shape of argument '%s'." % (var_name),
                      file=sys.stderr)
                raise

        shape = tuple(shape)

    return shape
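
For each axis of the computed access range, the guessed shape entry is the largest accessed index plus one. A minimal islpy sketch of that last step (assuming islpy is available; the access range itself is made up):

import islpy as isl

# Hypothetical access range for a variable accessed as a[i, j] with
# 0 <= i < 8 and 0 <= j < 16; the guessed shape per axis is (max index) + 1.
access_range = isl.Set("{ [i, j] : 0 <= i < 8 and 0 <= j < 16 }")
for axis in range(access_range.dim(isl.dim_type.set)):
    print(access_range.dim_max(axis))  # piecewise max index along this axis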
Example #15
def infer_unknown_types(kernel, expect_completion=False):
    """Infer types on temporaries and arguments."""

    logger.debug("%s: infer types" % kernel.name)

    from functools import partial
    debug = partial(_debug, kernel)

    import time
    start_time = time.time()

    unexpanded_kernel = kernel
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    new_temp_vars = kernel.temporary_variables.copy()
    new_arg_dict = kernel.arg_dict.copy()

    # {{{ find names_with_unknown_types

    # contains both arguments and temporaries
    names_for_type_inference = []

    import loopy as lp
    for tv in six.itervalues(kernel.temporary_variables):
        if tv.dtype is lp.auto:
            names_for_type_inference.append(tv.name)

    for arg in kernel.args:
        if arg.dtype is None:
            names_for_type_inference.append(arg.name)

    # }}}

    logger.debug("finding types for {count:d} names".format(
        count=len(names_for_type_inference)))

    writer_map = kernel.writer_map()

    dep_graph = dict((
        written_var,
        set(read_var for insn_id in writer_map.get(written_var, [])
            for read_var in kernel.id_to_insn[insn_id].read_dependency_names()
            if read_var in names_for_type_inference))
                     for written_var in names_for_type_inference)

    from loopy.tools import compute_sccs

    # To speed up processing, we sort the variables by computing the SCCs of the
    # type dependency graph. Each SCC represents a set of variables whose types
    # mutually depend on themselves. The SCCs are returned and processed in
    # topological order.
    sccs = compute_sccs(dep_graph)

    item_lookup = _DictUnionView([new_temp_vars, new_arg_dict])
    type_inf_mapper = TypeInferenceMapper(kernel, item_lookup)

    from loopy.symbolic import SubstitutionRuleExpander
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)

    # {{{ work on type inference queue

    from loopy.kernel.data import TemporaryVariable, KernelArgument

    for var_chain in sccs:
        changed_during_last_queue_run = False
        queue = var_chain[:]
        failed_names = set()

        while queue or changed_during_last_queue_run:
            if not queue and changed_during_last_queue_run:
                changed_during_last_queue_run = False
                # Optimization: If there's a single variable in the SCC without
                # a self-referential dependency, then the type is known after a
                # single iteration (we don't need to look at the expressions
                # again).
                if len(var_chain) == 1:
                    single_var, = var_chain
                    if single_var not in dep_graph[single_var]:
                        break
                queue = var_chain[:]

            name = queue.pop(0)
            item = item_lookup[name]

            debug("inferring type for %s %s", type(item).__name__, item.name)

            result, symbols_with_unavailable_types = (_infer_var_type(
                kernel, item.name, type_inf_mapper, subst_expander))

            failed = not result
            if not failed:
                new_dtype, = result
                debug("     success: %s", new_dtype)
                if new_dtype != item.dtype:
                    debug("     changed from: %s", item.dtype)
                    changed_during_last_queue_run = True

                    if isinstance(item, TemporaryVariable):
                        new_temp_vars[name] = item.copy(dtype=new_dtype)
                    elif isinstance(item, KernelArgument):
                        new_arg_dict[name] = item.copy(dtype=new_dtype)
                    else:
                        raise LoopyError(
                            "unexpected item type in type inference")
            else:
                debug("     failure")

            if failed:
                if item.name in failed_names:
                    # this item has failed before, give up.
                    advice = ""
                    if symbols_with_unavailable_types:
                        advice += (
                            " (need type of '%s'--check for missing arguments)"
                            % ", ".join(symbols_with_unavailable_types))

                    if expect_completion:
                        raise LoopyError("could not determine type of '%s'%s" %
                                         (item.name, advice))

                    else:
                        # We're done here.
                        break

                # remember that this item failed
                failed_names.add(item.name)

                if set(queue) == failed_names:
                    # We did what we could...
                    print(queue, failed_names, item.name)
                    assert not expect_completion
                    break

                # can't infer type yet, put back into queue
                queue.append(name)
            else:
                # we've made progress, reset failure markers
                failed_names = set()

    # }}}

    end_time = time.time()
    logger.debug("type inference took {dur:.2f} seconds".format(dur=end_time -
                                                                start_time))

    return unexpanded_kernel.copy(
        temporary_variables=new_temp_vars,
        args=[new_arg_dict[arg.name] for arg in kernel.args],
    )
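
The dep_graph built above maps each variable that still needs a type to the set of such variables read by the instructions writing it; strongly connected components of that graph are then inferred together. A toy graph of made-up variable names shows the shape of the data compute_sccs receives:

# 'tmp' is written from reads of 'a' and 'acc', and 'acc' is updated from
# itself and 'tmp', so {'tmp', 'acc'} forms one SCC that must be inferred
# together, while 'a' can be handled first.
dep_graph = {
    "a": set(),
    "tmp": {"a", "acc"},
    "acc": {"acc", "tmp"},
}
# compute_sccs(dep_graph) would then yield the SCCs in topological order,
# e.g. [['a'], ['tmp', 'acc']], and the queue loop above keeps retrying the
# members of a multi-variable SCC until their types stop changing.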
Example #16
File: expression.py Project: mmmika/loopy
    def map_group_hw_index(self, expr, type_context):
        raise LoopyError("plain C does not have group hw axes")
Example #17
File: instruction.py Project: tj-sun/loopy
    def __init__(self, id, depends_on, depends_on_is_final,
            groups, conflicts_with_groups,
            no_sync_with,
            within_inames_is_final, within_inames,
            priority,
            boostable, boostable_into, predicates, tags,
            insn_deps=None, insn_deps_is_final=None,
            forced_iname_deps=None, forced_iname_deps_is_final=None):

        # {{{ backwards compatibility goop

        if depends_on is not None and insn_deps is not None:
            raise LoopyError("may not specify both insn_deps and depends_on")
        elif insn_deps is not None:
            warn("insn_deps is deprecated, use depends_on",
                    DeprecationWarning, stacklevel=2)

            depends_on = insn_deps
            depends_on_is_final = insn_deps_is_final

        if forced_iname_deps is not None and within_inames is not None:
            raise LoopyError("may not specify both forced_iname_deps "
                    "and within_inames")
        elif forced_iname_deps is not None:
            warn("forced_iname_deps is deprecated, use within_inames",
                    DeprecationWarning, stacklevel=2)

            within_inames = forced_iname_deps
            within_inames_is_final = forced_iname_deps_is_final

        if predicates is None:
            predicates = frozenset()

        new_predicates = set()
        for pred in predicates:
            if isinstance(pred, str):
                from pymbolic.primitives import LogicalNot
                from loopy.symbolic import parse
                if pred.startswith("!"):
                    warn("predicates starting with '!' are deprecated. "
                            "Simply use 'not' instead")
                    pred = LogicalNot(parse(pred[1:]))
                else:
                    pred = parse(pred)

            new_predicates.add(pred)

        predicates = frozenset(new_predicates)
        del new_predicates

        # }}}

        if depends_on is None:
            depends_on = frozenset()

        if groups is None:
            groups = frozenset()

        if conflicts_with_groups is None:
            conflicts_with_groups = frozenset()

        if no_sync_with is None:
            no_sync_with = frozenset()

        if within_inames is None:
            within_inames = frozenset()

        if within_inames_is_final is None:
            within_inames_is_final = False

        if isinstance(depends_on, str):
            depends_on = frozenset(
                    s.strip() for s in depends_on.split(",") if s.strip())

        if depends_on_is_final is None:
            depends_on_is_final = False

        if depends_on_is_final and not isinstance(depends_on, frozenset):
            raise LoopyError("Setting depends_on_is_final to True requires "
                    "actually specifying depends_on")

        if tags is None:
            tags = frozenset()

        if priority is None:
            priority = 0

        if not isinstance(tags, frozenset):
            # was previously allowed to be tuple
            tags = frozenset(tags)

        # Periodically reenable these and run the tests to ensure all
        # performance-relevant identifiers are interned.
        #
        # from loopy.tools import is_interned
        # assert is_interned(id)
        # assert all(is_interned(dep) for dep in depends_on)
        # assert all(is_interned(grp) for grp in groups)
        # assert all(is_interned(grp) for grp in conflicts_with_groups)
        # assert all(is_interned(iname) for iname in within_inames)
        # assert all(is_interned(pred) for pred in predicates)

        assert isinstance(within_inames, frozenset)
        assert isinstance(depends_on, frozenset) or depends_on is None
        assert isinstance(groups, frozenset)
        assert isinstance(conflicts_with_groups, frozenset)

        ImmutableRecord.__init__(self,
                id=id,
                depends_on=depends_on,
                depends_on_is_final=depends_on_is_final,
                no_sync_with=no_sync_with,
                groups=groups, conflicts_with_groups=conflicts_with_groups,
                within_inames_is_final=within_inames_is_final,
                within_inames=within_inames,
                priority=priority,
                boostable=boostable,
                boostable_into=boostable_into,
                predicates=predicates,
                tags=tags)
Example #18
File: expression.py Project: mmmika/loopy
    def map_local_hw_index(self, expr, type_context):
        raise LoopyError("plain C does not have local hw axes")
Example #19
    def generate_arg_setup(
            self, gen, kernel, implemented_data_info, options):
        import loopy as lp

        from loopy.kernel.data import KernelArgument
        from loopy.kernel.array import ArrayBase
        from loopy.symbolic import StringifyMapper
        from loopy.types import NumpyType

        gen("# {{{ set up array arguments")
        gen("")

        if not options.no_numpy:
            gen("_lpy_encountered_numpy = False")
            gen("_lpy_encountered_dev = False")
            gen("")

        args = []

        strify = StringifyMapper()

        expect_no_more_arguments = False

        for arg in implemented_data_info:
            is_written = arg.base_name in kernel.get_written_variables()
            kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

            if not issubclass(arg.arg_class, KernelArgument):
                expect_no_more_arguments = True
                continue

            if expect_no_more_arguments:
                raise LoopyError("Further arguments encountered after arg info "
                        "describing a global temporary variable")

            if not issubclass(arg.arg_class, ArrayBase):
                args.append(arg.name)
                continue

            gen("# {{{ process %s" % arg.name)
            gen("")

            if not options.no_numpy:
                self.handle_non_numpy_arg(gen, arg)

            if not options.skip_arg_checks and not is_written:
                gen("if %s is None:" % arg.name)
                with Indentation(gen):
                    gen("raise RuntimeError(\"input argument '%s' must "
                            'be supplied")' % arg.name)
                    gen("")

            if (is_written
                    and arg.arg_class is lp.ImageArg
                    and not options.skip_arg_checks):
                gen("if %s is None:" % arg.name)
                with Indentation(gen):
                    gen("raise RuntimeError(\"written image '%s' must "
                            'be supplied")' % arg.name)
                    gen("")

            if is_written and arg.shape is None and not options.skip_arg_checks:
                gen("if %s is None:" % arg.name)
                with Indentation(gen):
                    gen("raise RuntimeError(\"written argument '%s' has "
                            'unknown shape and must be supplied")' % arg.name)
                    gen("")

            possibly_made_by_loopy = False

            # {{{ allocate written arrays, if needed

            if is_written and arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
                    and arg.shape is not None \
                    and all(si is not None for si in arg.shape):

                if not isinstance(arg.dtype, NumpyType):
                    raise LoopyError("do not know how to pass arg of type '%s'"
                            % arg.dtype)

                possibly_made_by_loopy = True
                gen("_lpy_made_by_loopy = False")
                gen("")

                gen("if %s is None:" % arg.name)
                with Indentation(gen):
                    self.handle_alloc(
                        gen, arg, kernel_arg, strify, options.skip_arg_checks)
                    gen("_lpy_made_by_loopy = True")
                    gen("")

            # }}}

            # {{{ argument checking

            if arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
                    and not options.skip_arg_checks:
                if possibly_made_by_loopy:
                    gen("if not _lpy_made_by_loopy:")
                else:
                    gen("if True:")

                with Indentation(gen):
                    gen("if %s.dtype != %s:"
                            % (arg.name, self.python_dtype_str(
                                gen, kernel_arg.dtype.numpy_dtype)))
                    with Indentation(gen):
                        gen("raise TypeError(\"dtype mismatch on argument '%s' "
                                '(got: %%s, expected: %s)" %% %s.dtype)'
                                % (arg.name, arg.dtype, arg.name))

                    # {{{ generate shape checking code

                    def strify_allowing_none(shape_axis):
                        if shape_axis is None:
                            return "None"
                        else:
                            return strify(shape_axis)

                    def strify_tuple(t):
                        if len(t) == 0:
                            return "()"
                        else:
                            return "(%s,)" % ", ".join(
                                    strify_allowing_none(sa)
                                    for sa in t)

                    shape_mismatch_msg = (
                            "raise TypeError(\"shape mismatch on argument '%s' "
                            '(got: %%s, expected: %%s)" '
                            "%% (%s.shape, %s))"
                            % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))

                    if kernel_arg.shape is None:
                        pass

                    elif any(shape_axis is None for shape_axis in kernel_arg.shape):
                        gen("if len(%s.shape) != %s:"
                                % (arg.name, len(arg.unvec_shape)))
                        with Indentation(gen):
                            gen(shape_mismatch_msg)

                        for i, shape_axis in enumerate(arg.unvec_shape):
                            if shape_axis is None:
                                continue

                            gen("if %s.shape[%d] != %s:"
                                    % (arg.name, i, strify(shape_axis)))
                            with Indentation(gen):
                                gen(shape_mismatch_msg)

                    else:  # not None, no Nones in tuple
                        gen("if %s.shape != %s:"
                                % (arg.name, strify(arg.unvec_shape)))
                        with Indentation(gen):
                            gen(shape_mismatch_msg)

                    # }}}

                    if arg.unvec_strides and kernel_arg.dim_tags:
                        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                        sym_strides = tuple(
                                itemsize*s_i for s_i in arg.unvec_strides)

                        ndim = len(arg.unvec_shape)
                        shape = ["_lpy_shape_%d" % i for i in range(ndim)]
                        strides = ["_lpy_stride_%d" % i for i in range(ndim)]

                        gen("({},) = {}.shape".format(", ".join(shape), arg.name))
                        gen("({},) = {}.strides".format(
                            ", ".join(strides), arg.name))

                        gen("if not (%s):"
                                % self.get_strides_check_expr(
                                    shape, strides,
                                    (strify(s) for s in sym_strides)))
                        with Indentation(gen):
                            gen("_lpy_got = tuple(stride "
                                    "for (dim, stride) in zip(%s.shape, %s.strides) "
                                    "if dim > 1)"
                                    % (arg.name, arg.name))
                            gen("_lpy_expected = tuple(stride "
                                    "for (dim, stride) in zip(%s.shape, %s) "
                                    "if dim > 1)"
                                    % (arg.name, strify_tuple(sym_strides)))

                            gen('raise TypeError("strides mismatch on '
                                    "argument '%s' "
                                    "(after removing unit length dims, "
                                    'got: %%s, expected: %%s)" '
                                    "%% (_lpy_got, _lpy_expected))"
                                    % arg.name)

                    if not arg.allows_offset:
                        gen("if hasattr({}, 'offset') and {}.offset:".format(
                                arg.name, arg.name))
                        with Indentation(gen):
                            gen("raise ValueError(\"Argument '%s' does not "
                                    "allow arrays with offsets. Try passing "
                                    "default_offset=loopy.auto to make_kernel()."
                                    '")' % arg.name)
                            gen("")

            # }}}

            if possibly_made_by_loopy and not options.skip_arg_checks:
                gen("del _lpy_made_by_loopy")
                gen("")

            if arg.arg_class in [lp.ArrayArg, lp.ConstantArg]:
                args.append(self.get_arg_pass(arg))
            else:
                args.append("%s" % arg.name)

            gen("")

            gen("# }}}")
            gen("")

        gen("# }}}")
        gen("")

        return args
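
For orientation, below is a rough, hand-written approximation of the Python invoker text that a method like the one above emits for a single array argument. The argument name a, its dtype and shape, and the placeholder bindings at the top are assumptions made for illustration, not captured loopy output.

import numpy as np

# Placeholder bindings so the sketch runs on its own; in the real generated
# invoker these names are supplied by the caller instead.
n = 4
a = np.zeros((n, n), dtype=np.float64)

# {{{ process a

if a is None:
    raise RuntimeError("input argument 'a' must be supplied")

_lpy_made_by_loopy = False
if a is None:
    # handle_alloc() would emit allocation code for written arguments here
    _lpy_made_by_loopy = True

if not _lpy_made_by_loopy:
    if a.dtype != np.dtype(np.float64):
        raise TypeError("dtype mismatch on argument 'a' "
                        "(got: %s, expected: float64)" % a.dtype)
    if a.shape != (n, n):
        raise TypeError("shape mismatch on argument 'a' "
                        "(got: %s, expected: %s)" % (a.shape, (n, n)))

del _lpy_made_by_loopy

# }}}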
Example #20
    def emit_multiple_assignment(self, codegen_state, insn):
        ecm = codegen_state.expression_to_code_mapper

        from pymbolic.primitives import Variable
        from pymbolic.mapper.stringifier import PREC_NONE

        func_id = insn.expression.function
        parameters = insn.expression.parameters

        if isinstance(func_id, Variable):
            func_id = func_id.name

        assignee_var_descriptors = [
                codegen_state.kernel.get_var_descriptor(a)
                for a in insn.assignee_var_names()]

        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)

        mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
        if mangle_result is None:
            raise RuntimeError("function '%s' unknown--"
                    "maybe you need to register a function mangler?"
                    % func_id)

        assert mangle_result.arg_dtypes is not None

        if mangle_result.target_name == "loopy_make_tuple":
            # This shortcut avoids actually having to emit a 'make_tuple' function.
            return self.emit_tuple_assignment(codegen_state, insn)

        from loopy.expression import dtype_to_type_context
        c_parameters = [
                ecm(par, PREC_NONE,
                    dtype_to_type_context(self.target, tgt_dtype),
                    tgt_dtype).expr
                for par, par_dtype, tgt_dtype in zip(
                    parameters, par_dtypes, mangle_result.arg_dtypes)]

        from loopy.codegen import SeenFunction
        codegen_state.seen_functions.add(
                SeenFunction(func_id,
                    mangle_result.target_name,
                    mangle_result.arg_dtypes))

        from pymbolic import var
        for i, (a, tgt_dtype) in enumerate(
                zip(insn.assignees[1:], mangle_result.result_dtypes[1:])):
            if tgt_dtype != ecm.infer_type(a):
                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
                        "side of instruction '%s'" % (i+1, insn.id))
            c_parameters.append(
                        # TODO Yuck: The "where-at function": &(...)
                        var("&")(
                            ecm(a, PREC_NONE,
                                dtype_to_type_context(self.target, tgt_dtype),
                                tgt_dtype).expr))

        result = var(mangle_result.target_name)(*c_parameters)

        # In case of no assignees, we are done
        if len(mangle_result.result_dtypes) == 0:
            from cgen import ExpressionStatement
            return ExpressionStatement(
                    CExpression(self.get_c_expression_to_code_mapper(), result))

        result = ecm.wrap_in_typecast(
                mangle_result.result_dtypes[0],
                assignee_var_descriptors[0].dtype,
                result)

        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)

        from cgen import Assign
        return Assign(
                lhs_code,
                CExpression(self.get_c_expression_to_code_mapper(), result))
Example #21
File: fusion.py Project: shigh/loopy
def _fuse_two_kernels(knla, knlb):
    from loopy.kernel import kernel_state
    if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL:
        raise LoopyError("can only fuse kernels in INITIAL state")

    # {{{ fuse domains

    new_domains = knla.domains[:]

    for dom_b in knlb.domains:
        i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains)
        if i_fuse is None:
            new_domains.append(dom_b)
        else:
            dom_a = new_domains[i_fuse]
            dom_a, dom_b = isl.align_two(dom_a, dom_b)

            shared_inames = list(
                set(dom_a.get_var_dict(dim_type.set))
                & set(dom_b.get_var_dict(dim_type.set)))

            dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set])
            dom_b_s = dom_b.project_out_except(shared_inames, [dim_type.set])

            if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s):
                raise LoopyError("kernels do not agree on domain of "
                                 "inames '%s'" % (",".join(shared_inames)))

            new_domain = dom_a & dom_b

            new_domains[i_fuse] = new_domain

    # }}}

    vng = knla.get_var_name_generator()
    b_var_renames = {}

    # {{{ fuse args

    new_args = knla.args[:]
    for b_arg in knlb.args:
        if b_arg.name not in knla.arg_dict:
            new_arg_name = vng(b_arg.name)

            if new_arg_name != b_arg.name:
                b_var_renames[b_arg.name] = var(new_arg_name)

            new_args.append(b_arg.copy(name=new_arg_name))
        else:
            if b_arg != knla.arg_dict[b_arg.name]:
                raise LoopyError(
                    "argument '{arg_name}' has inconsistent definition between "
                    "the two kernels being merged ({arg_a} <-> {arg_b})".
                    format(arg_name=b_arg.name,
                           arg_a=str(knla.arg_dict[b_arg.name]),
                           arg_b=str(b_arg)))

    # }}}

    # {{{ fuse temporaries

    new_temporaries = knla.temporary_variables.copy()
    for b_name, b_tv in six.iteritems(knlb.temporary_variables):
        assert b_name == b_tv.name

        new_tv_name = vng(b_name)

        if new_tv_name != b_name:
            b_var_renames[b_name] = var(new_tv_name)

        assert new_tv_name not in new_temporaries
        new_temporaries[new_tv_name] = b_tv.copy(name=new_tv_name)

    # }}}

    knlb = _apply_renames_in_exprs(knlb, b_var_renames)

    from pymbolic.imperative.transform import \
            fuse_instruction_streams_with_unique_ids
    new_instructions, old_b_id_to_new_b_id = \
            fuse_instruction_streams_with_unique_ids(
                    knla.instructions, knlb.instructions)

    # {{{ fuse assumptions

    assump_a = knla.assumptions
    assump_b = knlb.assumptions
    assump_a, assump_b = isl.align_two(assump_a, assump_b)

    shared_param_names = list(
        set(assump_a.get_var_dict(dim_type.param))
        & set(assump_b.get_var_dict(dim_type.param)))

    assump_a_s = assump_a.project_out_except(shared_param_names,
                                             [dim_type.param])
    assump_b_s = assump_b.project_out_except(shared_param_names,
                                             [dim_type.param])

    if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s):
        raise LoopyError("assumptions do not agree on kernels to be merged")

    new_assumptions = (assump_a & assump_b).params()

    # }}}

    from loopy.kernel import LoopKernel
    return LoopKernel(
        domains=new_domains,
        instructions=new_instructions,
        args=new_args,
        name="%s_and_%s" % (knla.name, knlb.name),
        preambles=_ordered_merge_lists(knla.preambles, knlb.preambles),
        preamble_generators=_ordered_merge_lists(knla.preamble_generators,
                                                 knlb.preamble_generators),
        assumptions=new_assumptions,
        local_sizes=_merge_dicts("local size", knla.local_sizes,
                                 knlb.local_sizes),
        temporary_variables=new_temporaries,
        iname_to_tag=_merge_dicts("iname-to-tag mapping", knla.iname_to_tag,
                                  knlb.iname_to_tag),
        substitutions=_merge_dicts("substitution", knla.substitutions,
                                   knlb.substitutions),
        function_manglers=_ordered_merge_lists(knla.function_manglers,
                                               knlb.function_manglers),
        symbol_manglers=_ordered_merge_lists(knla.symbol_manglers,
                                             knlb.symbol_manglers),
        iname_slab_increments=_merge_dicts("iname slab increment",
                                           knla.iname_slab_increments,
                                           knlb.iname_slab_increments),
        loop_priority=_ordered_merge_lists(knla.loop_priority,
                                           knlb.loop_priority),
        silenced_warnings=_ordered_merge_lists(knla.silenced_warnings,
                                               knlb.silenced_warnings),
        applied_iname_rewrites=_ordered_merge_lists(
            knla.applied_iname_rewrites, knlb.applied_iname_rewrites),
        index_dtype=_merge_values("index dtype", knla.index_dtype,
                                  knlb.index_dtype),
        target=_merge_values("target", knla.target, knlb.target),
        options=knla.options), old_b_id_to_new_b_id
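
A minimal usage sketch for the fusion machinery above, assuming a loopy version that exports lp.fuse_kernels (the public wrapper that fuses kernels pairwise via helpers like the one shown); the kernels and names are made up for illustration.

import loopy as lp

# Two toy kernels that agree on the domain of the shared iname "i".
knl_a = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    "out_a[i] = 2*a[i]")
knl_b = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    "out_b[i] = a[i] + 1")

# Fuse them into a single kernel; domains, args, temporaries and
# assumptions are merged along the lines of the helper above.
fused = lp.fuse_kernels([knl_a, knl_b])
print(fused)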
Example #22
    def emit_assignment(self, codegen_state, insn):
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        assignee_var_name, = insn.assignee_var_names()

        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
        lhs_dtype = lhs_var.dtype

        if insn.atomicity:
            raise NotImplementedError("atomic ops in ISPC")

        from loopy.expression import dtype_to_type_context
        from pymbolic.mapper.stringifier import PREC_NONE

        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
        rhs_code = ecm(insn.expression,
                       prec=PREC_NONE,
                       type_context=rhs_type_context,
                       needed_dtype=lhs_dtype)

        lhs = insn.assignee

        # {{{ handle streaming stores

        if "!streaming_store" in insn.tags:
            ary = ecm.find_array(lhs)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            from loopy.symbolic import simplify_using_aff
            index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

            access_info = get_access_info(
                kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

            from loopy.kernel.data import GlobalArg, TemporaryVariable

            if not isinstance(ary, (GlobalArg, TemporaryVariable)):
                raise LoopyError("array type not supported in ISPC: %s" %
                                 type(ary).__name__)

            if len(access_info.subscripts) != 1:
                raise LoopyError("streaming stores must have a subscript")
            subscript, = access_info.subscripts

            from pymbolic.primitives import Sum, flattened_sum, Variable
            if isinstance(subscript, Sum):
                terms = subscript.children
            else:
                terms = (subscript,)

            new_terms = []

            from loopy.kernel.data import LocalIndexTag
            from loopy.symbolic import get_dependencies

            saw_l0 = False
            for term in terms:
                if (isinstance(term, Variable) and isinstance(
                        kernel.iname_to_tag.get(term.name), LocalIndexTag)
                        and kernel.iname_to_tag.get(term.name).axis == 0):
                    if saw_l0:
                        raise LoopyError("streaming store must have stride 1 "
                                         "in local index, got: %s" % subscript)
                    saw_l0 = True
                    continue
                else:
                    for dep in get_dependencies(term):
                        if (isinstance(kernel.iname_to_tag.get(dep),
                                       LocalIndexTag)
                                and kernel.iname_to_tag.get(dep).axis == 0):
                            raise LoopyError(
                                "streaming store must have stride 1 "
                                "in local index, got: %s" % subscript)

                    new_terms.append(term)

            if not saw_l0:
                raise LoopyError("streaming store must have stride 1 in "
                                 "local index, got: %s" % subscript)

            if access_info.vector_index is not None:
                raise LoopyError("streaming store may not use a short-vector "
                                 "data type")

            rhs_has_programindex = any(
                isinstance(kernel.iname_to_tag.get(dep), LocalIndexTag)
                and kernel.iname_to_tag.get(dep).axis == 0
                for dep in get_dependencies(insn.expression))

            if not rhs_has_programindex:
                rhs_code = "broadcast(%s, 0)" % rhs_code

            from cgen import Statement
            return Statement(
                "streaming_store(%s + %s, %s)" %
                (access_info.array_name,
                 ecm(flattened_sum(new_terms), PREC_NONE, 'i'), rhs_code))

        # }}}

        from cgen import Assign
        return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
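
The "!streaming_store" tag checked above is an ordinary instruction tag; here is a minimal sketch of attaching it (the instruction id and kernel are hypothetical, and whether streaming_store() is actually emitted still depends on the iname tags and the ISPC target).

import loopy as lp

knl = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    "out[i] = 2*a[i]  {id=store}")

# Attach the tag that emit_assignment() looks for on the store instruction.
knl = lp.tag_instructions(knl, "!streaming_store", "id:store")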
Example #23
    def __init__(self,
                 name,
                 dtype=None,
                 shape=auto,
                 address_space=None,
                 dim_tags=None,
                 offset=0,
                 dim_names=None,
                 strides=None,
                 order=None,
                 base_indices=None,
                 storage_shape=None,
                 base_storage=None,
                 initializer=None,
                 read_only=False,
                 _base_storage_access_may_be_aliasing=False,
                 **kwargs):
        """
        :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
        :arg shape: :class:`loopy.auto` or a shape tuple
        :arg base_indices: :class:`loopy.auto` or a tuple of base indices
        """

        scope = kwargs.pop("scope", None)
        if scope is not None:
            warn("Passing 'scope' is deprecated. Use 'address_space' instead.",
                 DeprecationWarning,
                 stacklevel=2)

            if address_space is not None:
                raise ValueError("only one of 'scope' and 'address_space' "
                                 "may be specified")
            else:
                address_space = scope

        del scope

        if address_space is None:
            address_space = auto

        if address_space is None:
            raise LoopyError("temporary variable '%s': "
                             "address_space must not be None" % name)

        if initializer is None:
            pass
        elif isinstance(initializer, np.ndarray):
            if offset != 0:
                raise LoopyError("temporary variable '%s': "
                                 "offset must be 0 if initializer specified" %
                                 name)

            from loopy.types import NumpyType, to_loopy_type
            if dtype is auto or dtype is None:
                dtype = NumpyType(initializer.dtype)
            elif to_loopy_type(dtype) != to_loopy_type(initializer.dtype):
                raise LoopyError("temporary variable '%s': "
                                 "dtype of initializer does not match "
                                 "dtype of array." % name)

            if shape is auto:
                shape = initializer.shape
            else:
                if shape != initializer.shape:
                    raise LoopyError("Shape of '{}' does not match that of the"
                                     " initializer.".format(name))
        else:
            raise LoopyError("temporary variable '%s': "
                             "initializer must be None or a numpy array" %
                             name)

        if order is None:
            order = "C"

        if base_indices is None and shape is not auto:
            base_indices = (0, ) * len(shape)

        if not read_only and initializer is not None:
            raise LoopyError("temporary variable '%s': "
                             "read-write variables with initializer "
                             "are not currently supported "
                             "(did you mean to set read_only=True?)" % name)

        if base_storage is not None and initializer is not None:
            raise LoopyError("temporary variable '%s': "
                             "base_storage and initializer are "
                             "mutually exclusive" % name)

        if base_storage is None and _base_storage_access_may_be_aliasing:
            raise LoopyError(
                "temporary variable '%s': "
                "_base_storage_access_may_be_aliasing option, but no "
                "base_storage given!" % name)

        ArrayBase.__init__(self,
                           name=intern(name),
                           dtype=dtype,
                           shape=shape,
                           strides=strides,
                           dim_tags=dim_tags,
                           offset=offset,
                           dim_names=dim_names,
                           order=order,
                           base_indices=base_indices,
                           address_space=address_space,
                           storage_shape=storage_shape,
                           base_storage=base_storage,
                           initializer=initializer,
                           read_only=read_only,
                           _base_storage_access_may_be_aliasing=(
                               _base_storage_access_may_be_aliasing),
                           **kwargs)
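
A small construction sketch for the constructor above, assuming the documented behavior that dtype and shape are taken from a numpy initializer (the variable name and values are made up); such an object can typically be passed in the kernel_data list of lp.make_kernel.

import numpy as np
from loopy.kernel.data import AddressSpace, TemporaryVariable

# Read-only temporary backed by a numpy initializer; dtype and shape are
# inferred from the initializer, and read_only=True is required with one.
lookup_table = TemporaryVariable(
    "lookup_table",
    initializer=np.array([0.0, 0.5, 1.0]),
    read_only=True,
    address_space=AddressSpace.GLOBAL)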
Example #24
    def map_local_hw_index(self, expr, type_context):
        if expr.axis == 0:
            return var("(varying %s) programIndex" % self._get_index_ctype())
        else:
            raise LoopyError("ISPC only supports one local axis")
Example #25
File: check.py Project: tj-sun/loopy
    def map_subscript(self, expr):
        WalkMapper.map_subscript(self, expr)

        from pymbolic.primitives import Variable
        assert isinstance(expr.aggregate, Variable)

        shape = None
        var_name = expr.aggregate.name
        if var_name in self.kernel.arg_dict:
            arg = self.kernel.arg_dict[var_name]
            shape = arg.shape
        elif var_name in self.kernel.temporary_variables:
            tv = self.kernel.temporary_variables[var_name]
            shape = tv.shape

        if shape is not None:
            subscript = expr.index

            if not isinstance(subscript, tuple):
                subscript = (subscript,)

            from loopy.symbolic import get_dependencies, get_access_range

            available_vars = set(self.domain.get_var_dict())
            shape_deps = set()
            for shape_axis in shape:
                if shape_axis is not None:
                    shape_deps.update(get_dependencies(shape_axis))

            if not (get_dependencies(subscript) <= available_vars
                    and shape_deps <= available_vars):
                return

            if len(subscript) != len(shape):
                raise LoopyError("subscript to '%s' in '%s' has the wrong "
                        "number of indices (got: %d, expected: %d)" % (
                            expr.aggregate.name, expr,
                            len(subscript), len(shape)))

            try:
                access_range = get_access_range(self.domain, subscript,
                        self.kernel.assumptions)
            except isl.Error:
                # Likely: index was non-linear, nothing we can do.
                return
            except TypeError:
                # Likely: index was non-linear, nothing we can do.
                return

            shape_domain = isl.BasicSet.universe(access_range.get_space())
            for idim in range(len(subscript)):
                shape_axis = shape[idim]

                if shape_axis is not None:
                    from loopy.isl_helpers import make_slab
                    slab = make_slab(
                            shape_domain.get_space(), (dim_type.in_, idim),
                            0, shape_axis)

                    shape_domain = shape_domain.intersect(slab)

            if not access_range.is_subset(shape_domain):
                raise LoopyError("'%s' in instruction '%s' "
                        "accesses out-of-bounds array element"
                        % (expr, self.insn_id))
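
A hypothetical kernel that the subscript checker above is meant to flag: the write to out runs one element past the end of the array (such checks typically run as part of code generation).

import numpy as np
import loopy as lp

knl = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    "out[i + 1] = a[i]",     # out-of-bounds for i = n - 1
    [lp.GlobalArg("out", np.float64, shape="n"),
     lp.GlobalArg("a", np.float64, shape="n"),
     lp.ValueArg("n", np.int32)])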
Example #26
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        count_bug_per_dev = [False] * len(devices)

    else:
        count_bug_per_dev = [
            has_struct_arg_count_bug(dev) if dev is not None else False
            for dev in devices
        ]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
             "workarounds for broken OpenCL implementations "
             "(such as those relating to complex numbers) "
             "may not be enabled when needed".format(knl_name=kernel.name))

    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    fp_arg_count = 0

    from genpy import (Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(
                If(
                    "%s is None" % idi.name,
                    Raise('RuntimeError("input argument \'{name}\' '
                          'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(
                Comment("cast to Python int to avoid trouble "
                        "with struct packing or Boost.Python"))
            if sys.version_info < (3, ):
                py_type = "long"
            else:
                py_type = "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                     "some (but not all) of the target devices mishandle "
                     "struct kernel arguments (hence the workaround is "
                     "disabled".format(knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                gen(
                    Assign(
                        "_lpy_buf",
                        "_lpy_pack('{arg_char}', {arg_var}.real)".format(
                            arg_char=arg_char, arg_var=idi.name)))
                gen(
                    S("_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)".format(
                        cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(
                    Assign(
                        "_lpy_buf",
                        "_lpy_pack('{arg_char}', {arg_var}.imag)".format(
                            arg_char=arg_char, arg_var=idi.name)))
                gen(
                    S("_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)".format(
                        cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                gen(
                    Assign(
                        "_lpy_buf", "_lpy_pack('{arg_char}{arg_char}', "
                        "{arg_var}.real, {arg_var}.imag)".format(
                            arg_char=arg_char, arg_var=idi.name)))
                gen(
                    S("_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)".format(
                        cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            gen(
                S("_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))" %
                  (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'" %
                             idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
Example #27
File: check.py Project: tj-sun/loopy
def check_implemented_domains(kernel, implemented_domains, code=None):
    from islpy import dim_type

    from islpy import align_two

    last_idomains = None
    last_insn_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]

        assert idomains

        insn_inames = kernel.insn_inames(insn)

        # {{{ if we've checked the same thing before, no need to check it again

        if last_idomains is not None and last_insn_inames is not None:
            if idomains == last_idomains and insn_inames == last_insn_inames:
                continue

        last_idomains = idomains
        last_insn_inames = insn_inames

        # }}}

        insn_impl_domain = idomains[0]
        for idomain in idomains[1:]:
            insn_impl_domain = insn_impl_domain | idomain
        assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
        assumptions, insn_impl_domain = align_two(
                assumption_non_param, insn_impl_domain)
        insn_impl_domain = (
                (insn_impl_domain & assumptions)
                .project_out_except(insn_inames, [dim_type.set]))

        from loopy.kernel.instruction import BarrierInstruction
        from loopy.kernel.data import LocalIndexTag
        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            non_lid_inames = frozenset(
                [iname for iname in insn_inames if not isinstance(
                    kernel.iname_to_tag.get(iname), LocalIndexTag)])
            insn_impl_domain = insn_impl_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
        desired_domain = ((insn_domain & assumptions)
            .project_out_except(insn_inames, [dim_type.set])
            .project_out_except(insn_parameters, [dim_type.param]))

        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            desired_domain = desired_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_impl_domain = (insn_impl_domain
                .project_out_except(insn_parameters, [dim_type.param]))
        insn_impl_domain, desired_domain = align_two(
                insn_impl_domain, desired_domain)

        if insn_impl_domain != desired_domain:
            i_minus_d = insn_impl_domain - desired_domain
            d_minus_i = desired_domain - insn_impl_domain

            parameter_inames = set(
                    insn_domain.get_dim_name(dim_type.param, i)
                    for i in range(insn_impl_domain.dim(dim_type.param)))

            lines = []
            for bigger, smaller, diff_set, gist_domain in [
                    ("implemented", "desired", i_minus_d,
                        desired_domain.gist(insn_impl_domain)),
                    ("desired", "implemented", d_minus_i,
                        insn_impl_domain.gist(desired_domain))]:

                if diff_set.is_empty():
                    continue

                diff_set = diff_set.coalesce()
                pt = diff_set.sample_point()
                assert not pt.is_void()

                #pt_set = isl.Set.from_point(pt)
                #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain))
                #lines.append("point desired: %s" % (pt_set <= desired_domain))

                iname_to_dim = pt.get_space().get_var_dict()
                point_axes = []
                for iname in kernel.insn_inames(insn) | parameter_inames:
                    tp, dim = iname_to_dim[iname]
                    point_axes.append("%s=%d" % (
                        iname, pt.get_coordinate_val(tp, dim).to_python()))

                lines.append(
                        "sample point in %s but not %s: %s" % (
                            bigger, smaller, ", ".join(point_axes)))
                lines.append(
                        "gist of constraints in %s but not %s: %s" % (
                            smaller, bigger, gist_domain))

            if code is not None:
                print(79*"-")
                print("CODE:")
                print(79*"-")
                from loopy.compiled import get_highlighted_cl_code
                print(get_highlighted_cl_code(code))
                print(79*"-")

            raise LoopyError("sanity check failed--implemented and desired "
                    "domain for instruction '%s' do not match\n\n"
                    "implemented: %s\n\n"
                    "desired:%s\n\n%s"
                    % (insn_id, insn_impl_domain, desired_domain, "\n".join(lines)))

    # placate the assert at the call site
    return True
Example #28
File: match.py Project: connorjward/loopy
def parse_match(expr):
    """Syntax examples::

        * ``id:yoink and writes:a_temp``
        * ``id:yoink and (not writes:a_temp or tag:input)``
    """
    if not expr:
        return All()

    def parse_terminal(pstate):
        next_tag = pstate.next_tag()
        if next_tag is _id:
            result = Id(pstate.next_match_obj().group(1))
            pstate.advance()
            return result
        elif next_tag is _tag:
            result = Tagged(pstate.next_match_obj().group(1))
            pstate.advance()
            return result
        elif next_tag is _writes:
            result = Writes(pstate.next_match_obj().group(1))
            pstate.advance()
            return result
        elif next_tag is _reads:
            result = Reads(pstate.next_match_obj().group(1))
            pstate.advance()
            return result
        elif next_tag is _in_kernel:
            result = InKernel(pstate.next_match_obj().group(1))
            pstate.advance()
            return result
        elif next_tag is _iname:
            result = Iname(pstate.next_match_obj().group(1))
            pstate.advance()
            return result
        else:
            pstate.expected("terminal")

    def inner_parse(pstate, min_precedence=0):
        pstate.expect_not_end()

        if pstate.is_next(_not):
            pstate.advance()
            left_query = Not(inner_parse(pstate, _PREC_NOT))
        elif pstate.is_next(_openpar):
            pstate.advance()
            left_query = inner_parse(pstate)
            pstate.expect(_closepar)
            pstate.advance()
        else:
            left_query = parse_terminal(pstate)

        did_something = True
        while did_something:
            did_something = False
            if pstate.is_at_end():
                return left_query

            next_tag = pstate.next_tag()

            if next_tag is _and and _PREC_AND > min_precedence:
                pstate.advance()
                left_query = And((left_query, inner_parse(pstate, _PREC_AND)))
                did_something = True
            elif next_tag is _or and _PREC_OR > min_precedence:
                pstate.advance()
                left_query = Or((left_query, inner_parse(pstate, _PREC_OR)))
                did_something = True

        return left_query

    if isinstance(expr, MatchExpressionBase):
        return expr

    from pytools.lex import LexIterator, lex, InvalidTokenError
    try:
        pstate = LexIterator(
            [(tag, s, idx, matchobj)
             for (tag, s, idx,
                  matchobj) in lex(_LEX_TABLE, expr, match_objects=True)
             if tag is not _whitespace], expr)
    except InvalidTokenError as e:
        from loopy.diagnostic import LoopyError
        raise LoopyError(
            "invalid match expression: '{match_expr}' ({err_type}: {err_str})".
            format(match_expr=expr, err_type=type(e).__name__, err_str=str(e)))

    if pstate.is_at_end():
        pstate.raise_parse_error("unexpected end of input")

    result = inner_parse(pstate)
    if not pstate.is_at_end():
        pstate.raise_parse_error("leftover input after completed parse")

    return result
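
A minimal usage sketch of the parser above; the match expressions are the ones from the docstring, and the resulting objects are what transforms accepting a within/insn_match argument work with.

from loopy.match import parse_match

m1 = parse_match("id:yoink and writes:a_temp")
m2 = parse_match("id:yoink and (not writes:a_temp or tag:input)")
print(m1)
print(m2)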
Example #29
    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
        from pymbolic.mapper.stringifier import PREC_NONE

        # FIXME: Could detect operations, generate atomic_{add,...} when
        # appropriate.

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            from cgen import Block, DoWhile, Assign
            from loopy.target.c import POD
            old_val_var = codegen_state.var_name_generator("loopy_old_val")
            new_val_var = codegen_state.var_name_generator("loopy_new_val")

            from loopy.kernel.data import TemporaryVariable, AddressSpace
            ecm = codegen_state.expression_to_code_mapper.with_assignments({
                old_val_var:
                TemporaryVariable(old_val_var, lhs_dtype, shape=()),
                new_val_var:
                TemporaryVariable(new_val_var, lhs_dtype, shape=()),
            })

            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

            from pymbolic.mapper.substitutor import make_subst_func
            from pymbolic import var
            from loopy.symbolic import SubstitutionMapper

            subst = SubstitutionMapper(
                make_subst_func({lhs_expr: var(old_val_var)}))
            rhs_expr_code = ecm(subst(rhs_expr),
                                prec=PREC_NONE,
                                type_context=rhs_type_context,
                                needed_dtype=lhs_dtype)

            if lhs_dtype.numpy_dtype.itemsize == 4:
                func_name = "atomic_cmpxchg"
            elif lhs_dtype.numpy_dtype.itemsize == 8:
                func_name = "atom_cmpxchg"
            else:
                raise LoopyError("unexpected atomic size")

            cast_str = ""
            old_val = old_val_var
            new_val = new_val_var

            if lhs_dtype.numpy_dtype.kind == "f":
                if lhs_dtype.numpy_dtype == np.float32:
                    ctype = "int"
                elif lhs_dtype.numpy_dtype == np.float64:
                    ctype = "long"
                else:
                    assert False

                from loopy.kernel.data import (TemporaryVariable, ArrayArg)
                if (isinstance(lhs_var, ArrayArg)
                        and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                elif (isinstance(lhs_var, ArrayArg)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                else:
                    raise LoopyError("unexpected kind of variable '%s' in "
                                     "atomic operation: '%s'" %
                                     (lhs_var.name, type(lhs_var).__name__))

                old_val = "*(%s *) &" % ctype + old_val
                new_val = "*(%s *) &" % ctype + new_val
                cast_str = f"({var_kind} {ctype} *) "

            return Block([
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    old_val_var),
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    new_val_var),
                DoWhile(
                    "%(func_name)s("
                    "%(cast_str)s&(%(lhs_expr)s), "
                    "%(old_val)s, "
                    "%(new_val)s"
                    ") != %(old_val)s" % {
                        "func_name": func_name,
                        "cast_str": cast_str,
                        "lhs_expr": lhs_expr_code,
                        "old_val": old_val,
                        "new_val": new_val,
                    },
                    Block([
                        Assign(old_val_var, lhs_expr_code),
                        Assign(new_val_var, rhs_expr_code),
                    ]))
            ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
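
A sketch of the kind of kernel that reaches the compare-and-swap fallback above on OpenCL-like targets; the {atomic} instruction option and the for_atomic argument flag are the assumed entry points, and the names are made up.

import numpy as np
import loopy as lp

knl = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    "out[0] = out[0] + a[i]  {atomic}",
    [lp.GlobalArg("out", np.float32, shape=(1,), for_atomic=True),
     lp.GlobalArg("a", np.float32, shape="n"),
     lp.ValueArg("n", np.int32)])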
Example #30
def alias_temporaries(kernel,
                      names,
                      base_name_prefix=None,
                      synchronize_for_exclusive_use=True):
    """Sets all temporaries given by *names* to be backed by a single piece of
    storage.

    :arg synchronize_for_exclusive_use: A :class:`bool`. If ``True``, this also
        introduces ordering structures ("groups") to ensure that the live
        ranges (i.e. the regions of code where each of the
        temporaries is used) do not overlap. This will allow two (or more)
        temporaries to share the same storage space as long as their live
        ranges do not need to be concurrent.
    :arg base_name_prefix: an identifier to be used for the common storage
        area

    .. versionchanged:: 2016.3

        Added *synchronize_for_exclusive_use* flag.
        ``synchronize_for_exclusive_use=True`` was the previous default
        behavior.
    """
    gng = kernel.get_group_name_generator()
    group_names = [gng("tmpgrp_" + name) for name in names]

    if base_name_prefix is None:
        base_name_prefix = "temp_storage"

    vng = kernel.get_var_name_generator()
    base_name = vng(base_name_prefix)

    names_set = set(names)

    if synchronize_for_exclusive_use:
        new_insns = []
        for insn in kernel.instructions:
            temp_deps = insn.dependency_names() & names_set

            if not temp_deps:
                new_insns.append(insn)
                continue

            if len(temp_deps) > 1:
                raise LoopyError(
                    "Instruction {insn} refers to multiple of the "
                    "temporaries being aliased, namely '{temps}'. Cannot alias."
                    .format(insn=insn.id, temps=", ".join(temp_deps)))

            temp_name, = temp_deps
            temp_idx = names.index(temp_name)
            group_name = group_names[temp_idx]
            other_group_names = (frozenset(group_names[:temp_idx])
                                 | frozenset(group_names[temp_idx + 1:]))

            new_insns.append(
                insn.copy(groups=insn.groups | frozenset([group_name]),
                          conflicts_with_groups=(insn.conflicts_with_groups
                                                 | other_group_names)))
    else:
        new_insns = kernel.instructions

    new_temporary_variables = {}
    for tv in kernel.temporary_variables.values():
        if tv.name in names_set:
            if tv.base_storage is not None:
                raise LoopyError(
                    "temporary variable '{tv}' already has "
                    "a defined storage array -- cannot alias".format(
                        tv=tv.name))

            new_temporary_variables[tv.name] = \
                    tv.copy(base_storage=base_name)
        else:
            new_temporary_variables[tv.name] = tv

    return kernel.copy(instructions=new_insns,
                       temporary_variables=new_temporary_variables)
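
A minimal usage sketch for alias_temporaries (temporary and instruction names are made up); the two temporaries have non-overlapping live ranges, so they can share one backing array.

import loopy as lp

knl = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    """
    <> tmp_a = a[i]       {id=mk_a}
    out_a[i] = 2*tmp_a    {id=use_a, dep=mk_a}
    <> tmp_b = b[i]       {id=mk_b, dep=use_a}
    out_b[i] = 3*tmp_b    {id=use_b, dep=mk_b}
    """)

# Back both temporaries by a single storage array and add groups so their
# uses cannot be scheduled to overlap.
knl = lp.alias_temporaries(knl, ["tmp_a", "tmp_b"])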