Exemplo n.º 1
0
    def split_access_axis(expr):
        """Rewrite the subscript *expr* so that one of its axes is split in two.

        The axis to split and the placement order are looked up in
        ``array_to_rest`` under the name of the subscripted array. The
        original axis position receives the inner index, and a new axis
        holding the outer index is inserted next to it: right after it for
        order ``"F"``, right before it for order ``"C"``.

        :arg expr: a subscript expression whose aggregate has a ``name``.
        :returns: ``expr.aggregate`` indexed with the widened index tuple.
        :raises RuntimeError: if ``auto_split_inames`` is set and the index
            along the split axis is not a single variable, or if the order
            is neither ``"F"`` nor ``"C"``.
        """
        axis_nr, order = array_to_rest[expr.aggregate.name]

        # Normalize a scalar index to a one-element list so it can be edited.
        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError("found access '%s' in which axis %d is not a "
                        "single variable--cannot split "
                        "(Have you tried to do the split yourself, manually, "
                        "beforehand? If so, you shouldn't.)"
                        % (expr, axis_nr))

            split_iname = axis_idx.name
            assert split_iname in kernel.all_inames()

            # Reuse the outer/inner names if this iname was already split for
            # another access; otherwise generate and remember a fresh pair.
            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                outer_iname = var_name_gen(split_iname+"_outer")
                inner_iname = var_name_gen(split_iname+"_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        # The insertion position is relative to the axis being split.
        if order == "F":
            idx.insert(axis_nr+1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))
Exemplo n.º 2
0
    def split_access_axis(expr):
        """Split one axis of the array access *expr* into an inner/outer pair.

        ``array_to_rest`` maps the accessed array's name to the number of the
        axis to split and to the order (``"F"`` or ``"C"``) that decides
        whether the new outer axis goes after or before the inner one.

        :arg expr: a subscript expression whose aggregate has a ``name``.
        :returns: ``expr.aggregate`` indexed with the new index tuple, which
            has one more entry than the original.
        :raises RuntimeError: if ``auto_split_inames`` is set and the index
            along the split axis is not a single variable, or if the order
            is not understood.
        """
        axis_nr, order = array_to_rest[expr.aggregate.name]

        # A scalar index is treated as a one-element index tuple.
        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx, )
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError(
                    "found access '%s' in which axis %d is not a "
                    "single variable--cannot split "
                    "(Have you tried to do the split yourself, manually, "
                    "beforehand? If so, you shouldn't.)" % (expr, axis_nr))

            split_iname = axis_idx.name
            assert split_iname in kernel.all_inames()

            # The same iname must map to the same outer/inner names for every
            # access, so the pair is cached in split_vars.
            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                outer_iname = var_name_gen(split_iname + "_outer")
                inner_iname = var_name_gen(split_iname + "_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        # Place the outer index next to the split axis (axis_nr).
        if order == "F":
            idx.insert(axis_nr + 1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))
Exemplo n.º 3
0
    def map_subscript(self, expr, expn_state):
        """Rebuild the subscript *expr* from its mapped aggregate and indices.

        Every entry of ``expr.index_tuple`` is mapped with ``self.rec`` and
        then passed through ``simplify_using_aff`` together with
        ``self.kernel``; the aggregate is mapped with ``self.rec`` as well.

        :returns: a new :class:`pymbolic.primitives.Subscript`.
        """
        from loopy.symbolic import simplify_using_aff
        from pymbolic.primitives import Subscript

        simplified_indices = []
        for index in expr.index_tuple:
            mapped_index = self.rec(index, expn_state)
            simplified_indices.append(
                simplify_using_aff(self.kernel, mapped_index))

        mapped_aggregate = self.rec(expr.aggregate, expn_state)
        return Subscript(mapped_aggregate, tuple(simplified_indices))
Exemplo n.º 4
0
def get_arg_descriptor_for_expression(kernel, expr):
    """
    :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor`
        describing the argument expression *expr* which occurs
        in a call in the code of *kernel*.
    :raises LoopyError: if *expr* is a sub-array reference to a variable
        that is neither a temporary variable nor an array argument.

    A :class:`loopy.symbolic.SubArrayRef` yields an array descriptor with one
    axis per swept iname. Any other expression is handed to
    :class:`ExpressionIsScalarChecker` and described as a value.
    """
    from loopy.symbolic import (SubArrayRef, pw_aff_to_expr,
                                SweptInameStrideCollector)
    from loopy.kernel.data import TemporaryVariable, ArrayArg

    if isinstance(expr, SubArrayRef):
        name = expr.subscript.aggregate.name
        arg = kernel.get_var_descriptor(name)

        if not isinstance(arg, (TemporaryVariable, ArrayArg)):
            # Report the array name looked up above.
            raise LoopyError("unsupported argument type "
                             "'%s' of '%s' in call statement" %
                             (type(arg).__name__, name))

        aspace = arg.address_space

        from loopy.kernel.array import FixedStrideArrayDimTag as DimTag

        # This helps in identifying identities like
        # "2*(i//2) + i%2" := "i"
        # See the kernel in
        # test_callables.py::test_shape_translation_through_sub_array_refs

        from loopy.symbolic import simplify_using_aff
        linearized_index = simplify_using_aff(
            kernel,
            sum(dim_tag.stride * iname for dim_tag, iname in zip(
                arg.dim_tags, expr.subscript.index_tuple)))

        strides_as_dict = SweptInameStrideCollector(
            tuple(iname.name for iname in expr.swept_inames))(linearized_index)
        sub_dim_tags = tuple(
            # Not all swept inames necessarily occur in the expression.
            DimTag(strides_as_dict.get(iname, 0))
            for iname in expr.swept_inames)

        def axis_length(iname):
            # Number of values the iname takes: upper - lower + 1.
            bounds = kernel.get_iname_bounds(iname.name)
            return pw_aff_to_expr(
                bounds.upper_bound_pw_aff - bounds.lower_bound_pw_aff) + 1

        sub_shape = tuple(axis_length(iname) for iname in expr.swept_inames)

        return ArrayArgDescriptor(address_space=aspace,
                                  dim_tags=sub_dim_tags,
                                  shape=sub_shape)
    else:
        ExpressionIsScalarChecker(kernel)(expr)
        return ValueArgDescriptor()
Exemplo n.º 5
0
    def map_variable(self, expr, type_context):
        """Map a bare variable reference to the expression used in generated code.

        The cases are tried in this order:

        * a name in ``codegen_state.var_subst_map`` is replaced by its mapped
          substitute, optionally prefixed with a comment naming the original;
        * a kernel argument that is a shape-``()`` array becomes an explicit
          subscript, while any other unsubscripted array is an error;
        * value arguments under ``self.fortran_abi``, and temporaries that
          have ``base_storage`` or live in the global address space, are
          subscripted with ``[0]``;
        * everything else becomes a plain variable.

        The variable name is passed through ``kernel.mangle_symbol`` first.

        :raises RuntimeError: for an unsubscripted reference to an array
            argument whose shape is not ``()``.
        """
        from loopy.kernel.data import ValueArg, AddressSpace

        name = expr.name
        subst_map = self.codegen_state.var_subst_map

        # Set when the final variable has to be subscripted with [0].
        needs_zero_subscript = False

        if name in subst_map:
            if not self.kernel.options.annotate_inames:
                return self.rec(subst_map[name], type_context)

            substituted = self.rec(subst_map[name], type_context)
            return var("/* {} */ {}".format(name, substituted))

        elif name in self.kernel.arg_dict:
            arg = self.kernel.arg_dict[name]
            from loopy.kernel.array import ArrayBase
            if isinstance(arg, ArrayBase):
                if arg.shape == ():
                    if not arg.offset:
                        return var(name)[0]

                    from loopy.kernel.array import _apply_offset
                    from loopy.symbolic import simplify_using_aff

                    offset_subscript = _apply_offset(0, name, arg)
                    return self.make_subscript(
                        arg, var(name),
                        simplify_using_aff(
                            self.kernel, self.rec(offset_subscript, "i")))

                raise RuntimeError(
                    "unsubscripted reference to array '%s'" % name)

            if isinstance(arg, ValueArg) and self.fortran_abi:
                needs_zero_subscript = True

        elif name in self.kernel.temporary_variables:
            temporary = self.kernel.temporary_variables[name]
            if (temporary.base_storage
                    or temporary.address_space == AddressSpace.GLOBAL):
                needs_zero_subscript = True

        mangled = self.kernel.mangle_symbol(self.codegen_state.ast_builder,
                                            name)
        if mangled is None:
            target = var(name)
        else:
            _, c_name = mangled
            target = var(c_name)

        if needs_zero_subscript:
            return target[0]
        return target
Exemplo n.º 6
0
    def split_access_axis(expr):
        """Split axis ``axis_nr`` of the subscript *expr* into two axes.

        The index along that axis is replaced by its remainder modulo
        ``count``, and its floor quotient by ``count`` is inserted as a new
        axis: after the split axis for order ``"F"``, before it for ``"C"``.
        Both new indices are passed through ``simplify_using_aff``.

        :returns: ``expr.aggregate`` indexed with the widened index tuple.
        :raises RuntimeError: if ``order`` is neither ``"F"`` nor ``"C"``.
        """
        from loopy.symbolic import simplify_using_aff

        raw_index = expr.index
        if isinstance(raw_index, tuple):
            indices = list(raw_index)
        else:
            indices = [raw_index]

        split_index = indices[axis_nr]
        indices[axis_nr] = simplify_using_aff(kernel, split_index % count)
        outer_index = simplify_using_aff(kernel, split_index // count)

        if order == "F":
            outer_position = axis_nr+1
        elif order == "C":
            outer_position = axis_nr
        else:
            raise RuntimeError("order '%s' not understood" % order)

        indices.insert(outer_position, outer_index)
        return expr.aggregate.index(tuple(indices))
Exemplo n.º 7
0
    def split_access_axis(expr):
        """Return *expr* re-indexed with axis ``axis_nr`` split by ``count``.

        The split axis keeps ``index % count`` and a new neighbouring axis
        receives ``index // count``, each after ``simplify_using_aff``. Order
        ``"F"`` puts the new axis behind the split axis, ``"C"`` in front.

        :raises RuntimeError: if ``order`` is neither ``"F"`` nor ``"C"``.
        """
        from loopy.symbolic import simplify_using_aff

        index_entries = expr.index
        if not isinstance(index_entries, tuple):
            index_entries = (index_entries, )
        index_entries = list(index_entries)

        original = index_entries[axis_nr]
        remainder = simplify_using_aff(kernel, original % count)
        quotient = simplify_using_aff(kernel, original // count)

        index_entries[axis_nr] = remainder

        if order not in ("F", "C"):
            raise RuntimeError("order '%s' not understood" % order)

        # "F": quotient follows the remainder; "C": quotient precedes it.
        index_entries.insert(axis_nr + 1 if order == "F" else axis_nr,
                             quotient)

        return expr.aggregate.index(tuple(index_entries))
Exemplo n.º 8
0
    def map_subscript(self, expr, enclosing_prec, type_context):
        """Return the code string for the subscript expression *expr*.

        Subscripts whose aggregate is not a plain variable are rendered
        generically as ``aggregate[index]``. Otherwise the array found by
        ``self.find_array`` decides the form: image arguments become a
        ``read_imagef`` call, while global arguments and temporaries become a
        pointer dereference, a scalar name, or a subscript built from the
        result of ``get_access_info``, with an optional vector access applied.

        :raises NotImplementedError: for images whose dtype is not float32,
            float64 or a vector dtype.
        :raises AssertionError: if the array is of none of the handled kinds.
        """
        def base_impl(expr, enclosing_prec, type_context):
            return self.parenthesize_if_needed(
                "%s[%s]" % (self.rec(expr.aggregate, PREC_CALL, type_context),
                            self.rec(expr.index, PREC_NONE, 'i')),
                enclosing_prec, PREC_CALL)

        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            return base_impl(expr, enclosing_prec, type_context)

        ary = self.find_array(expr)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
            simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

        access_info = get_access_info(
            self.kernel.target, ary, index_tuple,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

        if isinstance(ary, ImageArg):
            # index_tuple (not index) so that a scalar index is handled too;
            # the coordinates are emitted in reversed axis order.
            base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" %
                           (ary.name, ary.dimensions, ", ".join(
                               self.rec(idx, PREC_NONE, 'i')
                               for idx in expr.index_tuple[::-1])))

            if ary.dtype.numpy_dtype == np.float32:
                return base_access + ".x"
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            elif ary.dtype.numpy_dtype == np.float64:
                return "as_double(%s.xy)" % base_access
            else:
                raise NotImplementedError(
                    "non-floating-point images not supported for now")

        elif isinstance(ary, (GlobalArg, TemporaryVariable)):
            if len(access_info.subscripts) == 0:
                if isinstance(ary, GlobalArg):
                    # unsubscripted global args are pointers
                    result = "*" + access_info.array_name

                else:
                    # unsubscripted temp vars are scalars
                    result = access_info.array_name

            else:
                subscript, = access_info.subscripts
                result = self.parenthesize_if_needed(
                    "%s[%s]" % (access_info.array_name,
                                self.rec(subscript, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

            if access_info.vector_index is not None:
                return self.codegen_state.ast_builder.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

        else:
            # Raised explicitly so the check survives "python -O".
            raise AssertionError(
                "unexpected array type '%s' in subscript"
                % type(ary).__name__)
Exemplo n.º 9
0
    def emit_assignment(self, codegen_state, insn):
        """Generate the code object for the assignment instruction *insn*.

        A regular assignment becomes a :class:`cgen.Assign`. An instruction
        tagged ``"!streaming_store"`` becomes a :class:`cgen.Statement`
        calling ``streaming_store``. For that form the flattened subscript
        must contain the iname tagged with local axis 0 exactly once, as a
        bare term; that term is left out of the address passed to
        ``streaming_store``.

        :arg codegen_state: provides the kernel, the expression-to-code
            mapper, the variable substitution map and vectorization info.
        :arg insn: the assignment instruction.
        :raises NotImplementedError: if *insn* is atomic.
        :raises LoopyError: if a streaming store targets an unsupported array
            type, has no single subscript, does not have stride 1 in the
            local-axis-0 iname, or uses a short-vector data type.
        """
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        assignee_var_name, = insn.assignee_var_names()

        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
        lhs_dtype = lhs_var.dtype

        if insn.atomicity:
            raise NotImplementedError("atomic ops in ISPC")

        from loopy.expression import dtype_to_type_context
        from pymbolic.mapper.stringifier import PREC_NONE

        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
        rhs_code = ecm(insn.expression,
                       prec=PREC_NONE,
                       type_context=rhs_type_context,
                       needed_dtype=lhs_dtype)

        lhs = insn.assignee

        # {{{ handle streaming stores

        if "!streaming_store" in insn.tags:
            ary = ecm.find_array(lhs)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            from loopy.symbolic import simplify_using_aff
            index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

            # The substitution map lives on the codegen_state argument.
            access_info = get_access_info(
                kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

            from loopy.kernel.data import GlobalArg, TemporaryVariable

            if not isinstance(ary, (GlobalArg, TemporaryVariable)):
                raise LoopyError("array type not supported in ISPC: %s" %
                                 type(ary).__name__)

            if len(access_info.subscripts) != 1:
                raise LoopyError("streaming stores must have a subscript")
            subscript, = access_info.subscripts

            from pymbolic.primitives import Sum, flattened_sum, Variable
            if isinstance(subscript, Sum):
                terms = subscript.children
            else:
                # A subscript that is not a sum is itself the only term.
                terms = (subscript, )

            new_terms = []

            from loopy.kernel.data import LocalIndexTag
            from loopy.symbolic import get_dependencies

            def is_local_axis_0(name):
                # True if *name* is an iname tagged with local axis 0.
                tag = kernel.iname_to_tag.get(name)
                return isinstance(tag, LocalIndexTag) and tag.axis == 0

            saw_l0 = False
            for term in terms:
                if isinstance(term, Variable) and is_local_axis_0(term.name):
                    # The bare local-axis-0 iname may occur only once; it is
                    # not part of the address handed to streaming_store.
                    if saw_l0:
                        raise LoopyError("streaming store must have stride 1 "
                                         "in local index, got: %s" % subscript)
                    saw_l0 = True
                    continue

                # Any other occurrence of that iname (scaled, nested, ...)
                # means the stride in it is not 1.
                for dep in get_dependencies(term):
                    if is_local_axis_0(dep):
                        raise LoopyError(
                            "streaming store must have stride 1 "
                            "in local index, got: %s" % subscript)

                new_terms.append(term)

            if not saw_l0:
                raise LoopyError("streaming store must have stride 1 in "
                                 "local index, got: %s" % subscript)

            if access_info.vector_index is not None:
                raise LoopyError("streaming store may not use a short-vector "
                                 "data type")

            rhs_has_programindex = any(
                is_local_axis_0(dep)
                for dep in get_dependencies(insn.expression))

            # A right-hand side that does not depend on the local-axis-0
            # iname is wrapped in broadcast().
            if not rhs_has_programindex:
                rhs_code = "broadcast(%s, 0)" % rhs_code

            from cgen import Statement
            return Statement(
                "streaming_store(%s + %s, %s)" %
                (access_info.array_name,
                 ecm(flattened_sum(new_terms), PREC_NONE, 'i'), rhs_code))

        # }}}

        from cgen import Assign
        return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
Exemplo n.º 10
0
    def map_subscript(self, expr, type_context):
        """Map the subscript *expr* to the expression used in generated code.

        A subscript whose aggregate is not a plain variable is mapped
        component-wise. Otherwise the array returned by ``self.find_array``
        selects the form: image arguments turn into a ``read_imagef`` call,
        array arguments, constant arguments and temporaries into a
        (possibly tagged) variable that is subscripted according to
        ``get_access_info``, with an optional vector access on top.

        :raises LoopyError: for images with an unsupported number of axes.
        :raises NotImplementedError: for images whose dtype is not float32,
            float64 or a vector dtype.
        """
        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            mapped_aggregate = self.rec(expr.aggregate, type_context)
            return mapped_aggregate[self.rec(expr.index, 'i')]

        def make_var(name):
            # Carry the tag of a tagged aggregate over to the new variable.
            from loopy import TaggedVariable
            aggregate = expr.aggregate
            if not isinstance(aggregate, TaggedVariable):
                return var(name)
            return TaggedVariable(name, aggregate.tag)

        ary = self.find_array(expr)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

        access_info = get_access_info(self.kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import (
                ImageArg, ArrayArg, TemporaryVariable, ConstantArg)

        if isinstance(ary, ImageArg):
            num_target_axes = ary.num_target_axes()

            # One or two axes use a float2 coordinate, three use a float4.
            if num_target_axes in (1, 2):
                vec_width = 2
            elif num_target_axes == 3:
                vec_width = 4
            else:
                raise LoopyError("unsupported number (%d) of target axes in image"
                        % num_target_axes)

            idx_vec_type = "float%d" % vec_width
            zero_padding = (0,) * (vec_width-num_target_axes)

            # Coordinates go in reversed axis order, padded with zeros.
            idx_tuple = expr.index_tuple[::-1] + zero_padding

            base_access = var("read_imagef")(
                    var(ary.name),
                    var("loopy_sampler"),
                    var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i')))

            if ary.dtype.numpy_dtype == np.float32:
                return base_access.attr("x")
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            if ary.dtype.numpy_dtype == np.float64:
                return var("as_double")(base_access.attr("xy"))

            raise NotImplementedError(
                    "non-floating-point images not supported for now")

        if isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)):
            array_var = make_var(access_info.array_name)

            if len(access_info.subscripts) == 0:
                # unsubscripted global args are pointers; unsubscripted temp
                # vars are scalars (unless they use base_storage)
                if (isinstance(ary, (ConstantArg, ArrayArg))
                        or (isinstance(ary, TemporaryVariable)
                            and ary.base_storage)):
                    result = array_var[0]
                else:
                    result = array_var

            else:
                subscript, = access_info.subscripts
                result = array_var[simplify_using_aff(
                    self.kernel, self.rec(subscript, 'i'))]

            if access_info.vector_index is None:
                return result

            return self.codegen_state.ast_builder.add_vector_access(
                result, access_info.vector_index)

        assert False
Exemplo n.º 11
0
    def map_subscript(self, expr):
        """Count the array access *expr*, classified by its access pattern.

        Accesses to anything that is not an ``lp.GlobalArg`` kernel argument
        are not counted themselves; only their index expressions are
        recursed into. For global arrays a :class:`ToCountMap` with one
        entry keyed by ``(self.type_inf(expr), kind)`` is added to the counts
        of the index expressions, where *kind* is

        * ``'uniform'`` if no iname in the index has a local index tag,
        * ``'nonconsecutive'`` if only local axes other than 0 occur, or if
          the local-axis-0 iname has a coefficient other than 1 or sits on
          an axis with a fixed stride other than 1,
        * ``'consecutive'`` otherwise.
        """
        array_name = expr.aggregate.name

        if array_name not in self.knl.arg_dict:
            # this is a temporary variable
            return self.rec(expr.index)

        array = self.knl.arg_dict[array_name]
        if not isinstance(array, lp.GlobalArg):
            # this array is not in global memory
            return self.rec(expr.index)

        # could be tuple or scalar index
        index = expr.index
        if not isinstance(index, tuple):
            index = (index,)

        from loopy.symbolic import (
                get_dependencies, CoefficientCollector, simplify_using_aff)
        from loopy.kernel.data import LocalIndexTag
        from loopy.kernel.array import FixedStrideArrayDimTag
        from pymbolic.primitives import Variable

        def tally(access_kind):
            # One access of the given kind plus whatever the index contains.
            return ToCountMap(
                    {(self.type_inf(expr), access_kind): 1}
                    ) + self.rec(expr.index)

        # Look for inames with a local index tag, in particular local axis 0.
        local_id0 = None
        local_id_found = False
        for iname in get_dependencies(index) & self.knl.all_inames():
            tag = self.knl.iname_to_tag.get(iname)
            if not isinstance(tag, LocalIndexTag):
                continue

            local_id_found = True
            if tag.axis == 0:
                local_id0 = iname
                break  # there will be only one local_id0

        if not local_id_found:
            return tally('uniform')

        if local_id0 is None:
            # only non-zero local id(s) found, assume non-consecutive access
            return tally('nonconsecutive')

        # check the coefficient of local_id0 on each axis
        lid0_var = Variable(local_id0)
        for idx, axis_tag in zip(index, array.dim_tags):
            coeffs = CoefficientCollector()(simplify_using_aff(self.knl, idx))

            try:
                coeff_id0 = coeffs[lid0_var]
            except KeyError:
                # this axis does not contain local_id0
                continue

            if coeff_id0 != 1:
                return tally('nonconsecutive')

            # Coefficient is 1; the access is still non-consecutive if the
            # axis has a fixed stride other than 1. Axes without a fixed
            # stride tag are skipped.
            if (isinstance(axis_tag, FixedStrideArrayDimTag)
                    and axis_tag.stride != 1):
                return tally('nonconsecutive')

        # no axis ruled it out: stride is 1 for every instance of local_id0
        return tally('consecutive')
Exemplo n.º 12
0
    def map_subscript(self, expr, enclosing_prec, type_context):
        """Render the subscript expression *expr* as a code string.

        If the aggregate is not a plain variable, the generic
        ``aggregate[index]`` form is produced. Otherwise the kind of the
        array returned by ``self.find_array`` decides: an image argument is
        read through ``read_imagef``; a global argument or temporary is
        rendered as a pointer dereference, a scalar name, or a subscript
        taken from ``get_access_info``, optionally with a vector access.

        :raises NotImplementedError: for images whose dtype is not float32,
            float64 or a vector dtype.
        :raises AssertionError: if the array is of none of the handled kinds.
        """
        def base_impl(expr, enclosing_prec, type_context):
            return self.parenthesize_if_needed(
                    "%s[%s]" % (
                        self.rec(expr.aggregate, PREC_CALL, type_context),
                        self.rec(expr.index, PREC_NONE, 'i')),
                    enclosing_prec, PREC_CALL)

        from pymbolic.primitives import Variable
        if not isinstance(expr.aggregate, Variable):
            return base_impl(expr, enclosing_prec, type_context)

        ary = self.find_array(expr)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

        access_info = get_access_info(self.kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable

        if isinstance(ary, ImageArg):
            # Iterate index_tuple rather than index so that a scalar
            # (non-tuple) index works; coordinates go in reversed axis order.
            base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))"
                    % (ary.name, ary.dimensions,
                        ", ".join(self.rec(idx, PREC_NONE, 'i')
                            for idx in expr.index_tuple[::-1])))

            if ary.dtype.numpy_dtype == np.float32:
                return base_access+".x"
            if self.kernel.target.is_vector_dtype(ary.dtype):
                return base_access
            elif ary.dtype.numpy_dtype == np.float64:
                return "as_double(%s.xy)" % base_access
            else:
                raise NotImplementedError(
                        "non-floating-point images not supported for now")

        elif isinstance(ary, (GlobalArg, TemporaryVariable)):
            if len(access_info.subscripts) == 0:
                if isinstance(ary, GlobalArg):
                    # unsubscripted global args are pointers
                    result = "*" + access_info.array_name

                else:
                    # unsubscripted temp vars are scalars
                    result = access_info.array_name

            else:
                subscript, = access_info.subscripts
                result = self.parenthesize_if_needed(
                        "%s[%s]" % (
                            access_info.array_name,
                            self.rec(subscript, PREC_NONE, 'i')),
                        enclosing_prec, PREC_CALL)

            if access_info.vector_index is not None:
                return self.codegen_state.ast_builder.add_vector_access(
                    result, access_info.vector_index)
            else:
                return result

        else:
            # An explicit raise is not stripped by "python -O".
            raise AssertionError(
                    "unexpected array type '%s' in subscript"
                    % type(ary).__name__)
Exemplo n.º 13
0
    def emit_assignment(self, codegen_state, insn):
        """Generate the code object for the assignment instruction *insn*.

        A regular assignment becomes a :class:`cgen.Assign`. An instruction
        tagged ``"!streaming_store"`` becomes a :class:`cgen.Statement`
        calling ``streaming_store``. For that form the flattened subscript
        must contain the iname tagged with local axis 0 exactly once, as a
        bare term; that term is left out of the address passed to
        ``streaming_store``.

        :arg codegen_state: provides the kernel, the expression-to-code
            mapper, the variable substitution map and vectorization info.
        :arg insn: the assignment instruction.
        :raises NotImplementedError: if *insn* is atomic.
        :raises LoopyError: if a streaming store targets an unsupported array
            type, has no single subscript, does not have stride 1 in the
            local-axis-0 iname, or uses a short-vector data type.
        """
        kernel = codegen_state.kernel
        ecm = codegen_state.expression_to_code_mapper

        assignee_var_name, = insn.assignee_var_names()

        lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
        lhs_dtype = lhs_var.dtype

        if insn.atomicity:
            raise NotImplementedError("atomic ops in ISPC")

        from loopy.expression import dtype_to_type_context
        from pymbolic.mapper.stringifier import PREC_NONE

        rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
        rhs_code = ecm(insn.expression, prec=PREC_NONE,
                    type_context=rhs_type_context,
                    needed_dtype=lhs_dtype)

        lhs = insn.assignee

        # {{{ handle streaming stores

        if "!streaming_store" in insn.tags:
            ary = ecm.find_array(lhs)

            from loopy.kernel.array import get_access_info
            from pymbolic import evaluate

            from loopy.symbolic import simplify_using_aff
            index_tuple = tuple(
                    simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

            access_info = get_access_info(kernel.target, ary, index_tuple,
                    lambda expr: evaluate(expr, codegen_state.var_subst_map),
                    codegen_state.vectorization_info)

            from loopy.kernel.data import ArrayArg, TemporaryVariable

            if not isinstance(ary, (ArrayArg, TemporaryVariable)):
                raise LoopyError("array type not supported in ISPC: %s"
                        % type(ary).__name__)

            if len(access_info.subscripts) != 1:
                raise LoopyError("streaming stores must have a subscript")
            subscript, = access_info.subscripts

            from pymbolic.primitives import Sum, flattened_sum, Variable
            if isinstance(subscript, Sum):
                terms = subscript.children
            else:
                # A subscript that is not a sum is itself the only term.
                terms = (subscript,)

            new_terms = []

            from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type
            from loopy.symbolic import get_dependencies

            saw_l0 = False
            for term in terms:
                if (isinstance(term, Variable)
                            and kernel.iname_tags_of_type(term.name, LocalIndexTag)):
                    tag, = kernel.iname_tags_of_type(
                        term.name, LocalIndexTag, min_num=1, max_num=1)
                    if tag.axis == 0:
                        # The bare local-axis-0 iname may occur only once; it
                        # is not part of the address handed to
                        # streaming_store.
                        if saw_l0:
                            raise LoopyError(
                                "streaming store must have stride 1 in "
                                "local index, got: %s" % subscript)
                        saw_l0 = True
                        continue

                    # An iname on another local axis is an ordinary address
                    # term and must be kept.
                    new_terms.append(term)
                else:
                    # Any other occurrence of the local-axis-0 iname (scaled,
                    # nested, ...) means the stride in it is not 1.
                    for dep in get_dependencies(term):
                        if filter_iname_tags_by_type(
                                kernel.iname_to_tags.get(dep, []), LocalIndexTag):
                            tag, = filter_iname_tags_by_type(
                                kernel.iname_to_tags.get(dep, []), LocalIndexTag, 1)
                            if tag.axis == 0:
                                raise LoopyError(
                                    "streaming store must have stride 1 in "
                                    "local index, got: %s" % subscript)

                    new_terms.append(term)

            if not saw_l0:
                raise LoopyError("streaming store must have stride 1 in "
                        "local index, got: %s" % subscript)

            if access_info.vector_index is not None:
                raise LoopyError("streaming store may not use a short-vector "
                        "data type")

            # The outer loop must come first so that ``dep`` is bound before
            # its tags are looked up. Dependencies need not be inames, hence
            # the lookup with a default (as for the subscript terms above).
            rhs_has_programindex = any(
                isinstance(tag, LocalIndexTag) and tag.axis == 0
                for dep in get_dependencies(insn.expression)
                for tag in kernel.iname_to_tags.get(dep, []))

            # A right-hand side that does not depend on the local-axis-0
            # iname is wrapped in broadcast().
            if not rhs_has_programindex:
                rhs_code = "broadcast(%s, 0)" % rhs_code

            from cgen import Statement
            return Statement(
                    "streaming_store(%s + %s, %s)"
                    % (
                        access_info.array_name,
                        ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
                        rhs_code))

        # }}}

        from cgen import Assign
        return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)