Пример #1
0
    def has_barrier_within(self):
        """
        Return a :class:`list` of booleans, one per schedule item.

        Entry *i* is *True* if the i-th schedule item is a
        :class:`loopy.schedule.Barrier`, or if it is a
        :class:`loopy.schedule.BeginBlockItem` and a barrier occurs between
        it and the end of its block. Every other entry is *False*.
        """
        schedule = self.kernel_proxy.schedule

        def block_contains_barrier(begin_idx):
            # TODO: calls to "gather_schedule_block" can be amortized
            _, block_end = gather_schedule_block(schedule, begin_idx)
            return any(
                isinstance(schedule[inner_idx], Barrier)
                for inner_idx in range(begin_idx + 1, block_end))

        return [
            block_contains_barrier(idx)
            if isinstance(item, BeginBlockItem)
            else isinstance(item, Barrier)
            for idx, item in enumerate(schedule)]
Пример #2
0
def generate_code_for_sched_index(codegen_state, sched_index):
    """Generate code for the single schedule item at *sched_index*.

    Dispatches on the type of ``codegen_state.kernel.schedule[sched_index]``:

    * ``CallKernel``: generates the program for the kernel block with
      device-code generation switched on and merges it with the kernel call
      produced by the AST builder.
    * ``EnterLoop``: selects a loop generator based on the iname's tag.
    * ``Barrier``: asks the AST builder to emit a barrier.
    * ``RunInstruction``: generates the instruction's code through
      ``codegen_state.try_vectorized``.

    :raises RuntimeError: for an ``EnterLoop`` whose iname carries a tag
        that none of the loop generators handles, or for any other
        schedule item type.
    """
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        # A kernel call may only be encountered while generating host code.
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
                                    get_insn_ids_for_block_at)
        _, block_end = gather_schedule_block(kernel.schedule, sched_index)
        assert block_end <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)
        all_data_info = codegen_state.implemented_data_info + extra_args

        device_state = codegen_state.copy(
            is_generating_device_code=True,
            gen_program_name=sched_item.kernel_name,
            schedule_index_end=block_end - 1,
            implemented_data_info=all_data_info)

        from loopy.codegen.result import generate_host_or_device_program
        program_result = generate_host_or_device_program(
            device_state, sched_index)

        block_insn_ids = get_insn_ids_for_block_at(kernel.schedule,
                                                   sched_index)
        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            block_insn_ids)

        kernel_call = codegen_state.ast_builder.get_kernel_call(
            codegen_state, sched_item.kernel_name, glob_grid, loc_grid,
            extra_args)

        return merge_codegen_results(codegen_state,
                                     [program_result, kernel_call])

    if isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (generate_unroll_loop,
                                        generate_vectorize_loop,
                                        generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                                       ForceSequentialTag, LoopedIlpTag,
                                       VectorizeTag)

        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            return generate_unroll_loop(codegen_state, sched_index)

        if isinstance(tag, VectorizeTag):
            return generate_vectorize_loop(codegen_state, sched_index)

        if tag is None or isinstance(tag,
                                     (LoopedIlpTag, ForceSequentialTag)):
            return generate_sequential_loop_dim_code(codegen_state,
                                                     sched_index)

        raise RuntimeError("encountered (invalid) EnterLoop "
                           "for '%s', tagged '%s'" %
                           (sched_item.iname, tag))

    if isinstance(sched_item, Barrier):
        builder = codegen_state.ast_builder
        return builder.emit_barrier(sched_item.kind, sched_item.comment)

    if isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code

        def gen_insn_code(inner_cgs):
            return generate_instruction_code(inner_cgs, insn)

        return codegen_state.try_vectorized(
            "instruction %s" % insn.id, gen_insn_code)

    raise RuntimeError("unexpected schedule item type: %s" %
                       type(sched_item))
Пример #3
0
def build_loop_nest(codegen_state, schedule_index):
    """Generate code for the sibling schedule items starting at
    *schedule_index*.

    The siblings are the schedule items from *schedule_index* up to (but not
    including) the next ``LeaveLoop`` or
    ``codegen_state.schedule_index_end``, where each nested
    ``EnterLoop``/``CallKernel`` block counts as a single item. Siblings
    that share bounds checks or predicates are grouped so that the
    corresponding conditional is emitted once around the whole group.

    :returns: the result of ``merge_codegen_results`` applied to the code
        generated for all groups.
    :raises RuntimeError: if an unexpected schedule item type is found
        while scanning for siblings.
    """
    # Most of the complexity of this function goes towards finding groups of
    # instructions that can be nested inside a shared conditional.

    kernel = codegen_state.kernel

    # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices

    # i.e. go up to the next LeaveLoop, and skip over inner loops.

    my_sched_indices = []

    i = schedule_index
    while i < codegen_state.schedule_index_end:
        sched_item = kernel.schedule[i]

        if isinstance(sched_item, LeaveLoop):
            break

        my_sched_indices.append(i)

        if isinstance(sched_item, (EnterLoop, CallKernel)):
            # Jump over the whole nested block in one step.
            _, i = gather_schedule_block(kernel.schedule, i)
            assert i <= codegen_state.schedule_index_end, \
                    "schedule block extends beyond schedule_index_end"

        elif isinstance(sched_item, Barrier):
            i += 1

        elif isinstance(sched_item, RunInstruction):
            i += 1
        else:
            raise RuntimeError("unexpected schedule item type: %s" %
                               type(sched_item))

    del i

    # }}}

    # {{{ pass 2: find admissible conditional inames for each sibling schedule item

    from pytools import Record

    class ScheduleIndexInfo(Record):
        """
        .. attribute:: schedule_indices
        .. attribute:: admissible_cond_inames
        .. attribute:: required_predicates
        .. attribute:: used_inames_within
        """

    from loopy.schedule import find_used_inames_within
    sched_index_info_entries = [
        ScheduleIndexInfo(
            schedule_indices=[i],
            admissible_cond_inames=(get_admissible_conditional_inames_for(
                codegen_state, i)),
            required_predicates=get_required_predicates(kernel, i),
            used_inames_within=find_used_inames_within(kernel, i))
        for i in my_sched_indices
    ]

    # Merge entries that agree on all three grouping criteria into a single
    # entry carrying the concatenated schedule indices.
    sched_index_info_entries = group_by(
        sched_index_info_entries,
        key=lambda sii: (sii.admissible_cond_inames, sii.required_predicates,
                         sii.used_inames_within),
        merge=lambda sii1, sii2: sii1.copy(schedule_indices=(
            sii1.schedule_indices + sii2.schedule_indices)))

    # }}}

    # {{{ pass 3: greedily group schedule items that share admissible inames

    from pytools import memoize_method

    class BoundsCheckCache:
        """Memoizing callable mapping a set of inames to the bounds checks
        obtained from ``get_bounds_checks`` for those inames."""

        def __init__(self, kernel, impl_domain):
            self.kernel = kernel
            self.impl_domain = impl_domain

        @memoize_method
        def __call__(self, check_inames):
            if not check_inames:
                return []

            domain = isl.align_spaces(
                self.kernel.get_inames_domain(check_inames),
                self.impl_domain,
                obj_bigger_ok=True)
            from loopy.codegen.bounds import get_bounds_checks
            return get_bounds_checks(
                domain,
                check_inames,
                self.impl_domain,

                # Each instruction individually gets its bounds checks,
                # so we can safely overapproximate here.
                overapproximate=True)

    def build_insn_group(sched_index_info_entries,
                         codegen_state,
                         done_group_lengths=frozenset()):
        """
        :arg sched_index_info_entries: a list of :class:`ScheduleIndexInfo`
            entries still to be processed.
        :arg done_group_lengths: A set of group lengths (integers) that grows
            from empty to include the longest found group and downwards with every
            recursive call.  It serves to prevent infinite recursion by preventing
            recursive calls from doing anything about groups that are too small.
            It is never modified in place; recursive calls receive a new set.
        :returns: a list of generated code results.
        """

        # The rough plan here is that build_insn_group starts out with the
        # entirety of the current schedule item's downward siblings (i.e. all
        # the ones up to the next LeaveLoop). It will then iterate upward to
        # find the largest usable conditional hoist group.
        #
        # It will then call itself recursively, telling its recursive instances
        # to ignore the hoist group it just found by adding that group length
        # to done_group_length. (It'll also chop the set of schedule indices
        # considered down so that a callee cannot find a *longer* hoist group.)
        #
        # Upon return the hoist is wrapped around the returned code and
        # build_insn_group calls itself for the remainder of schedule indices
        # that were not in the hoist group.

        if not sched_index_info_entries:
            return []

        origin_si_entry = sched_index_info_entries[0]
        current_iname_set = origin_si_entry.admissible_cond_inames
        # Predicates that are already implemented need not be checked again.
        current_pred_set = (origin_si_entry.required_predicates -
                            codegen_state.implemented_predicates)

        # {{{ grow schedule item group

        # Keep growing schedule item group as long as group fulfills minimum
        # size requirement.

        bounds_check_cache = BoundsCheckCache(kernel,
                                              codegen_state.implemented_domain)

        found_hoists = []

        candidate_group_length = 1
        while candidate_group_length <= len(sched_index_info_entries):
            if candidate_group_length in done_group_lengths:
                candidate_group_length += 1
                continue

            current_iname_set = (
                current_iname_set
                & sched_index_info_entries[candidate_group_length -
                                           1].admissible_cond_inames)
            current_pred_set = (
                current_pred_set
                & sched_index_info_entries[candidate_group_length -
                                           1].required_predicates)

            # {{{ see which inames are actually used in group

            # And only generate conditionals for those.
            used_inames = set()
            for sched_index_info_entry in \
                    sched_index_info_entries[0:candidate_group_length]:
                used_inames |= sched_index_info_entry.used_inames_within

            # }}}

            only_unshared_inames = kernel.remove_inames_for_shared_hw_axes(
                current_iname_set & used_inames)

            bounds_checks = bounds_check_cache(only_unshared_inames)

            if (bounds_checks  # found a bounds check
                    or current_pred_set or candidate_group_length == 1):
                # length-1 must always be an option to reach the recursion base
                # case below
                found_hoists.append(
                    (candidate_group_length, bounds_checks, current_pred_set))

            if not bounds_checks and not current_pred_set:
                # already no more checks possible, let's not waste time
                # checking longer groups.
                break

            candidate_group_length += 1

        # }}}

        # pick largest such group
        # (tuples compare by group length first, and each length is appended
        # at most once, so this selects the longest group found)
        group_length, bounds_checks, pred_checks = max(found_hoists)

        # Intersect all bounds-check constraints into a single set.
        check_set = None
        for cns in bounds_checks:
            cns_set = (isl.BasicSet.universe(
                cns.get_space()).add_constraint(cns))

            if check_set is None:
                check_set = cns_set
            else:
                check_set, cns_set = isl.align_two(check_set, cns_set)
                check_set = check_set.intersect(cns_set)

        if check_set is None:
            new_codegen_state = codegen_state
            is_empty = False
        else:
            is_empty = check_set.is_empty()
            new_codegen_state = codegen_state.intersect(check_set)

        if pred_checks:
            new_codegen_state = new_codegen_state.copy(
                implemented_predicates=new_codegen_state.implemented_predicates
                | pred_checks)

        if is_empty:
            # The checks can never be satisfied: emit nothing for this group.
            result = []
        else:
            if group_length == 1:
                # group only contains starting schedule item
                def gen_code(inner_codegen_state):
                    result = []
                    for i in origin_si_entry.schedule_indices:
                        inner = generate_code_for_sched_index(
                            inner_codegen_state, i)

                        if inner is not None:
                            result.append(inner)

                    return result

            else:
                # recurse with a bigger done_group_lengths
                def gen_code(inner_codegen_state):
                    return build_insn_group(
                        sched_index_info_entries[0:group_length],
                        inner_codegen_state,
                        done_group_lengths=(done_group_lengths
                                            | {group_length}))

            # gen_code returns a list

            if bounds_checks or pred_checks:
                from loopy.symbolic import constraint_to_expr

                prev_gen_code = gen_code

                def gen_code(inner_codegen_state):
                    condition_exprs = [
                        constraint_to_expr(cns) for cns in bounds_checks
                    ] + list(pred_checks)

                    prev_result = prev_gen_code(inner_codegen_state)

                    return [
                        wrap_in_if(
                            inner_codegen_state, condition_exprs,
                            merge_codegen_results(codegen_state, prev_result))
                    ]

                # A bounds check that involves the vectorization iname
                # prevents vectorizing this group.
                cannot_vectorize = False
                if new_codegen_state.vectorization_info is not None:
                    from loopy.isl_helpers import obj_involves_variable
                    for cond in bounds_checks:
                        if obj_involves_variable(
                                cond,
                                new_codegen_state.vectorization_info.iname):
                            cannot_vectorize = True
                            break

                if cannot_vectorize:

                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code returns a list, but this needs to return a
                        # GeneratedCode instance.
                        # NOTE(review): the list is passed through unchanged
                        # here, which does not match the comment above --
                        # confirm what unvectorize() expects.

                        return gen_code(inner_codegen_state)

                    result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                else:
                    result = gen_code(new_codegen_state)

            else:
                result = gen_code(new_codegen_state)

        # Handle the entries that were not part of the hoist group, starting
        # over with the original codegen state and no excluded lengths.
        return result + build_insn_group(
            sched_index_info_entries[group_length:], codegen_state)

    # }}}

    insn_group = build_insn_group(sched_index_info_entries, codegen_state)
    return merge_codegen_results(codegen_state, insn_group)
Пример #4
0
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    """Verify that every non-boostable instruction in a kernel chunk uses
    exactly the hardware axes available to it.

    With *sched_index* equal to ``None`` the whole schedule is scanned and
    no group or local axes are considered available. Otherwise
    *sched_index* must refer to a ``CallKernel`` item; the available axes
    are then the indices of the group and local grid sizes of that block.
    Nested ``CallKernel`` items are checked recursively.

    :returns: the schedule index just past the scanned chunk.
    :raises LoopyError: if an instruction's used group or local axes differ
        from the available ones, or if an auto local tag is encountered.
    :raises TypeError: if a schedule item of an unknown type is found.
    """
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is not None:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        block_insn_ids = get_insn_ids_for_block_at(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                block_insn_ids)

        available_group_axes = {ax for ax, _ in enumerate(group_size)}
        available_local_axes = {ax for ax, _ in enumerate(local_size)}

        pos = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        # Stop in front of the closing ReturnFromKernel.
        stop = past_end_i - 1
    else:
        available_group_axes = set()
        available_local_axes = set()

        pos = 0
        past_end_i = len(kernel.schedule)
        stop = past_end_i

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
                        GroupIndexTag)

    while pos < stop:
        item = kernel.schedule[pos]

        if isinstance(item, CallKernel):
            # The recursive call returns the index past the nested chunk.
            pos = _check_for_unused_hw_axes_in_kernel_chunk(kernel, pos)
            continue

        if not isinstance(item, RunInstruction):
            if isinstance(item, (Barrier, EnterLoop, LeaveLoop)):
                pos += 1
                continue

            raise TypeError(
                    "schedule item not understood: %s" % type(item).__name__)

        insn = kernel.id_to_insn[item.insn_id]
        pos += 1

        if insn.boostable:
            continue

        used_group_axes = set()
        used_local_axes = set()

        for iname in kernel.insn_inames(insn):
            ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
            gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
            altags = kernel.iname_tags_of_type(
                    iname, AutoLocalIndexTagBase, max_num=1)

            if ltags:
                (ltag,) = ltags
                used_local_axes.add(ltag.axis)
            elif gtags:
                (gtag,) = gtags
                used_group_axes.add(gtag.axis)
            elif altags:
                raise LoopyError("auto local tag encountered")

        if available_group_axes != used_group_axes:
            raise LoopyError("instruction '%s' does not use all group hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(map(str, available_group_axes)),
                        ",".join(map(str, used_group_axes))))

        if available_local_axes != used_local_axes:
            raise LoopyError("instruction '%s' does not use all local hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(map(str, available_local_axes)),
                        ",".join(map(str, used_local_axes))))

    return past_end_i
Пример #5
0
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    """Verify that every instruction in a kernel chunk uses exactly the
    hardware axes available to it.

    With *sched_index* equal to ``None`` the whole schedule is scanned and
    no group or local axes are considered available. Otherwise
    *sched_index* must refer to a ``CallKernel`` item; the available axes
    are then the indices of the group and local grid sizes of that block.
    Nested ``CallKernel`` items are checked recursively.

    A mismatch for an instruction whose id is in the set returned by
    ``_find_boostable_insn_ids`` only produces a ``DeprecationWarning``;
    for any other instruction it is an error.

    :returns: the schedule index just past the scanned chunk.
    :raises LoopyError: on an axis mismatch for a non-boostable instruction,
        or if an auto local tag is encountered.
    :raises TypeError: if a schedule item of an unknown type is found.
    """
    from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop,
                                LeaveLoop, ReturnFromKernel,
                                get_insn_ids_for_block_at,
                                gather_schedule_block)

    boostable_insn_ids = _find_boostable_insn_ids(kernel)

    if sched_index is not None:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        block_insn_ids = get_insn_ids_for_block_at(kernel.schedule,
                                                   sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            block_insn_ids)

        available_group_axes = {ax for ax, _ in enumerate(group_size)}
        available_local_axes = {ax for ax, _ in enumerate(local_size)}

        pos = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        # Stop in front of the closing ReturnFromKernel.
        stop = past_end_i - 1
    else:
        available_group_axes = set()
        available_local_axes = set()

        pos = 0
        past_end_i = len(kernel.schedule)
        stop = past_end_i

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
                                   GroupIndexTag)

    while pos < stop:
        item = kernel.schedule[pos]

        if isinstance(item, CallKernel):
            # The recursive call returns the index past the nested chunk.
            pos = _check_for_unused_hw_axes_in_kernel_chunk(kernel, pos)
            continue

        if not isinstance(item, RunInstruction):
            if isinstance(item, (Barrier, EnterLoop, LeaveLoop)):
                pos += 1
                continue

            raise TypeError("schedule item not understood: %s" %
                            type(item).__name__)

        insn = kernel.id_to_insn[item.insn_id]
        pos += 1

        used_group_axes = set()
        used_local_axes = set()

        for iname in insn.within_inames:
            ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
            gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
            altags = kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase,
                                               max_num=1)

            if ltags:
                (ltag,) = ltags
                used_local_axes.add(ltag.axis)
            elif gtags:
                (gtag,) = gtags
                used_group_axes.add(gtag.axis)
            elif altags:
                raise LoopyError("auto local tag encountered")

        if available_group_axes != used_group_axes:
            group_report = (insn.id,
                            ",".join(map(str, available_group_axes)),
                            ",".join(map(str, used_group_axes)))

            if insn.id not in boostable_insn_ids:
                raise LoopyError(
                    "instruction '%s' does not use all group"
                    " hw axes (available: %s used:%s)" % group_report)

            warn("instruction '%s' does not use all group hw axes"
                 " (available: %s used:%s). Loopy will generate code"
                 " with the instruction executed along all the"
                 " missing hw axes. This will result in an"
                 " error from 2021.x onwards, calling"
                 " loopy.add_inames_for_unused_hw_axes(...)"
                 " might help in the transition." % group_report,
                 DeprecationWarning,
                 stacklevel=2)

        if available_local_axes != used_local_axes:
            local_report = (insn.id,
                            ",".join(map(str, available_local_axes)),
                            ",".join(map(str, used_local_axes)))

            if insn.id not in boostable_insn_ids:
                raise LoopyError(
                    "instruction '%s' does not use all local"
                    " hw axes (available: %s used:%s)" % local_report)

            warn("instruction '%s' does not use all local hw axes"
                 " (available: %s used:%s). Loopy will generate code"
                 " with the instruction executed along all the"
                 " missing hw axes. This will result in an"
                 " error from 2021.x onwards, calling"
                 " loopy.add_inames_for_unused_hw_axes(...)"
                 " might help in the transition." % local_report,
                 DeprecationWarning,
                 stacklevel=2)

    return past_end_i
Пример #6
0
def generate_code_for_sched_index(codegen_state, sched_index):
    """Generate code for the single schedule item at *sched_index*.

    Dispatches on the type of ``codegen_state.kernel.schedule[sched_index]``:

    * ``CallKernel``: generates the program for the kernel block with
      device-code generation switched on and merges it with the kernel call
      produced by the AST builder.
    * ``EnterLoop``: selects a loop generator based on the iname's tag.
    * ``Barrier``: in device code, emits the barrier through the AST builder
      (wrapped in a ``CodeGenerationResult`` when the barrier has an
      originating instruction id). In host code, ``"global"`` and
      ``"local"`` barriers yield an empty ``CodeGenerationResult``.
    * ``RunInstruction``: generates the instruction's code through
      ``codegen_state.try_vectorized``.

    :raises RuntimeError: for an ``EnterLoop`` whose iname carries a tag
        that none of the loop generators handles, or for any other
        schedule item type.
    :raises LoopyError: for a host-side barrier whose synchronization kind
        is neither ``"global"`` nor ``"local"``.
    """
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        # A kernel call may only be encountered while generating host code.
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
                                    get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
            is_generating_device_code=True,
            gen_program_name=sched_item.kernel_name,
            schedule_index_end=past_end_i - 1,
            implemented_data_info=(codegen_state.implemented_data_info +
                                   extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
            new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,
            codegen_state.ast_builder.get_kernel_call(
                codegen_state, sched_item.kernel_name, glob_grid, loc_grid,
                extra_args),
        ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (generate_unroll_loop,
                                        generate_vectorize_loop,
                                        generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                                       ForceSequentialTag, LoopedIlpTag,
                                       VectorizeTag,
                                       InOrderSequentialSequentialTag)
        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag,
                                             InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                               "for '%s', tagged '%s'" %
                               (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                sched_item.synchronization_kind, sched_item.mem_kind,
                sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                    codegen_state, sched_item.originating_insn_id, barrier_ast,
                    codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ("global", "local"):
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                    host_program=None,
                    device_programs=[],
                    implemented_domains={},
                    implemented_data_info=codegen_state.implemented_data_info)

            else:
                # Adjacent literals are concatenated verbatim, so the space
                # before "in host code" has to be spelled out.
                raise LoopyError("do not know how to emit code for barrier "
                                 "synchronization kind '%s' "
                                 "in host code" %
                                 sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
            "instruction %s" % insn.id,
            lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s" %
                           type(sched_item))
Пример #7
0
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    """Verify that every non-boostable instruction in a kernel chunk uses
    exactly the hardware axes available to it.

    With *sched_index* equal to ``None`` the whole schedule is scanned and
    no group or local axes are considered available. Otherwise
    *sched_index* must refer to a ``CallKernel`` item; the available axes
    are then the indices of the group and local grid sizes of that block.
    Nested ``CallKernel`` items are checked recursively.

    :returns: the schedule index just past the scanned chunk.
    :raises LoopyError: if an instruction's used group or local axes differ
        from the available ones, or if an auto local tag is encountered.
    :raises TypeError: if a schedule item of an unknown type is found.
    """
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is not None:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        block_insn_ids = get_insn_ids_for_block_at(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                block_insn_ids)

        available_group_axes = {ax for ax, _ in enumerate(group_size)}
        available_local_axes = {ax for ax, _ in enumerate(local_size)}

        pos = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        # Stop in front of the closing ReturnFromKernel.
        stop = past_end_i - 1
    else:
        available_group_axes = set()
        available_local_axes = set()

        pos = 0
        past_end_i = len(kernel.schedule)
        stop = past_end_i

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag

    while pos < stop:
        item = kernel.schedule[pos]

        if isinstance(item, CallKernel):
            # The recursive call returns the index past the nested chunk.
            pos = _check_for_unused_hw_axes_in_kernel_chunk(kernel, pos)
            continue

        if not isinstance(item, RunInstruction):
            if isinstance(item, (Barrier, EnterLoop, LeaveLoop)):
                pos += 1
                continue

            raise TypeError(
                    "schedule item not understood: %s" % type(item).__name__)

        insn = kernel.id_to_insn[item.insn_id]
        pos += 1

        if insn.boostable:
            continue

        used_group_axes = set()
        used_local_axes = set()

        for iname in kernel.insn_inames(insn):
            iname_tag = kernel.iname_to_tag.get(iname)

            if isinstance(iname_tag, LocalIndexTag):
                used_local_axes.add(iname_tag.axis)
            elif isinstance(iname_tag, GroupIndexTag):
                used_group_axes.add(iname_tag.axis)
            elif isinstance(iname_tag, AutoLocalIndexTagBase):
                raise LoopyError("auto local tag encountered")

        if available_group_axes != used_group_axes:
            raise LoopyError("instruction '%s' does not use all group hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(map(str, available_group_axes)),
                        ",".join(map(str, used_group_axes))))

        if available_local_axes != used_local_axes:
            raise LoopyError("instruction '%s' does not use all local hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(map(str, available_local_axes)),
                        ",".join(map(str, used_local_axes))))

    return past_end_i
Пример #8
0
def generate_code_for_sched_index(codegen_state, sched_index):
    """Generate code for the single schedule item at *sched_index*.

    The item ``codegen_state.kernel.schedule[sched_index]`` is dispatched on
    its type:

    * ``CallKernel``: only allowed while *not* generating device code.
      The enclosed block is generated with a copy of *codegen_state* that has
      ``is_generating_device_code=True``, and the result is merged with the
      kernel call obtained from ``ast_builder.get_kernel_call``.
    * ``EnterLoop``: the iname's tags select one of the unroll, vectorize
      or sequential loop generators from :mod:`loopy.codegen.loop`.
    * ``Barrier``: in device code, the barrier AST from
      ``ast_builder.emit_barrier`` is returned (wrapped in a
      ``CodeGenerationResult`` when the barrier has an originating
      instruction id).  In host code, "global" and "local" barriers yield an
      empty ``CodeGenerationResult``.
    * ``RunInstruction``: the instruction's code is generated through
      ``codegen_state.try_vectorized``.

    :arg codegen_state: the current code generation state; this function
        reads its ``kernel``, ``ast_builder``, ``schedule_index_end``,
        ``implemented_data_info``, ``implemented_domain`` and
        ``is_generating_device_code`` attributes.
    :arg sched_index: index into ``codegen_state.kernel.schedule``.
    :returns: whatever the selected generator returns (see above).
    :raises LoopyError: for a host-side barrier whose synchronization kind is
        neither "global" nor "local".
    :raises RuntimeError: for an ``EnterLoop`` whose tags match none of the
        known loop kinds, or for an unknown schedule item type.
    """
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        # State used for the body of the called kernel: it ends one item
        # before the block's past-the-end index and additionally sees the
        # synthesized extra arguments.
        new_codegen_state = codegen_state.copy(
                is_generating_device_code=True,
                gen_program_name=sched_item.kernel_name,
                schedule_index_end=past_end_i-1,
                implemented_data_info=(codegen_state.implemented_data_info
                    + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
                new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        # Note that the call itself is built with the *original* state, not
        # the copy used for the kernel body.
        return merge_codegen_results(codegen_state, [
            codegen_result,

            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name,
                glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tags = kernel.iname_tags(sched_item.iname)
        # Drop falsy entries (e.g. None) so that "not tags" below means
        # "this iname is untagged".
        tags = tuple(tag for tag in tags if tag)

        from loopy.codegen.loop import (
                generate_unroll_loop,
                generate_vectorize_loop,
                generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                ForceSequentialTag, LoopedIlpTag, VectorizeTag,
                InOrderSequentialSequentialTag, filter_iname_tags_by_type)
        if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif filter_iname_tags_by_type(tags, VectorizeTag):
            func = generate_vectorize_loop
        elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag,
                    ForceSequentialTag, InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                    "for '%s', tagged '%s'"
                    % (sched_item.iname, ", ".join(str(tag) for tag in tags)))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                    sched_item.synchronization_kind, sched_item.mem_kind,
                    sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                        codegen_state,
                        sched_item.originating_insn_id,
                        barrier_ast,
                        codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ["global", "local"]:
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                        host_program=None,
                        device_programs=[],
                        implemented_domains={},
                        implemented_data_info=codegen_state.implemented_data_info)

            else:
                # The literals are concatenated by the parser, so the space
                # before "in host code" has to be part of the string itself.
                raise LoopyError("do not know how to emit code for barrier "
                                 "synchronization kind '%s' in host code"
                                 % sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
                "instruction %s" % insn.id,
                lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))
Пример #9
0
def build_loop_nest(codegen_state, schedule_index):
    """Generate code for the schedule items starting at *schedule_index*.

    The items considered are *schedule_index* and its following siblings, up
    to (but excluding) the next ``LeaveLoop`` or
    ``codegen_state.schedule_index_end``, whichever comes first.  Nested
    ``EnterLoop``/``CallKernel`` blocks count as a single sibling.

    If the AST builder can implement conditionals, consecutive siblings that
    share bounds checks or predicates are grouped so that the shared
    condition is emitted once around the whole group.  Otherwise only the
    item at *schedule_index* is generated.

    :arg codegen_state: the current code generation state.
    :arg schedule_index: index into ``codegen_state.kernel.schedule`` of the
        first item to generate code for.
    :returns: the result of ``merge_codegen_results`` over the generated
        pieces.
    :raises RuntimeError: if an unknown schedule item type is encountered
        while scanning for siblings.
    """
    # Most of the complexity of this function goes towards finding groups of
    # instructions that can be nested inside a shared conditional.

    kernel = codegen_state.kernel

    # If the AST builder does not implement conditionals, we can save us
    # some work about hoisting conditionals and directly go into recursion.
    if not codegen_state.ast_builder.can_implement_conditionals:
        result = []
        inner = generate_code_for_sched_index(codegen_state, schedule_index)
        if inner is not None:
            result.append(inner)
        return merge_codegen_results(codegen_state, result)

    # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices

    # i.e. go up to the next LeaveLoop, and skip over inner loops.

    my_sched_indices = []

    i = schedule_index
    while i < codegen_state.schedule_index_end:
        sched_item = kernel.schedule[i]

        if isinstance(sched_item, LeaveLoop):
            break

        my_sched_indices.append(i)

        if isinstance(sched_item, (EnterLoop, CallKernel)):
            # Jump to the index that gather_schedule_block reports for the
            # end of this block, so that its contents are not siblings.
            _, i = gather_schedule_block(kernel.schedule, i)
            assert i <= codegen_state.schedule_index_end, \
                    "schedule block extends beyond schedule_index_end"

        elif isinstance(sched_item, Barrier):
            i += 1

        elif isinstance(sched_item, RunInstruction):
            i += 1
        else:
            raise RuntimeError("unexpected schedule item type: %s"
                    % type(sched_item))

    del i

    # }}}

    # {{{ pass 2: find admissible conditional inames for each sibling schedule item

    from pytools import ImmutableRecord

    class ScheduleIndexInfo(ImmutableRecord):
        """
        .. attribute:: schedule_indices
        .. attribute:: admissible_cond_inames
        .. attribute:: required_predicates
        .. attribute:: used_inames_within
        """

    from loopy.schedule import find_used_inames_within
    sched_index_info_entries = [
            ScheduleIndexInfo(
                schedule_indices=[i],
                admissible_cond_inames=(
                    get_admissible_conditional_inames_for(codegen_state, i)),
                required_predicates=get_required_predicates(kernel, i),
                used_inames_within=find_used_inames_within(kernel, i)
                )
            for i in my_sched_indices
            ]

    # Entries with the same key are merged by concatenating their
    # schedule_indices lists.
    sched_index_info_entries = group_by(
            sched_index_info_entries,
            key=lambda sii: (
                sii.admissible_cond_inames,
                sii.required_predicates,
                sii.used_inames_within),
            merge=lambda sii1, sii2: sii1.copy(
                schedule_indices=(
                    sii1.schedule_indices
                    +
                    sii2.schedule_indices)))

    # }}}

    # {{{ pass 3: greedily group schedule items that share admissible inames

    from pytools import memoize_method

    class BoundsCheckCache:
        """Memoized computation of bounds checks for a set of inames,
        relative to a fixed implemented domain."""

        def __init__(self, kernel, impl_domain):
            self.kernel = kernel
            self.impl_domain = impl_domain

        @memoize_method
        def __call__(self, check_inames):
            if not check_inames:
                return []

            domain = isl.align_spaces(
                    self.kernel.get_inames_domain(check_inames),
                    self.impl_domain, obj_bigger_ok=True)
            from loopy.codegen.bounds import get_approximate_convex_bounds_checks
            # Each instruction individually gets its bounds checks,
            # so we can safely overapproximate here.
            return get_approximate_convex_bounds_checks(domain,
                    check_inames, self.impl_domain)

    def build_insn_group(sched_index_info_entries, codegen_state,
            done_group_lengths=frozenset()):
        """
        :arg sched_index_info_entries: the :class:`ScheduleIndexInfo` entries
            still to be generated, in schedule order.
        :arg codegen_state: the code generation state under which the entries
            are generated.
        :arg done_group_lengths: A set of group lengths (integers) that grows
            from empty to include the longest found group and downwards with every
            recursive call.  It serves to prevent infinite recursion by preventing
            recursive calls from doing anything about groups that are too small.
            It is never modified in place; the default is an immutable
            empty set.
        :returns: a list of generated code pieces.
        """

        from loopy.symbolic import get_dependencies

        # The rough plan here is that build_insn_group starts out with the
        # entirety of the current schedule item's downward siblings (i.e. all
        # the ones up to the next LeaveLoop). It will then iterate upward to
        # find the largest usable conditional hoist group.
        #
        # It will then call itself recursively, telling its recursive instances
        # to ignore the hoist group it just found by adding that group length
        # to done_group_length. (It'll also chop the set of schedule indices
        # considered down so that a callee cannot find a *longer* hoist group.)
        #
        # Upon return the hoist is wrapped around the returned code and
        # build_insn_group calls itself for the remainder of schedule indices
        # that were not in the hoist group.

        if not sched_index_info_entries:
            return []

        origin_si_entry = sched_index_info_entries[0]
        current_iname_set = origin_si_entry.admissible_cond_inames
        # Predicates that are already implemented by the surrounding code do
        # not need to be checked again.
        current_pred_set = (origin_si_entry.required_predicates
                - codegen_state.implemented_predicates)

        # {{{ grow schedule item group

        # Keep growing schedule item group as long as group fulfills minimum
        # size requirement.

        bounds_check_cache = BoundsCheckCache(
                kernel, codegen_state.implemented_domain)

        # List of (group_length, bounds_checks, pred_checks) candidates.
        found_hoists = []

        candidate_group_length = 1
        while candidate_group_length <= len(sched_index_info_entries):
            if candidate_group_length in done_group_lengths:
                candidate_group_length += 1
                continue

            # Only inames/predicates common to *all* entries of the candidate
            # group can be hoisted around it.
            current_iname_set = (
                    current_iname_set
                    & sched_index_info_entries[candidate_group_length-1]
                    .admissible_cond_inames)
            current_pred_set = (
                    current_pred_set
                    & sched_index_info_entries[candidate_group_length-1]
                    .required_predicates)

            # Keep only predicates whose iname dependencies are all
            # admissible at this point.
            current_pred_set = frozenset(
                    pred for pred in current_pred_set
                    if get_dependencies(pred) & kernel.all_inames()
                    <= current_iname_set)

            # {{{ see which inames are actually used in group

            # And only generate conditionals for those.
            used_inames = set()
            for sched_index_info_entry in \
                    sched_index_info_entries[0:candidate_group_length]:
                used_inames |= sched_index_info_entry.used_inames_within

            # }}}

            only_unshared_inames = kernel._remove_inames_for_shared_hw_axes(
                    current_iname_set & used_inames)

            bounds_checks = bounds_check_cache(only_unshared_inames)

            if (bounds_checks  # found a bounds check
                    or current_pred_set
                    or candidate_group_length == 1):
                # length-1 must always be an option to reach the recursion base
                # case below
                found_hoists.append((candidate_group_length,
                    bounds_checks, current_pred_set))

            if not bounds_checks and not current_pred_set:
                # already no more checks possible, let's not waste time
                # checking longer groups.
                break

            candidate_group_length += 1

        # }}}

        # pick largest such group
        # (each length is appended at most once, so the tuple comparison is
        # decided by the first element)
        group_length, bounds_checks, pred_checks = max(found_hoists)

        # Intersect all hoisted bounds checks into a single set.
        check_set = None
        for cns in bounds_checks:
            cns_set = (isl.BasicSet.universe(cns.get_space())
                    .add_constraint(cns))

            if check_set is None:
                check_set = cns_set
            else:
                check_set, cns_set = isl.align_two(check_set, cns_set)
                check_set = check_set.intersect(cns_set)

        if check_set is None:
            new_codegen_state = codegen_state
            is_empty = False
        else:
            is_empty = check_set.is_empty()
            new_codegen_state = codegen_state.intersect(check_set)

        if pred_checks:
            new_codegen_state = new_codegen_state.copy(
                    implemented_predicates=new_codegen_state.implemented_predicates
                    | pred_checks)

        if is_empty:
            # The hoisted condition can never hold: no code for this group.
            result = []
        else:
            if group_length == 1:
                # group only contains starting schedule item
                def gen_code(inner_codegen_state):
                    result = []
                    for i in origin_si_entry.schedule_indices:
                        inner = generate_code_for_sched_index(
                            inner_codegen_state, i)

                        if inner is not None:
                            result.append(inner)

                    return result

            else:
                # recurse with a bigger done_group_lengths
                def gen_code(inner_codegen_state):
                    return build_insn_group(
                            sched_index_info_entries[0:group_length],
                            inner_codegen_state,
                            done_group_lengths=(
                                done_group_lengths | {group_length}))

            # gen_code returns a list

            if bounds_checks or pred_checks:
                from loopy.symbolic import constraint_to_cond_expr

                prev_gen_code = gen_code

                def gen_code(inner_codegen_state):  # noqa pylint:disable=function-redefined
                    condition_exprs = [
                            constraint_to_cond_expr(cns)
                            for cns in bounds_checks] + list(pred_checks)

                    prev_result = prev_gen_code(inner_codegen_state)

                    # NOTE(review): the merge uses the enclosing
                    # codegen_state rather than inner_codegen_state; kept
                    # as in the original -- confirm this is intended.
                    return [wrap_in_if(
                        inner_codegen_state,
                        condition_exprs,
                        merge_codegen_results(codegen_state, prev_result))]

                # A bounds check that involves the vectorization iname
                # prevents generating this group in vectorized form.
                cannot_vectorize = False
                if new_codegen_state.vectorization_info is not None:
                    from loopy.isl_helpers import obj_involves_variable
                    for cond in bounds_checks:
                        if obj_involves_variable(
                                cond,
                                new_codegen_state.vectorization_info.iname):
                            cannot_vectorize = True
                            break

                if cannot_vectorize:
                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code returns a list, but this needs to return a
                        # GeneratedCode instance.
                        # NOTE(review): the list is forwarded unchanged;
                        # presumably unvectorize() copes with that -- confirm.

                        return gen_code(inner_codegen_state)

                    result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                else:
                    result = gen_code(new_codegen_state)

            else:
                result = gen_code(new_codegen_state)

        # Continue with the entries after the hoist group, using the
        # un-narrowed state and a fresh (empty) done_group_lengths.
        return result + build_insn_group(
                sched_index_info_entries[group_length:], codegen_state)

    # }}}

    insn_group = build_insn_group(sched_index_info_entries, codegen_state)
    return merge_codegen_results(
            codegen_state,
            insn_group)
Пример #10
0
def generate_code_for_sched_index(codegen_state, sched_index):
    """Generate code for the schedule item found at *sched_index*.

    Dispatches on the type of ``codegen_state.kernel.schedule[sched_index]``:

    * ``CallKernel``: generates the enclosed block with a state copy that
      has ``is_generating_device_code=True`` and merges it with the kernel
      call built by ``ast_builder.get_kernel_call``.
    * ``EnterLoop``: picks the unroll, vectorize or sequential loop
      generator according to the iname's tag.
    * ``Barrier``: returns ``ast_builder.emit_barrier(kind, comment)``.
    * ``RunInstruction``: generates the instruction through
      ``codegen_state.try_vectorized``.

    :raises RuntimeError: for an ``EnterLoop`` with an unsupported tag or
        for an unknown schedule item type.
    """
    knl = codegen_state.kernel
    item = knl.schedule[sched_index]

    # {{{ kernel call

    if isinstance(item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at)
        _block, block_end = gather_schedule_block(knl.schedule, sched_index)
        assert block_end <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(knl, sched_index)

        # The kernel body is generated under its own state, which also
        # carries the synthesized extra arguments.
        device_state = codegen_state.copy(
                is_generating_device_code=True,
                gen_program_name=item.kernel_name,
                schedule_index_end=block_end-1,
                implemented_data_info=(
                    codegen_state.implemented_data_info + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        device_result = generate_host_or_device_program(device_state, sched_index)

        block_insn_ids = get_insn_ids_for_block_at(knl.schedule, sched_index)
        glob_grid, loc_grid = knl.get_grid_sizes_for_insn_ids_as_exprs(
                block_insn_ids)

        kernel_call = codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                item.kernel_name,
                glob_grid, loc_grid,
                extra_args)

        return merge_codegen_results(codegen_state, [device_result, kernel_call])

    # }}}

    # {{{ loop entry

    if isinstance(item, EnterLoop):
        tag = knl.iname_to_tag.get(item.iname)

        from loopy.codegen.loop import (
                generate_unroll_loop,
                generate_vectorize_loop,
                generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
                LoopedIlpTag, VectorizeTag)

        # Checked in this order; the first matching entry wins.
        tag_dispatch = (
                ((UnrollTag, UnrolledIlpTag), generate_unroll_loop),
                ((VectorizeTag,), generate_vectorize_loop),
                ((LoopedIlpTag, ForceSequentialTag),
                    generate_sequential_loop_dim_code),
                )

        if tag is None:
            # untagged inames become plain sequential loops
            loop_generator = generate_sequential_loop_dim_code
        else:
            for tag_types, candidate in tag_dispatch:
                if isinstance(tag, tag_types):
                    loop_generator = candidate
                    break
            else:
                raise RuntimeError("encountered (invalid) EnterLoop "
                        "for '%s', tagged '%s'" % (item.iname, tag))

        return loop_generator(codegen_state, sched_index)

    # }}}

    if isinstance(item, Barrier):
        builder = codegen_state.ast_builder
        return builder.emit_barrier(item.kind, item.comment)

    if isinstance(item, RunInstruction):
        insn = knl.id_to_insn[item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code

        def generate_for_insn(inner_cgs):
            return generate_instruction_code(inner_cgs, insn)

        description = "instruction %s" % insn.id
        return codegen_state.try_vectorized(description, generate_for_insn)

    raise RuntimeError("unexpected schedule item type: %s"
            % type(item))