Example #1
    def get_insn_ids_for_block_at(self, sched_index):
        """
        Cached variant of :func:`loopy.schedule.get_insn_ids_for_block_at`.
        """
        from loopy.schedule import get_insn_ids_for_block_at
        return get_insn_ids_for_block_at(self.kernel_proxy.schedule,
                                         sched_index)
Example #2
File: cuda.py Project: connorjward/loopy
    def get_function_declaration(self, codegen_state, codegen_result,
                                 schedule_index):
        fdecl = super().get_function_declaration(codegen_state, codegen_result,
                                                 schedule_index)

        from loopy.target.c import FunctionDeclarationWrapper
        assert isinstance(fdecl, FunctionDeclarationWrapper)
        fdecl = fdecl.subdecl

        from cgen.cuda import CudaGlobal, CudaLaunchBounds
        fdecl = CudaGlobal(fdecl)

        if self.target.extern_c:
            from cgen import Extern
            fdecl = Extern("C", fdecl)

        from loopy.schedule import get_insn_ids_for_block_at
        _, local_grid_size = \
                codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                        get_insn_ids_for_block_at(
                            codegen_state.kernel.linearization, schedule_index),
                        codegen_state.callables_table)

        from loopy.symbolic import get_dependencies
        if not get_dependencies(local_grid_size):
            # Sizes can't have parameter dependencies if they are
            # to be used in static thread block size.
            from pytools import product
            nthreads = product(local_grid_size)

            fdecl = CudaLaunchBounds(nthreads, fdecl)

        return FunctionDeclarationWrapper(fdecl)
Example #3
File: cuda.py Project: dokempf/loopy
    def get_function_declaration(self, codegen_state, codegen_result,
                                 schedule_index):
        fdecl = super(CUDACASTBuilder,
                      self).get_function_declaration(codegen_state,
                                                     codegen_result,
                                                     schedule_index)

        from cgen.cuda import CudaGlobal, CudaLaunchBounds
        fdecl = CudaGlobal(fdecl)

        if self.target.extern_c:
            from cgen import Extern
            fdecl = Extern("C", fdecl)

        from loopy.schedule import get_insn_ids_for_block_at
        _, local_grid_size = \
                codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                        get_insn_ids_for_block_at(
                            codegen_state.kernel.schedule, schedule_index))

        from loopy.symbolic import get_dependencies
        if not get_dependencies(local_grid_size):
            # Sizes can't have parameter dependencies if they are
            # to be used in static thread block size.
            from pytools import product
            nthreads = product(local_grid_size)

            fdecl = CudaLaunchBounds(nthreads, fdecl)

        return fdecl
Example #4
File: cuda.py Project: dokempf/loopy
    def get_function_declaration(self, codegen_state, codegen_result,
            schedule_index):
        fdecl = super(CUDACASTBuilder, self).get_function_declaration(
                codegen_state, codegen_result, schedule_index)

        from cgen.cuda import CudaGlobal, CudaLaunchBounds
        fdecl = CudaGlobal(fdecl)

        if self.target.extern_c:
            from cgen import Extern
            fdecl = Extern("C", fdecl)

        from loopy.schedule import get_insn_ids_for_block_at
        _, local_grid_size = \
                codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                        get_insn_ids_for_block_at(
                            codegen_state.kernel.schedule, schedule_index))

        from loopy.symbolic import get_dependencies
        if not get_dependencies(local_grid_size):
            # Sizes can't have parameter dependencies if they are
            # to be used in static thread block size.
            from pytools import product
            nthreads = product(local_grid_size)

            fdecl = CudaLaunchBounds(nthreads, fdecl)

        return fdecl
Example #5
File: opencl.py Project: ml-lab/loopy
    def get_function_declaration(self, codegen_state, codegen_result,
                                 schedule_index):
        fdecl = super(OpenCLCASTBuilder,
                      self).get_function_declaration(codegen_state,
                                                     codegen_result,
                                                     schedule_index)

        from loopy.target.c import FunctionDeclarationWrapper
        assert isinstance(fdecl, FunctionDeclarationWrapper)
        fdecl = fdecl.subdecl

        from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
        fdecl = CLKernel(fdecl)

        from loopy.schedule import get_insn_ids_for_block_at
        _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(codegen_state.kernel.schedule,
                                      schedule_index))

        from loopy.symbolic import get_dependencies
        if not get_dependencies(local_sizes):
            # sizes can't have parameter dependencies if they are
            # to be used in static WG size.

            fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)

        return FunctionDeclarationWrapper(fdecl)
Example #6
    def get_function_declaration(self, codegen_state, codegen_result,
            schedule_index):
        fdecl = super().get_function_declaration(
                codegen_state, codegen_result, schedule_index)

        from loopy.target.c import FunctionDeclarationWrapper
        assert isinstance(fdecl, FunctionDeclarationWrapper)
        if not codegen_state.is_entrypoint:
            # auxiliary kernels need not mention OpenCL-specific qualifiers
            # in their function signature
            return fdecl

        fdecl = fdecl.subdecl

        from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
        fdecl = CLKernel(fdecl)

        from loopy.schedule import get_insn_ids_for_block_at
        _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(
                    codegen_state.kernel.linearization, schedule_index),
                codegen_state.callables_table)

        from loopy.symbolic import get_dependencies
        if not get_dependencies(local_sizes):
            # sizes can't have parameter dependencies if they are
            # to be used in static WG size.

            fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)

        return FunctionDeclarationWrapper(fdecl)
Example #7
def get_usable_inames_for_conditional(kernel, sched_index):
    from loopy.schedule import (
        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
                                   VectorizeTag,
                                   IlpBaseTag)

    result = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel. Grab inames for all insns from there.
    within_subkernel = False

    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]):
        from loopy.schedule import CallKernel, ReturnFromKernel
        if isinstance(sched_item, CallKernel):
            within_subkernel = True
            subkernel_index = sched_item_index
        elif isinstance(sched_item, ReturnFromKernel):
            within_subkernel = False

    if not within_subkernel:
        # Outside all subkernels - use only inames available to host.
        return frozenset(result)

    insn_ids_for_subkernel = get_insn_ids_for_block_at(
        kernel.schedule, subkernel_index)

    inames_for_subkernel = (
        iname
        for insn in insn_ids_for_subkernel
        for iname in kernel.insn_inames(insn))

    for iname in inames_for_subkernel:
        # Parallel inames are defined within a subkernel, BUT:
        #
        # - local indices may not be used in conditionals that cross barriers.
        #
        # - ILP indices and vector lane indices are not available in loop
        #   bounds, they only get defined at the innermost level of nesting.

        if (
                kernel.iname_tags_of_type(iname, ConcurrentTag)
                and not kernel.iname_tags_of_type(iname, VectorizeTag)
                and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                    and crosses_barrier)
                and not kernel.iname_tags_of_type(iname, IlpBaseTag)
        ):
            result.add(iname)

    return frozenset(result)
Example #8
File: bounds.py Project: inducer/loopy
def get_usable_inames_for_conditional(kernel, sched_index):
    from loopy.schedule import (
        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
                                   IlpBaseTag)

    result = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel. Grab inames for all insns from there.
    within_subkernel = False

    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]):
        from loopy.schedule import CallKernel, ReturnFromKernel
        if isinstance(sched_item, CallKernel):
            within_subkernel = True
            subkernel_index = sched_item_index
        elif isinstance(sched_item, ReturnFromKernel):
            within_subkernel = False

    if not within_subkernel:
        # Outside all subkernels - use only inames available to host.
        return frozenset(result)

    insn_ids_for_subkernel = get_insn_ids_for_block_at(
        kernel.schedule, subkernel_index)

    inames_for_subkernel = (
        iname
        for insn in insn_ids_for_subkernel
        for iname in kernel.insn_inames(insn))

    for iname in inames_for_subkernel:
        # Parallel inames are defined within a subkernel, BUT:
        #
        # - local indices may not be used in conditionals that cross barriers.
        #
        # - ILP indices are not available in loop bounds, they only get defined
        #   at the innermost level of nesting.

        if (
                kernel.iname_tags_of_type(iname, ConcurrentTag)
                and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                    and crosses_barrier)
                and not kernel.iname_tags_of_type(iname, IlpBaseTag)
        ):
            result.add(iname)

    return frozenset(result)
Example #9
File: opencl.py Project: cmsquared/loopy
    def get_function_declaration(self, codegen_state, codegen_result,
            schedule_index):
        fdecl = super(OpenCLCASTBuilder, self).get_function_declaration(
                codegen_state, codegen_result, schedule_index)

        from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
        fdecl = CLKernel(fdecl)

        from loopy.schedule import get_insn_ids_for_block_at
        _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(
                    codegen_state.kernel.schedule, schedule_index))

        from loopy.symbolic import get_dependencies
        if not get_dependencies(local_sizes):
            # sizes can't have parameter dependencies if they are
            # to be used in static WG size.

            fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)

        return fdecl
Example #10
File: bounds.py Project: maedoc/loopy
def get_usable_inames_for_conditional(kernel, sched_index):
    from loopy.schedule import (find_active_inames_at,
                                get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag

    result = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel, grab inames for all insns from there.

    subkernel_index = sched_index
    from loopy.schedule import CallKernel

    while not isinstance(kernel.schedule[subkernel_index], CallKernel):
        subkernel_index -= 1

    insn_ids_for_subkernel = get_insn_ids_for_block_at(kernel.schedule,
                                                       subkernel_index)

    inames_for_subkernel = (iname for insn in insn_ids_for_subkernel
                            for iname in kernel.insn_inames(insn))

    for iname in inames_for_subkernel:
        tag = kernel.iname_to_tag.get(iname)

        # Parallel inames are defined within a subkernel, BUT:
        #
        # - local indices may not be used in conditionals that cross barriers.
        #
        # - ILP indices are not available in loop bounds, they only get defined
        #   at the innermost level of nesting.

        if (isinstance(tag, ParallelTag) and
                not (isinstance(tag, LocalIndexTagBase) and crosses_barrier)
                and not isinstance(tag, IlpBaseTag)):
            result.add(iname)

    return frozenset(result)
Example #11
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop,
                                LeaveLoop, ReturnFromKernel,
                                get_insn_ids_for_block_at,
                                gather_schedule_block)

    boostable_insn_ids = _find_boostable_insn_ids(kernel)

    if sched_index is None:
        group_axes = set()
        local_axes = set()

        i = 0
        loop_end_i = past_end_i = len(kernel.schedule)
    else:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = {ax for ax, length in enumerate(group_size)}
        local_axes = {ax for ax, length in enumerate(local_size)}

        i = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        loop_end_i = past_end_i - 1

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
                                   GroupIndexTag)

    while i < loop_end_i:
        sched_item = kernel.schedule[i]
        if isinstance(sched_item, CallKernel):
            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)

        elif isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            i += 1

            group_axes_used = set()
            local_axes_used = set()

            for iname in insn.within_inames:
                ltags = kernel.iname_tags_of_type(iname,
                                                  LocalIndexTag,
                                                  max_num=1)
                gtags = kernel.iname_tags_of_type(iname,
                                                  GroupIndexTag,
                                                  max_num=1)
                altags = kernel.iname_tags_of_type(iname,
                                                   AutoLocalIndexTagBase,
                                                   max_num=1)

                if ltags:
                    tag, = ltags
                    local_axes_used.add(tag.axis)
                elif gtags:
                    tag, = gtags
                    group_axes_used.add(tag.axis)
                elif altags:
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                if insn.id in boostable_insn_ids:
                    warn("instruction '%s' does not use all group hw axes"
                         " (available: %s used:%s). Loopy will generate code"
                         " with the instruction executed along all the"
                         " missing hw axes. This will result in an"
                         " error from 2021.x onwards, calling"
                         " loopy.add_inames_for_unused_hw_axes(...)"
                         " might help in the transition." %
                         (insn.id, ",".join(str(i) for i in group_axes),
                          ",".join(str(i) for i in group_axes_used)),
                         DeprecationWarning,
                         stacklevel=2)
                else:
                    raise LoopyError(
                        "instruction '%s' does not use all group"
                        " hw axes (available: %s used:%s)" %
                        (insn.id, ",".join(str(i) for i in group_axes),
                         ",".join(str(i) for i in group_axes_used)))

            if local_axes != local_axes_used:
                if insn.id in boostable_insn_ids:
                    warn("instruction '%s' does not use all local hw axes"
                         " (available: %s used:%s). Loopy will generate code"
                         " with the instruction executed along all the"
                         " missing hw axes. This will result in an"
                         " error from 2021.x onwards, calling"
                         " loopy.add_inames_for_unused_hw_axes(...)"
                         " might help in the transition." %
                         (insn.id, ",".join(str(i) for i in local_axes),
                          ",".join(str(i) for i in local_axes_used)),
                         DeprecationWarning,
                         stacklevel=2)
                else:
                    raise LoopyError(
                        "instruction '%s' does not use all local"
                        " hw axes (available: %s used:%s)" %
                        (insn.id, ",".join(str(i) for i in local_axes),
                         ",".join(str(i) for i in local_axes_used)))

        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            i += 1
            continue

        else:
            raise TypeError("schedule item not understood: %s" %
                            type(sched_item).__name__)

    return past_end_i
Example #12
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
                                    get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
            is_generating_device_code=True,
            gen_program_name=sched_item.kernel_name,
            schedule_index_end=past_end_i - 1,
            implemented_data_info=(codegen_state.implemented_data_info +
                                   extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
            new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,
            codegen_state.ast_builder.get_kernel_call(
                codegen_state, sched_item.kernel_name, glob_grid, loc_grid,
                extra_args),
        ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (generate_unroll_loop,
                                        generate_vectorize_loop,
                                        generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                                       ForceSequentialTag, LoopedIlpTag,
                                       VectorizeTag,
                                       InOrderSequentialSequentialTag)
        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag,
                                             InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                               "for '%s', tagged '%s'" %
                               (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                sched_item.synchronization_kind, sched_item.mem_kind,
                sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                    codegen_state, sched_item.originating_insn_id, barrier_ast,
                    codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ["global", "local"]:
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                    host_program=None,
                    device_programs=[],
                    implemented_domains={},
                    implemented_data_info=codegen_state.implemented_data_info)

            else:
                raise LoopyError("do not know how to emit code for barrier "
                                 "synchronization kind '%s'"
                                 "in host code" %
                                 sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
            "instruction %s" % insn.id,
            lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s" %
                           type(sched_item))
Example #13
File: check.py Project: cmsquared/loopy
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is None:
        group_axes = set()
        local_axes = set()

        i = 0
        loop_end_i = past_end_i = len(kernel.schedule)
    else:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = set(ax for ax, length in enumerate(group_size))
        local_axes = set(ax for ax, length in enumerate(local_size))

        i = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        loop_end_i = past_end_i - 1

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag

    while i < loop_end_i:
        sched_item = kernel.schedule[i]
        if isinstance(sched_item, CallKernel):
            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)

        elif isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            i += 1

            if insn.boostable:
                continue

            group_axes_used = set()
            local_axes_used = set()

            for iname in kernel.insn_inames(insn):
                tag = kernel.iname_to_tag.get(iname)

                if isinstance(tag, LocalIndexTag):
                    local_axes_used.add(tag.axis)
                elif isinstance(tag, GroupIndexTag):
                    group_axes_used.add(tag.axis)
                elif isinstance(tag, AutoLocalIndexTagBase):
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                raise LoopyError("instruction '%s' does not use all group hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in group_axes),
                            ",".join(str(i) for i in group_axes_used)))
            if local_axes != local_axes_used:
                raise LoopyError("instruction '%s' does not use all local hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in local_axes),
                            ",".join(str(i) for i in local_axes_used)))

        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            i += 1
            continue

        else:
            raise TypeError(
                    "schedule item not understood: %s" % type(sched_item).__name__)

    return past_end_i
Example #14
File: check.py Project: yueyedeai/loopy
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is None:
        group_axes = set()
        local_axes = set()

        i = 0
        loop_end_i = past_end_i = len(kernel.schedule)
    else:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = set(ax for ax, length in enumerate(group_size))
        local_axes = set(ax for ax, length in enumerate(local_size))

        i = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        loop_end_i = past_end_i - 1

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
                        GroupIndexTag)

    while i < loop_end_i:
        sched_item = kernel.schedule[i]
        if isinstance(sched_item, CallKernel):
            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)

        elif isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            i += 1

            if insn.boostable:
                continue

            group_axes_used = set()
            local_axes_used = set()

            for iname in kernel.insn_inames(insn):
                ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
                gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
                altags = kernel.iname_tags_of_type(
                        iname, AutoLocalIndexTagBase, max_num=1)

                if ltags:
                    tag, = ltags
                    local_axes_used.add(tag.axis)
                elif gtags:
                    tag, = gtags
                    group_axes_used.add(tag.axis)
                elif altags:
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                raise LoopyError("instruction '%s' does not use all group hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in group_axes),
                            ",".join(str(i) for i in group_axes_used)))
            if local_axes != local_axes_used:
                raise LoopyError("instruction '%s' does not use all local hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in local_axes),
                            ",".join(str(i) for i in local_axes_used)))

        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            i += 1
            continue

        else:
            raise TypeError(
                    "schedule item not understood: %s" % type(sched_item).__name__)

    return past_end_i
Example #15
File: control.py Project: cmsquared/loopy
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
                is_generating_device_code=True,
                gen_program_name=sched_item.kernel_name,
                schedule_index_end=past_end_i-1,
                implemented_data_info=(codegen_state.implemented_data_info
                    + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
                new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,

            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name,
                glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (
                generate_unroll_loop,
                generate_vectorize_loop,
                generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
                LoopedIlpTag, VectorizeTag)
        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                    "for '%s', tagged '%s'" % (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        return codegen_state.ast_builder.emit_barrier(
                sched_item.kind, sched_item.comment)

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
                "instruction %s" % insn.id,
                lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))
Example #16
def set_up_hw_parallel_loops(codegen_state,
                             schedule_index,
                             next_func,
                             hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                                   LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule,
                                                   schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [
            iname for iname in all_inames_by_insns
            if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
            and not kernel.iname_tags_of_type(iname, VectorizeTag)
        ]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
        insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname and any(
                _tag.key == tag.key
                for _tag in kernel.iname_tags(other_iname) if _tag))
    ]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
                                       constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname, lower_bound,
                     lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                           "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                codegen_state.ast_builder.emit_comment("%s slab for '%s'" %
                                                       (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state.copy_and_assign(
            iname, hw_axis_expr).copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(new_codegen_state, schedule_index,
                                         next_func, hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)
Example #17
File: control.py Project: inducer/loopy
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
                is_generating_device_code=True,
                gen_program_name=sched_item.kernel_name,
                schedule_index_end=past_end_i-1,
                implemented_data_info=(codegen_state.implemented_data_info
                    + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
                new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,

            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name,
                glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tags = kernel.iname_tags(sched_item.iname)
        tags = tuple(tag for tag in tags if tag)

        from loopy.codegen.loop import (
                generate_unroll_loop,
                generate_vectorize_loop,
                generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                ForceSequentialTag, LoopedIlpTag, VectorizeTag,
                InOrderSequentialSequentialTag, filter_iname_tags_by_type)
        if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif filter_iname_tags_by_type(tags, VectorizeTag):
            func = generate_vectorize_loop
        elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag,
                    ForceSequentialTag, InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                    "for '%s', tagged '%s'"
                    % (sched_item.iname, ", ".join(str(tag) for tag in tags)))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                    sched_item.synchronization_kind, sched_item.mem_kind,
                    sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                        codegen_state,
                        sched_item.originating_insn_id,
                        barrier_ast,
                        codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ["global", "local"]:
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                        host_program=None,
                        device_programs=[],
                        implemented_domains={},
                        implemented_data_info=codegen_state.implemented_data_info)

            else:
                raise LoopyError("do not know how to emit code for barrier "
                                 "synchronization kind '%s'" "in host code"
                                 % sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
                "instruction %s" % insn.id,
                lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))
Example #18
File: control.py Project: shigh/loopy
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
                                    get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
            is_generating_device_code=True,
            gen_program_name=sched_item.kernel_name,
            schedule_index_end=past_end_i - 1,
            implemented_data_info=(codegen_state.implemented_data_info +
                                   extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
            new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,
            codegen_state.ast_builder.get_kernel_call(
                codegen_state, sched_item.kernel_name, glob_grid, loc_grid,
                extra_args),
        ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (generate_unroll_loop,
                                        generate_vectorize_loop,
                                        generate_sequential_loop_dim_code)

        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                                       ForceSequentialTag, LoopedIlpTag,
                                       VectorizeTag)
        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag,
                                       (LoopedIlpTag, ForceSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                               "for '%s', tagged '%s'" %
                               (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        return codegen_state.ast_builder.emit_barrier(sched_item.kind,
                                                      sched_item.comment)

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
            "instruction %s" % insn.id,
            lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s" %
                           type(sched_item))
Example #19
File: loop.py Project: inducer/loopy
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                LocalIndexTag, GroupIndexTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [iname for iname in all_inames_by_insns
                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname
            and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname)
                    if _tag))]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                    codegen_state.ast_builder.emit_comment(
                        "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state
                .copy_and_assign(iname, hw_axis_expr)
                .copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(
                new_codegen_state, schedule_index, next_func,
                hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)