Пример #1
0
    def program_info(self,
                     entrypoint,
                     arg_to_dtype_set=frozenset(),
                     all_kwargs=None):
        program = self.get_typed_and_scheduled_translation_unit(
            entrypoint, arg_to_dtype_set)

        from loopy.codegen import generate_code_v2
        codegen_result = generate_code_v2(program)

        dev_code = codegen_result.device_code()
        host_code = codegen_result.host_code()
        all_code = "\n".join([dev_code, "", host_code])

        if self.program[entrypoint].options.write_cl:
            output = all_code
            if self.program[entrypoint].options.highlight_cl:
                output = get_highlighted_code(output)

            if self.program[entrypoint].options.write_cl is True:
                print(output)
            else:
                with open(self.program[entrypoint].options.write_cl,
                          "w") as outf:
                    outf.write(output)

        if self.program[entrypoint].options.edit_cl:
            from pytools import invoke_editor
            dev_code = invoke_editor(dev_code, "code.c")
            # update code from editor
            all_code = "\n".join([dev_code, "", host_code])

        c_kernels = []

        for dp in codegen_result.device_programs:
            c_kernels.append(
                CompiledCKernel(
                    dp, codegen_result.implemented_data_infos[entrypoint],
                    all_code, self.program.target, self.compiler))

        return _KernelInfo(program=program,
                           c_kernels=c_kernels,
                           implemented_data_info=codegen_result.
                           implemented_data_infos[entrypoint],
                           invoker=self.get_invoker(program, entrypoint,
                                                    codegen_result))
Пример #2
0
    def translation_unit_info(self, entrypoint, arg_to_dtype_set=frozenset(),
            all_kwargs=None):
        program = self.get_typed_and_scheduled_translation_unit(
                entrypoint, arg_to_dtype_set)

        # FIXME: now just need to add the types to the arguments
        from loopy.codegen import generate_code_v2
        from loopy.target.execution import get_highlighted_code
        codegen_result = generate_code_v2(program)

        dev_code = codegen_result.device_code()

        if program[entrypoint].options.write_cl:
            #FIXME: redirect to "translation unit" level option as well.
            output = dev_code
            if self.program[entrypoint].options.highlight_cl:
                output = get_highlighted_code(output)

            if self.program[entrypoint].options.write_cl is True:
                print(output)
            else:
                with open(self.program[entrypoint].options.write_cl, "w") as outf:
                    outf.write(output)

        if program[entrypoint].options.edit_cl:
            #FIXME: redirect to "translation unit" level option as well.
            from pytools import invoke_editor
            dev_code = invoke_editor(dev_code, "code.cl")

        import pyopencl as cl

        #FIXME: redirect to "translation unit" level option as well.
        cl_program = (
                cl.Program(self.context, dev_code)
                .build(options=program[entrypoint].options.cl_build_options))

        cl_kernels = _Kernels()
        for dp in cl_program.kernel_names.split(";"):
            setattr(cl_kernels, dp, getattr(cl_program, dp))

        return _KernelInfo(
                program=program,
                cl_kernels=cl_kernels,
                implemented_data_info=codegen_result.implemented_data_infos[
                    entrypoint],
                invoker=self.get_invoker(program, entrypoint, codegen_result))
Пример #3
0
    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)

        from loopy.codegen import generate_code_v2
        from loopy.target.execution import get_highlighted_code
        codegen_result = generate_code_v2(kernel)

        dev_code = codegen_result.device_code()

        if self.kernel.options.write_cl:
            output = dev_code
            if self.kernel.options.highlight_cl:
                output = get_highlighted_code(output)

            if self.kernel.options.write_cl is True:
                print(output)
            else:
                with open(self.kernel.options.write_cl, "w") as outf:
                    outf.write(output)

        if self.kernel.options.edit_cl:
            from pytools import invoke_editor
            dev_code = invoke_editor(dev_code, "code.cl")

        import pyopencl as cl

        cl_program = (cl.Program(
            self.context,
            dev_code).build(options=kernel.options.cl_build_options))

        cl_kernels = _Kernels()
        for dp in codegen_result.device_programs:
            setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))

        return _KernelInfo(
            kernel=kernel,
            cl_kernels=cl_kernels,
            implemented_data_info=codegen_result.implemented_data_info,
            invoker=self.get_invoker(kernel, codegen_result))
Пример #4
0
    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)

        from loopy.codegen import generate_code_v2
        from loopy.target.execution import get_highlighted_code
        codegen_result = generate_code_v2(kernel)

        dev_code = codegen_result.device_code()

        if self.kernel.options.write_cl:
            output = dev_code
            if self.kernel.options.highlight_cl:
                output = get_highlighted_code(output)

            if self.kernel.options.write_cl is True:
                print(output)
            else:
                with open(self.kernel.options.write_cl, "w") as outf:
                    outf.write(output)

        if self.kernel.options.edit_cl:
            from pytools import invoke_editor
            dev_code = invoke_editor(dev_code, "code.cl")

        import pyopencl as cl

        cl_program = (
                cl.Program(self.context, dev_code)
                .build(options=kernel.options.cl_build_options))

        cl_kernels = _Kernels()
        for dp in codegen_result.device_programs:
            setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))

        return _KernelInfo(
                kernel=kernel,
                cl_kernels=cl_kernels,
                implemented_data_info=codegen_result.implemented_data_info,
                invoker=self.get_invoker(kernel, codegen_result))
Пример #5
0
    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)

        from loopy.codegen import generate_code_v2
        codegen_result = generate_code_v2(kernel)

        dev_code = codegen_result.device_code()
        host_code = codegen_result.host_code()
        all_code = '\n'.join([dev_code, '', host_code])

        if self.kernel.options.write_cl:
            output = all_code
            if self.kernel.options.highlight_cl:
                output = get_highlighted_code(output)

            if self.kernel.options.write_cl is True:
                print(output)
            else:
                with open(self.kernel.options.write_cl, "w") as outf:
                    outf.write(output)

        if self.kernel.options.edit_cl:
            from pytools import invoke_editor
            dev_code = invoke_editor(dev_code, "code.c")
            # update code from editor
            all_code = '\n'.join([dev_code, '', host_code])

        c_kernels = []
        for dp in codegen_result.device_programs:
            c_kernels.append(CompiledCKernel(dp,
                codegen_result.implemented_data_info, all_code, self.kernel.target,
                self.compiler))

        return _KernelInfo(
                kernel=kernel,
                c_kernels=c_kernels,
                implemented_data_info=codegen_result.implemented_data_info,
                invoker=self.get_invoker(kernel, codegen_result))
Пример #6
0
def auto_test_vs_ref(ref_knl,
                     ctx,
                     test_knl=None,
                     op_count=[],
                     op_label=[],
                     parameters={},
                     print_ref_code=False,
                     print_code=True,
                     warmup_rounds=2,
                     dump_binary=False,
                     fills_entire_output=None,
                     do_check=True,
                     check_result=None,
                     max_test_kernel_count=1,
                     quiet=False,
                     blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a a tuple (class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                         "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated",
             DeprecationWarning,
             stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
        isinstance(arg, ImageArg) for arg in ref_knl.args)

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors,
                                                  need_ref_image_support):

        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
            ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev, 75 * "-",
                    traceback.format_exc(), 75 * "-"
                ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END -
                                    ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                         "reference computation.\n"
                         "These errors were encountered:\n" +
                         "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED, KernelState.LINEARIZED
    ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.type_inference import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel, kernel_info.implemented_data_info, queue,
                         ref_arg_data, parameters)

        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                        arg_desc.ref_storage_array.get(),
                        shape=arg_desc.ref_shape,
                        strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                        arg_desc.test_storage_array.get(),
                        shape=arg_desc.test_shape,
                        strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = max(warmup_rounds, 1)

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START -
                                         1e-9 * evt_start.profile.START) /
                                        timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl)

        if not quiet:

            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                  "(%d rounds)%s" %
                  (format_float_or_none(elapsed_event),
                   format_float_or_none(elapsed_event_marker),
                   format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                    ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
Пример #7
0
def check_implemented_domains(kernel, implemented_domains, code=None):
    from islpy import dim_type

    from islpy import align_two

    last_idomains = None
    last_insn_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]

        assert idomains

        insn_inames = kernel.insn_inames(insn)

        # {{{ if we've checked the same thing before, no need to check it again

        if last_idomains is not None and last_insn_inames is not None:
            if idomains == last_idomains and insn_inames == last_insn_inames:
                continue

        last_idomains = idomains
        last_insn_inames = insn_inames

        # }}}

        insn_impl_domain = idomains[0]
        for idomain in idomains[1:]:
            insn_impl_domain = insn_impl_domain | idomain
        assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
        assumptions, insn_impl_domain = align_two(
                assumption_non_param, insn_impl_domain)
        insn_impl_domain = (
                (insn_impl_domain & assumptions)
                .project_out_except(insn_inames, [dim_type.set]))

        from loopy.kernel.instruction import BarrierInstruction
        from loopy.kernel.data import LocalIndexTag
        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            non_lid_inames = frozenset(iname for iname in insn_inames
                if not kernel.iname_tags_of_type(iname, LocalIndexTag))
            insn_impl_domain = insn_impl_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
        desired_domain = ((insn_domain & assumptions)
            .project_out_except(insn_inames, [dim_type.set])
            .project_out_except(insn_parameters, [dim_type.param]))

        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            desired_domain = desired_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_impl_domain = (insn_impl_domain
                .project_out_except(insn_parameters, [dim_type.param]))
        insn_impl_domain, desired_domain = align_two(
                insn_impl_domain, desired_domain)

        if insn_impl_domain != desired_domain:
            i_minus_d = insn_impl_domain - desired_domain
            d_minus_i = desired_domain - insn_impl_domain

            parameter_inames = set(
                    insn_domain.get_dim_name(dim_type.param, i)
                    for i in range(insn_impl_domain.dim(dim_type.param)))

            lines = []
            for bigger, smaller, diff_set, gist_domain in [
                    ("implemented", "desired", i_minus_d,
                        desired_domain.gist(insn_impl_domain)),
                    ("desired", "implemented", d_minus_i,
                        insn_impl_domain.gist(desired_domain))]:

                if diff_set.is_empty():
                    continue

                diff_set = diff_set.coalesce()
                pt = diff_set.sample_point()
                assert not pt.is_void()

                #pt_set = isl.Set.from_point(pt)
                #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain))
                #lines.append("point desired: %s" % (pt_set <= desired_domain))

                iname_to_dim = pt.get_space().get_var_dict()
                point_axes = []
                for iname in kernel.insn_inames(insn) | parameter_inames:
                    tp, dim = iname_to_dim[iname]
                    point_axes.append("%s=%d" % (
                        iname, pt.get_coordinate_val(tp, dim).to_python()))

                lines.append(
                        "sample point in %s but not %s: %s" % (
                            bigger, smaller, ", ".join(point_axes)))
                lines.append(
                        "gist of constraints in %s but not %s: %s" % (
                            smaller, bigger, gist_domain))

            if code is not None:
                print(79*"-")
                print("CODE:")
                print(79*"-")
                from loopy.target.execution import get_highlighted_code
                print(get_highlighted_code(code))
                print(79*"-")

            raise LoopyError("sanity check failed--implemented and desired "
                    "domain for instruction '%s' do not match\n\n"
                    "implemented: %s\n\n"
                    "desired:%s\n\n%s"
                    % (insn_id, insn_impl_domain, desired_domain, "\n".join(lines)))

    # placate the assert at the call site
    return True
Пример #8
0
def check_implemented_domains(kernel, implemented_domains, code=None):
    from islpy import dim_type

    from islpy import align_two

    last_idomains = None
    last_insn_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]

        assert idomains

        insn_inames = kernel.insn_inames(insn)

        # {{{ if we've checked the same thing before, no need to check it again

        if last_idomains is not None and last_insn_inames is not None:
            if idomains == last_idomains and insn_inames == last_insn_inames:
                continue

        last_idomains = idomains
        last_insn_inames = insn_inames

        # }}}

        insn_impl_domain = idomains[0]
        for idomain in idomains[1:]:
            insn_impl_domain = insn_impl_domain | idomain
        assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
        assumptions, insn_impl_domain = align_two(
                assumption_non_param, insn_impl_domain)
        insn_impl_domain = (
                (insn_impl_domain & assumptions)
                .project_out_except(insn_inames, [dim_type.set]))

        from loopy.kernel.instruction import BarrierInstruction
        from loopy.kernel.data import LocalIndexTag
        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            non_lid_inames = frozenset(iname for iname in insn_inames
                if not kernel.iname_tags_of_type(iname, LocalIndexTag))
            insn_impl_domain = insn_impl_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
        desired_domain = ((insn_domain & assumptions)
            .project_out_except(insn_inames, [dim_type.set])
            .project_out_except(insn_parameters, [dim_type.param]))

        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            desired_domain = desired_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_impl_domain = (insn_impl_domain
                .project_out_except(insn_parameters, [dim_type.param]))
        insn_impl_domain, desired_domain = align_two(
                insn_impl_domain, desired_domain)

        if insn_impl_domain != desired_domain:
            i_minus_d = insn_impl_domain - desired_domain
            d_minus_i = desired_domain - insn_impl_domain

            parameter_inames = set(
                    insn_domain.get_dim_name(dim_type.param, i)
                    for i in range(insn_impl_domain.dim(dim_type.param)))

            lines = []
            for bigger, smaller, diff_set, gist_domain in [
                    ("implemented", "desired", i_minus_d,
                        desired_domain.gist(insn_impl_domain)),
                    ("desired", "implemented", d_minus_i,
                        insn_impl_domain.gist(desired_domain))]:

                if diff_set.is_empty():
                    continue

                diff_set = diff_set.coalesce()
                pt = diff_set.sample_point()
                assert not pt.is_void()

                #pt_set = isl.Set.from_point(pt)
                #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain))
                #lines.append("point desired: %s" % (pt_set <= desired_domain))

                iname_to_dim = pt.get_space().get_var_dict()
                point_axes = []
                for iname in kernel.insn_inames(insn) | parameter_inames:
                    tp, dim = iname_to_dim[iname]
                    point_axes.append("%s=%d" % (
                        iname, pt.get_coordinate_val(tp, dim).to_python()))

                lines.append(
                        "sample point in %s but not %s: %s" % (
                            bigger, smaller, ", ".join(point_axes)))
                lines.append(
                        "gist of constraints in %s but not %s: %s" % (
                            smaller, bigger, gist_domain))

            if code is not None:
                print(79*"-")
                print("CODE:")
                print(79*"-")
                from loopy.target.execution import get_highlighted_code
                print(get_highlighted_code(code))
                print(79*"-")

            raise LoopyError("sanity check failed--implemented and desired "
                    "domain for instruction '%s' do not match\n\n"
                    "implemented: %s\n\n"
                    "desired:%s\n\n%s"
                    % (insn_id, insn_impl_domain, desired_domain, "\n".join(lines)))

    # placate the assert at the call site
    return True
Пример #9
0
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a a tuple (class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("%s (ref): trying %s for the reference calculation" % (
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75*"-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" % (
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED,
            KernelState.SCHEDULED]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.type_inference import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel,
                kernel_info.implemented_data_info,
                queue, ref_arg_data, parameters)

        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75*"-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = warmup_rounds

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict