def generate_test_wrapper(self, tensor_descriptors, input_tables,
                              output_tables):
        auto_test = CodeFunction("test_wrapper", output_format=ML_Int32)

        tested_function = self.implementation.get_function_object()
        function_name = self.implementation.get_name()

        failure_report_op = FunctionOperator("report_failure")
        failure_report_function = FunctionObject("report_failure", [], ML_Void,
                                                 failure_report_op)

        printf_success_op = FunctionOperator(
            "printf",
            arg_map={0: "\"test successful %s\\n\"" % function_name},
            void_function=True,
            require_header=["stdio.h"])
        printf_success_function = FunctionObject("printf", [], ML_Void,
                                                 printf_success_op)

        # accumulate element number
        acc_num = Variable("acc_num",
                           precision=ML_Int64,
                           var_type=Variable.Local)

        test_loop = self.get_tensor_test_wrapper(
            tested_function, tensor_descriptors, input_tables, output_tables,
            acc_num, self.generate_tensor_check_loop)

        # common test scheme between scalar and vector functions
        test_scheme = Statement(test_loop, printf_success_function(),
                                Return(Constant(0, precision=ML_Int32)))
        auto_test.set_scheme(test_scheme)
        return FunctionGroup([auto_test])
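Note: in the wrapper above, the printf binding passes its format string as a literal entry of arg_map rather than as an operand. The toy sketch below (plain Python, no metalibm import; render_call is a hypothetical stand-in, not metalibm's code generator) illustrates the intended expansion, assuming string entries are emitted verbatim and integer entries select operands:

# toy sketch only: a simplified stand-in for the operator rendering step
def render_call(fname, arg_map, operands):
    """Expand an arg_map into a C-like call: string entries are emitted
    verbatim (e.g. a printf format string), integer entries pick operands."""
    parts = [arg_map[pos] if isinstance(arg_map[pos], str) else operands[arg_map[pos]]
             for pos in sorted(arg_map)]
    return "%s(%s);" % (fname, ", ".join(parts))

# mimics printf_success_op: slot 0 is a fixed format string, no operands needed
print(render_call("printf", {0: '"test successful test_wrapper\\n"'}, []))
# expected output: printf("test successful test_wrapper\n");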
Example #2
  def generate_scheme(self):
    #func_implementation = CodeFunction(self.function_name, output_format = self.precision)
    vx = self.implementation.add_input_variable("x", self.get_input_precision()) 

    mpfr_x = Conversion(vx, precision = ML_Mpfr_t)


    emulate_func_name = "mpfr_exp"
    emulate_func_op = FunctionOperator(emulate_func_name, arg_map = {0: FO_Result(0), 1: FO_Arg(0)}, require_header = ["mpfr.h"]) 
    emulate_func   = FunctionObject(emulate_func_name, [ML_Mpfr_t], ML_Mpfr_t, emulate_func_op)
    emulate_func_op.declare_prototype = emulate_func
    mpfr_call = Conversion(emulate_func(mpfr_x), precision = self.precision)

    scheme = Statement(Return(mpfr_call))

    return scheme
Example #3
  def generate_emulate(self, result, mpfr_x, mpfr_rnd):
    """ generate the emulation code for ML_Log2 functions
        mpfr_x is a mpfr_t variable which should have the right precision
        mpfr_rnd is the rounding mode
    """
    emulate_func_name = "mpfr_acos"
    emulate_func_op = FunctionOperator(emulate_func_name, arg_map = {0: FO_Result(0), 1: FO_Arg(0), 2: FO_Arg(1)}, require_header = ["mpfr.h"]) 
    emulate_func   = FunctionObject(emulate_func_name, [ML_Mpfr_t, ML_Int32], ML_Mpfr_t, emulate_func_op)
    mpfr_call = Statement(ReferenceAssign(result, emulate_func(mpfr_x, mpfr_rnd)))

    return mpfr_call
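Note: the arg_map above encodes MPFR's out-parameter convention: FO_Result(0) splices the node's destination variable into call position 0, while FO_Arg(i) forwards the i-th input, so the generated call reads mpfr_acos(result, mpfr_x, mpfr_rnd). The plain-Python sketch below illustrates that reordering with hypothetical stand-in classes (ToyResult/ToyArg are not metalibm's FO_Result/FO_Arg):

# toy sketch only: hypothetical stand-ins for FO_Result / FO_Arg
class ToyResult:
    def __init__(self, index):
        self.index = index

class ToyArg:
    def __init__(self, index):
        self.index = index

def expand_arg_map(fname, arg_map, inputs, result_name):
    """Order the call operands according to the arg_map positions."""
    operands = [result_name if isinstance(arg_map[pos], ToyResult)
                else inputs[arg_map[pos].index]
                for pos in sorted(arg_map)]
    return "%s(%s)" % (fname, ", ".join(operands))

# mirrors the mpfr_acos binding: destination first, then input and rounding mode
print(expand_arg_map("mpfr_acos",
                     {0: ToyResult(0), 1: ToyArg(0), 2: ToyArg(1)},
                     ["mpfr_x", "mpfr_rnd"], "result"))
# expected output: mpfr_acos(result, mpfr_x, mpfr_rnd)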
Example #4
    def generate_emulate(self, result_ternary, result, mpfr_x, mpfr_rnd):
        """ generate the emulation code for ML_FastSinCos functions
        mpfr_x is a mpfr_t variable which should have the right precision
        mpfr_rnd is the rounding mode
    """
        emulate_func_name = "mpfr_cos" if self.cos_output else "mpfr_sin"
        emulate_func_op = FunctionOperator(emulate_func_name,
                                           arg_map={
                                               0: FO_Arg(0),
                                               1: FO_Arg(1),
                                               2: FO_Arg(2)
                                           },
                                           require_header=["mpfr.h"])
        emulate_func = FunctionObject(emulate_func_name,
                                      [ML_Mpfr_t, ML_Mpfr_t, ML_Int32],
                                      ML_Int32, emulate_func_op)
        mpfr_call = Statement(
            ReferenceAssign(result_ternary,
                            emulate_func(result, mpfr_x, mpfr_rnd)))

        return mpfr_call
Example #5
    def generate_emulate(self, result, mpfr_x, mpfr_rnd):
        """ generate the emulation code for ML_Log2 functions
        mpfr_x is a mpfr_t variable which should have the right precision
        mpfr_rnd is the rounding mode
    """
        #mpfr_x = emulate_implementation.add_input_variable("x", ML_Mpfr_t)
        #mpfr_rnd = emulate_implementation.add_input_variable("rnd", ML_Int32)
        emulate_func_name = "mpfr_log10"
        emulate_func_op = FunctionOperator(emulate_func_name,
                                           arg_map={
                                               0: FO_Result(0),
                                               1: FO_Arg(0),
                                               2: FO_Arg(1)
                                           },
                                           require_header=["mpfr.h"])
        emulate_func = FunctionObject(emulate_func_name, [ML_Mpfr_t, ML_Int32],
                                      ML_Mpfr_t, emulate_func_op)
        #emulate_func_op.declare_prototype = emulate_func
        mpfr_call = Statement(
            ReferenceAssign(result, emulate_func(mpfr_x, mpfr_rnd)))

        return mpfr_call
Example #6
    def generate_scheme(self):
        # declaring CodeFunction and retrieving input variable
        inputs = [
            self.implementation.add_input_variable("x%d" % i, precision)
            for i, precision in enumerate(self.get_input_precisions())
        ]

        external_function_op = FunctionOperator(
            self.bench_function_name,
            arity=self.arity,
            output_precision=self.precision,
            require_header=self.headers)
        external_function = FunctionObject(self.bench_function_name,
                                           self.get_input_precisions(),
                                           self.precision,
                                           external_function_op)

        scheme = Statement(
            Return(
                external_function(*tuple(inputs)),
                precision=self.precision,
            ))

        return scheme
Example #7
      #switch_map[sub_half] = Return(-poly_scheme_vector[sub_half])
      #switch_map[half + sub_half] = Return(poly_scheme_vector[sub_half])
      switch_map[(sub_half, half + sub_half)] = Return(factor2 * poly_scheme_vector[sub_half])


    result = SwitchBlock(modk, switch_map)

    #######################################################################
    #                    LARGE ARGUMENT MANAGEMENT                        #
    #                 (lar: Large Argument Reduction)                     #
    #######################################################################

    # Payne and Hanek argument reduction for large arguments
    #red_func_name = "payne_hanek_cosfp32" # "payne_hanek_fp32_asm"
    red_func_name = "payne_hanek_fp32_asm"
    payne_hanek_func_op = FunctionOperator(red_func_name, arg_map = {0: FO_Arg(0)}, require_header = ["support_lib/ml_red_arg.h"]) 
    payne_hanek_func   = FunctionObject(red_func_name, [ML_Binary32], ML_Binary64, payne_hanek_func_op)
    payne_hanek_func_op.declare_prototype = payne_hanek_func
    #large_arg_red = FunctionCall(payne_hanek_func, vx)
    large_arg_red = payne_hanek_func(vx)
    red_bound     = S2**20
    
    cond = Abs(vx) >= red_bound
    cond.set_attributes(tag = "cond", likely = False)


    
    lar_neark = NearestInteger(large_arg_red, precision = ML_Int64)
    lar_modk = Modulo(lar_neark, Constant(16, precision = ML_Int64), tag = "lar_modk", debug = True) 
    # Modulo is supposed to be already performed (by payne_hanek_cosfp32)
    #lar_modk = NearestInteger(large_arg_red, precision = ML_Int64)
Example #8
    def generate_scheme(self):
        # declaring target and instantiating optimization engine
        precision_ptr = self.get_input_precision(0)
        index_format = self.get_input_precision(2)
        multi_elt_num = self.multi_elt_num

        dst = self.implementation.add_input_variable("dst", precision_ptr)
        src = self.implementation.add_input_variable("src", precision_ptr)
        n = self.implementation.add_input_variable("len", index_format)

        i = Variable("i", precision=index_format, var_type=Variable.Local)
        CU0 = Constant(0, precision=index_format)

        element_format = self.precision

        self.function_list = []

        if multi_elt_num > 1:
            element_format = VECTOR_TYPE_MAP[self.precision][multi_elt_num]

        elt_input = TableLoad(src, i, precision=element_format)

        local_exp = Variable("local_exp",
                             precision=element_format,
                             var_type=Variable.Local)

        if self.use_libm_function:
            libm_fct_operator = FunctionOperator(self.use_libm_function,
                                                 arity=1)
            libm_fct = FunctionObject(self.use_libm_function, [ML_Binary32],
                                      ML_Binary32, libm_fct_operator)

            if multi_elt_num > 1:
                result_list = [
                    libm_fct(
                        VectorElementSelection(elt_input,
                                               Constant(elt_id,
                                                        precision=ML_Integer),
                                               precision=self.precision))
                    for elt_id in range(multi_elt_num)
                ]
                result = VectorAssembling(*result_list,
                                          precision=element_format)
            else:
                result = libm_fct(elt_input)
            elt_result = ReferenceAssign(local_exp, result)
        else:
            if multi_elt_num > 1:
                scalar_result = Variable("scalar_result",
                                         precision=self.precision,
                                         var_type=Variable.Local)
                fct_ctor_args = self.function_ctor.get_default_args(
                    precision=self.precision,
                    libm_compliant=False,
                )

                meta_function = self.function_ctor(fct_ctor_args)
                exponential_scheme = meta_function.generate_scheme()

                # instantiating required passes for typing
                pass_inst_abstract_prec = PassInstantiateAbstractPrecision(
                    self.processor)
                pass_inst_prec = PassInstantiatePrecision(
                    self.processor, default_precision=None)

                # executing format instantiation passes on optree
                exponential_scheme = pass_inst_abstract_prec.execute_on_optree(
                    exponential_scheme)
                exponential_scheme = pass_inst_prec.execute_on_optree(
                    exponential_scheme)

                vectorizer = StaticVectorizer()

                # extracting scalar argument from meta_exponential meta function
                scalar_input = meta_function.implementation.arg_list[0]

                # vectorize scalar scheme
                vector_result, vec_arg_list, vector_scheme, scalar_callback, scalar_callback_fct = vectorize_function_scheme(
                    vectorizer,
                    self.get_main_code_object(), exponential_scheme,
                    element_format.get_scalar_format(), [scalar_input],
                    multi_elt_num)

                elt_result = inline_function(vector_scheme, vector_result,
                                             {vec_arg_list[0]: elt_input})

                local_exp = vector_result

                self.function_list.append(scalar_callback_fct)
                libm_fct = scalar_callback

            else:
                scalar_input = elt_input
                scalar_result = local_exp

                elt_result = generate_inline_fct_scheme(
                    self.function_ctor, scalar_result, [scalar_input], {
                        "precision": self.precision,
                        "libm_compliant": False
                    })

        CU1 = Constant(1, precision=index_format)

        local_exp_init_value = Constant(0, precision=self.precision)
        if multi_elt_num > 1:
            local_exp_init_value = Constant([0] * multi_elt_num,
                                            precision=element_format)
            remain_n = Modulo(n, multi_elt_num, precision=index_format)
            iter_n = n - remain_n
            CU_ELTNUM = Constant(multi_elt_num, precision=index_format)
            inc = i + CU_ELTNUM
        else:
            remain_n = None
            iter_n = n
            inc = i + CU1

        # main loop processing multi_elt_num element(s) per iteration
        main_loop = Loop(
            ReferenceAssign(i, CU0),
            i < iter_n,
            Statement(ReferenceAssign(local_exp, local_exp_init_value),
                      elt_result,
                      TableStore(local_exp, dst, i, precision=ML_Void),
                      ReferenceAssign(i, inc)),
        )
        # epilog to process remaining items (when the length is not a multiple
        # of multi_elt_num)
        if remain_n is not None:
            # TODO/FIXME: try alternative method for processing epilog
            #             by using full vector length and mask
            epilog_loop = Loop(
                Statement(), i < n,
                Statement(
                    TableStore(libm_fct(
                        TableLoad(src, i, precision=self.precision)),
                               dst,
                               i,
                               precision=ML_Void),
                    ReferenceAssign(i, i + CU1),
                ))
            main_loop = Statement(main_loop, epilog_loop)

        return main_loop
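Note: the loop structure above splits the trip count into a vector body of iter_n = n - (n % multi_elt_num) elements and a scalar epilog for the remaining remain_n elements. The plain-Python sketch below (illustration only, independent of metalibm) shows the same indexing scheme:

# plain-Python sketch of the trip-count split used above (illustration only)
def split_loop_bounds(n, multi_elt_num):
    """Return (iter_n, remain_n): vectorized bound and scalar tail length."""
    remain_n = n % multi_elt_num
    iter_n = n - remain_n
    return iter_n, remain_n

def process(src, multi_elt_num, vector_op, scalar_op):
    dst = [None] * len(src)
    iter_n, remain_n = split_loop_bounds(len(src), multi_elt_num)
    i = 0
    while i < iter_n:                      # main loop: one vector per iteration
        dst[i:i + multi_elt_num] = vector_op(src[i:i + multi_elt_num])
        i += multi_elt_num
    while i < len(src):                    # epilog: remaining scalar elements
        dst[i] = scalar_op(src[i])
        i += 1
    return dst

# e.g. 10 elements with 4-wide vectors -> 8 vectorized + 2 scalar tail elements
print(process(list(range(10)), 4, lambda v: [x * 2 for x in v], lambda x: x * 2))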
Example #9
    def generate_scheme(self):
        # declaring CodeFunction and retrieving input variable
        vx = Abs(self.implementation.add_input_variable("x", self.precision),
                 tag="vx")

        Log.report(Log.Info, "generating implementation scheme")
        if self.debug_flag:
            Log.report(Log.Info, "debug has been enabled")

        # local overloading of RaiseReturn operation
        def ExpRaiseReturn(*args, **kwords):
            kwords["arg_value"] = vx
            kwords["function_name"] = self.function_name
            return RaiseReturn(*args, **kwords)

        debug_precision = {
            ML_Binary32: debug_ftox,
            ML_Binary64: debug_lftolx
        }[self.precision]

        test_nan_or_inf = Test(vx,
                               specifier=Test.IsInfOrNaN,
                               likely=False,
                               debug=True,
                               tag="nan_or_inf")
        test_nan = Test(vx,
                        specifier=Test.IsNaN,
                        debug=True,
                        tag="is_nan_test")
        test_positive = Comparison(vx,
                                   0,
                                   specifier=Comparison.GreaterOrEqual,
                                   debug=True,
                                   tag="inf_sign")

        test_signaling_nan = Test(vx,
                                  specifier=Test.IsSignalingNaN,
                                  debug=True,
                                  tag="is_signaling_nan")
        return_snan = Statement(
            ExpRaiseReturn(ML_FPE_Invalid,
                           return_value=FP_QNaN(self.precision)))

        # return in case of infinity input
        infty_return = Statement(
            ConditionBlock(test_positive, Return(FP_PlusInfty(self.precision)),
                           Return(FP_PlusZero(self.precision))))
        # return in case of specific value input (NaN or inf)
        specific_return = ConditionBlock(
            test_nan,
            ConditionBlock(test_signaling_nan, return_snan,
                           Return(FP_QNaN(self.precision))), infty_return)
        # return in case of standard (non-special) input

        sollya_precision = self.precision.get_sollya_object()
        hi_precision = self.precision.get_field_size() - 3

        # argument reduction
        frac_pi_index = 3
        frac_pi = round(S2**frac_pi_index / pi, sollya_precision, sollya.RN)
        inv_frac_pi = round(pi / S2**frac_pi_index, hi_precision, sollya.RN)
        inv_frac_pi_lo = round(pi / S2**frac_pi_index - inv_frac_pi,
                               sollya_precision, sollya.RN)
        # computing k = E(x * frac_pi)
        vx_pi = Multiplication(vx, frac_pi, precision=self.precision)
        k = NearestInteger(vx_pi, precision=ML_Int32, tag="k", debug=True)
        fk = Conversion(k, precision=self.precision, tag="fk")

        inv_frac_pi_cst = Constant(inv_frac_pi,
                                   tag="inv_frac_pi",
                                   precision=self.precision)
        inv_frac_pi_lo_cst = Constant(inv_frac_pi_lo,
                                      tag="inv_frac_pi_lo",
                                      precision=self.precision)

        red_vx_hi = (vx - inv_frac_pi_cst * fk)
        red_vx_hi.set_attributes(tag="red_vx_hi",
                                 debug=debug_precision,
                                 precision=self.precision)
        red_vx_lo_sub = inv_frac_pi_lo_cst * fk
        red_vx_lo_sub.set_attributes(tag="red_vx_lo_sub",
                                     debug=debug_precision,
                                     unbreakable=True,
                                     precision=self.precision)
        vx_d = Conversion(vx, precision=ML_Binary64, tag="vx_d")
        pre_red_vx = red_vx_hi - inv_frac_pi_lo_cst * fk
        pre_red_vx_d_hi = (vx_d - inv_frac_pi_cst * fk)
        pre_red_vx_d_hi.set_attributes(tag="pre_red_vx_d_hi",
                                       precision=ML_Binary64,
                                       debug=debug_lftolx)
        pre_red_vx_d = pre_red_vx_d_hi - inv_frac_pi_lo_cst * fk
        pre_red_vx_d.set_attributes(tag="pre_red_vx_d",
                                    debug=debug_lftolx,
                                    precision=ML_Binary64)

        modk = Modulo(k,
                      2**(frac_pi_index + 1),
                      precision=ML_Int32,
                      tag="switch_value",
                      debug=True)

        sel_c = Equal(BitLogicAnd(modk, 2**(frac_pi_index - 1)),
                      2**(frac_pi_index - 1))
        red_vx = Select(sel_c, -pre_red_vx, pre_red_vx)
        red_vx.set_attributes(tag="red_vx",
                              debug=debug_precision,
                              precision=self.precision)

        red_vx_d = Select(sel_c, -pre_red_vx_d, pre_red_vx_d)
        red_vx_d.set_attributes(tag="red_vx_d",
                                debug=debug_lftolx,
                                precision=ML_Binary64)

        approx_interval = Interval(-pi / (S2**(frac_pi_index + 1)),
                                   pi / S2**(frac_pi_index + 1))

        Log.report(Log.Info, "approx interval: %s\n" % approx_interval)

        error_goal_approx = S2**-self.precision.get_precision()

        Log.report(Log.Info, "building mathematical polynomial")
        poly_degree_vector = [None] * 2**(frac_pi_index + 1)

        error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

        #polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme
        polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme

        index_relative = []

        poly_object_vector = [None] * 2**(frac_pi_index + 1)
        for i in range(2**(frac_pi_index + 1)):
            sub_func = cos(sollya.x + i * pi / S2**frac_pi_index)
            degree = int(
                sup(guessdegree(sub_func, approx_interval,
                                error_goal_approx))) + 1

            degree_list = range(degree + 1)
            a_interval = approx_interval
            if i == 0:
                # ad-hoc, TODO: to be cleaned
                degree = 6
                degree_list = range(0, degree + 1, 2)
            elif i % 2**(frac_pi_index) == 2**(frac_pi_index - 1):
                # for pi/2 and 3pi/2, an approx to  sin=cos(pi/2+x)
                # must be generated
                degree_list = range(1, degree + 1, 2)

            if i == 3 or i == 5 or i == 7 or i == 9:
                precision_list = [sollya.binary64
                                  ] + [sollya.binary32] * (degree)
            else:
                precision_list = [sollya.binary32] * (degree + 1)

            poly_degree_vector[i] = degree

            constraint = sollya.absolute
            delta = (2**(frac_pi_index - 3))
            centered_i = (i % 2**(frac_pi_index)) - 2**(frac_pi_index - 1)
            if centered_i < delta and centered_i > -delta and centered_i != 0:
                constraint = sollya.relative
                index_relative.append(i)
            Log.report(
                Log.Info, "generating approximation for %d/%d" %
                (i, 2**(frac_pi_index + 1)))
            poly_object_vector[
                i], _ = Polynomial.build_from_approximation_with_error(
                    sub_func,
                    degree_list,
                    precision_list,
                    a_interval,
                    constraint,
                    error_function=error_function)

        # unified power map for red_sx^n
        upm = {}
        rel_error_list = []

        poly_scheme_vector = [None] * (2**(frac_pi_index + 1))

        for i in range(2**(frac_pi_index + 1)):
            poly_object = poly_object_vector[i]
            poly_precision = self.precision
            if i == 3 or i == 5 or i == 7 or i == 9:
                poly_precision = ML_Binary64
                c0 = Constant(coeff(poly_object.get_sollya_object(), 0),
                              precision=ML_Binary64)
                c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                              precision=self.precision)
                poly_hi = (c0 + c1 * red_vx)
                poly_hi.set_precision(ML_Binary64)
                red_vx_d_2 = red_vx_d * red_vx_d
                poly_scheme = poly_hi + red_vx_d_2 * polynomial_scheme_builder(
                    poly_object.sub_poly(start_index=2, offset=2),
                    red_vx,
                    unified_precision=self.precision,
                    power_map_=upm)
                poly_scheme.set_attributes(unbreakable=True)
            elif i == 4:
                c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                              precision=ML_Binary64)
                poly_scheme = c1 * red_vx_d + polynomial_scheme_builder(
                    poly_object.sub_poly(start_index=2),
                    red_vx,
                    unified_precision=self.precision,
                    power_map_=upm)
                poly_scheme.set_precision(ML_Binary64)
            else:
                poly_scheme = polynomial_scheme_builder(
                    poly_object,
                    red_vx,
                    unified_precision=poly_precision,
                    power_map_=upm)
            #if i == 3:
            #  c0 = Constant(coeff(poly_object.get_sollya_object(), 0), precision = self.precision)
            #  c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision = self.precision)
            #  poly_scheme = (c0 + c1 * red_vx) + polynomial_scheme_builder(poly_object.sub_poly(start_index = 2), red_vx, unified_precision = self.precision, power_map_ = upm)

            poly_scheme.set_attributes(tag="poly_cos%dpi%d" %
                                       (i, 2**(frac_pi_index)),
                                       debug=debug_precision)
            poly_scheme_vector[i] = poly_scheme

            #try:
            if is_gappa_installed() and i == 3:
                opt_scheme = self.opt_engine.optimization_process(
                    poly_scheme,
                    self.precision,
                    copy=True,
                    fuse_fma=self.fuse_fma)

                tag_map = {}
                self.opt_engine.register_nodes_by_tag(opt_scheme, tag_map)

                gappa_vx = Variable("red_vx",
                                    precision=self.precision,
                                    interval=approx_interval)

                cg_eval_error_copy_map = {
                    tag_map["red_vx"]: gappa_vx,
                    tag_map["red_vx_d"]: gappa_vx,
                }

                print "opt_scheme"
                print opt_scheme.get_str(depth=None,
                                         display_precision=True,
                                         memoization_map={})

                eval_error = self.gappa_engine.get_eval_error_v2(
                    self.opt_engine,
                    opt_scheme,
                    cg_eval_error_copy_map,
                    gappa_filename="red_arg_%d.g" % i)
                poly_range = cos(approx_interval + i * pi / S2**frac_pi_index)
                rel_error_list.append(eval_error / poly_range)

        #for rel_error in rel_error_list:
        #  print sup(abs(rel_error))

        #return

        # case 17
        #poly17 = poly_object_vector[17]
        #c0 = Constant(coeff(poly17.get_sollya_object(), 0), precision = self.precision)
        #c1 = Constant(coeff(poly17.get_sollya_object(), 1), precision = self.precision)
        #poly_scheme_vector[17] = FusedMultiplyAdd(c1, red_vx, c0, specifier = FusedMultiplyAdd.Standard) + polynomial_scheme_builder(poly17.sub_poly(start_index = 2), red_vx, unified_precision = self.precision, power_map_ = upm)

        half = 2**frac_pi_index
        sub_half = 2**(frac_pi_index - 1)

        # determine whether the reduced input lies in the second or third quarter
        # (neither the first nor the fourth), in which case the cosine output must be negated
        factor_cond = BitLogicAnd(BitLogicXor(
            BitLogicRightShift(modk, frac_pi_index),
            BitLogicRightShift(modk, frac_pi_index - 1)),
                                  1,
                                  tag="factor_cond",
                                  debug=True)

        CM1 = Constant(-1, precision=self.precision)
        C1 = Constant(1, precision=self.precision)
        factor = Select(factor_cond,
                        CM1,
                        C1,
                        tag="factor",
                        debug=debug_precision)
        factor2 = Select(Equal(modk, Constant(sub_half)),
                         CM1,
                         C1,
                         tag="factor2",
                         debug=debug_precision)

        switch_map = {}
        if 0:
            for i in range(2**(frac_pi_index + 1)):
                switch_map[i] = Return(poly_scheme_vector[i])
        else:
            for i in range(2**(frac_pi_index - 1)):
                switch_case = (i, half - i)
                #switch_map[i]      = Return(poly_scheme_vector[i])
                #switch_map[half-i] = Return(-poly_scheme_vector[i])
                if i != 0:
                    switch_case = switch_case + (half + i, 2 * half - i)
                    #switch_map[half+i] = Return(-poly_scheme_vector[i])
                    #switch_map[2*half-i] = Return(poly_scheme_vector[i])
                if poly_scheme_vector[i].get_precision() != self.precision:
                    poly_result = Conversion(poly_scheme_vector[i],
                                             precision=self.precision)
                else:
                    poly_result = poly_scheme_vector[i]
                switch_map[switch_case] = Return(factor * poly_result)
            #switch_map[sub_half] = Return(-poly_scheme_vector[sub_half])
            #switch_map[half + sub_half] = Return(poly_scheme_vector[sub_half])
            switch_map[(sub_half, half + sub_half)] = Return(
                factor2 * poly_scheme_vector[sub_half])

        result = SwitchBlock(modk, switch_map)

        #######################################################################
        #                    LARGE ARGUMENT MANAGEMENT                        #
        #                 (lar: Large Argument Reduction)                     #
        #######################################################################

        # Payne and Hanek argument reduction for large arguments
        #red_func_name = "payne_hanek_cosfp32" # "payne_hanek_fp32_asm"
        red_func_name = "payne_hanek_fp32_asm"
        payne_hanek_func_op = FunctionOperator(
            red_func_name,
            arg_map={0: FO_Arg(0)},
            require_header=["support_lib/ml_red_arg.h"])
        payne_hanek_func = FunctionObject(red_func_name, [ML_Binary32],
                                          ML_Binary64, payne_hanek_func_op)
        payne_hanek_func_op.declare_prototype = payne_hanek_func
        #large_arg_red = FunctionCall(payne_hanek_func, vx)
        large_arg_red = payne_hanek_func(vx)
        red_bound = S2**20

        cond = Abs(vx) >= red_bound
        cond.set_attributes(tag="cond", likely=False)

        lar_neark = NearestInteger(large_arg_red, precision=ML_Int64)
        lar_modk = Modulo(lar_neark,
                          Constant(16, precision=ML_Int64),
                          tag="lar_modk",
                          debug=True)
        # Modulo is supposed to be already performed (by payne_hanek_cosfp32)
        #lar_modk = NearestInteger(large_arg_red, precision = ML_Int64)
        pre_lar_red_vx = large_arg_red - Conversion(lar_neark,
                                                    precision=ML_Binary64)
        pre_lar_red_vx.set_attributes(precision=ML_Binary64,
                                      debug=debug_lftolx,
                                      tag="pre_lar_red_vx")
        lar_red_vx = Conversion(pre_lar_red_vx,
                                precision=self.precision,
                                debug=debug_precision,
                                tag="lar_red_vx")
        lar_red_vx_lo = Conversion(
            pre_lar_red_vx - Conversion(lar_red_vx, precision=ML_Binary64),
            precision=self.precision)
        lar_red_vx_lo.set_attributes(tag="lar_red_vx_lo",
                                     precision=self.precision)

        lar_k = 3
        # large arg reduction Universal Power Map
        lar_upm = {}
        lar_switch_map = {}
        approx_interval = Interval(-0.5, 0.5)
        for i in range(2**(lar_k + 1)):
            frac_pi = pi / S2**lar_k
            func = cos(frac_pi * i + frac_pi * sollya.x)

            degree = 6
            error_mode = sollya.absolute
            if i % 2**(lar_k) == 2**(lar_k - 1):
                # close to sin(x) cases
                func = -sin(frac_pi * x) if i == 2**(lar_k -
                                                     1) else sin(frac_pi * x)
                degree_list = range(0, degree + 1, 2)
                precision_list = [sollya.binary32] * len(degree_list)
                poly_object, _ = Polynomial.build_from_approximation_with_error(
                    func / x, degree_list, precision_list, approx_interval,
                    error_mode)
                poly_object = poly_object.sub_poly(offset=-1)
            else:
                degree_list = range(degree + 1)
                precision_list = [sollya.binary32] * len(degree_list)
                poly_object, _ = Polynomial.build_from_approximation_with_error(
                    func, degree_list, precision_list, approx_interval,
                    error_mode)

            if i == 3 or i == 5 or i == 7 or i == 9 or i == 11 or i == 13:
                poly_precision = ML_Binary64
                c0 = Constant(coeff(poly_object.get_sollya_object(), 0),
                              precision=ML_Binary64)
                c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                              precision=self.precision)
                poly_hi = (c0 + c1 * lar_red_vx)
                poly_hi.set_precision(ML_Binary64)
                pre_poly_scheme = poly_hi + polynomial_scheme_builder(
                    poly_object.sub_poly(start_index=2),
                    lar_red_vx,
                    unified_precision=self.precision,
                    power_map_=lar_upm)
                pre_poly_scheme.set_attributes(precision=ML_Binary64)
                poly_scheme = Conversion(pre_poly_scheme,
                                         precision=self.precision)
            elif i == 4 or i == 12:
                c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                              precision=self.precision)
                c3 = Constant(coeff(poly_object.get_sollya_object(), 3),
                              precision=self.precision)
                c5 = Constant(coeff(poly_object.get_sollya_object(), 5),
                              precision=self.precision)
                poly_hi = polynomial_scheme_builder(
                    poly_object.sub_poly(start_index=3),
                    lar_red_vx,
                    unified_precision=self.precision,
                    power_map_=lar_upm)
                poly_hi.set_attributes(tag="poly_lar_%d_hi" % i,
                                       precision=ML_Binary64)
                poly_scheme = Conversion(FusedMultiplyAdd(
                    c1, lar_red_vx, poly_hi, precision=ML_Binary64) +
                                         c1 * lar_red_vx_lo,
                                         precision=self.precision)
            else:
                poly_scheme = polynomial_scheme_builder(
                    poly_object,
                    lar_red_vx,
                    unified_precision=self.precision,
                    power_map_=lar_upm)
            # poly_scheme = polynomial_scheme_builder(poly_object, lar_red_vx, unified_precision = self.precision, power_map_ = lar_upm)
            poly_scheme.set_attributes(tag="lar_poly_%d" % i,
                                       debug=debug_precision)
            lar_switch_map[(i, )] = Return(poly_scheme)

        lar_result = SwitchBlock(lar_modk, lar_switch_map)

        # main scheme
        #Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
        # scheme = Statement(ConditionBlock(cond, lar_result, result))

        Log.report(Log.Info, "Construction of the initial MDL scheme")
        scheme = Statement(pre_red_vx_d, red_vx_lo_sub,
                           ConditionBlock(cond, lar_result, result))

        return scheme
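Note: the argument reduction above computes k = E(x * 2^3/pi) and subtracts k * pi/2^3 in two steps, using a high part (inv_frac_pi) and a low residual (inv_frac_pi_lo) to limit cancellation. The plain-Python sketch below reproduces that hi/lo (Cody and Waite style) reduction; split_hi_lo and its constants are illustrative recomputations in double precision, not the Sollya-rounded values used by the generator:

# plain-Python sketch of the hi/lo argument reduction (illustration only)
import math

def split_hi_lo(value, hi_bits=24):
    """Split a constant into a coarse hi part (at most hi_bits of mantissa)
    plus a lo residual, so that value == hi + lo."""
    scale = 2.0 ** (hi_bits - math.frexp(value)[1])
    hi = round(value * scale) / scale
    return hi, value - hi

FRAC_PI_INDEX = 3
FRAC_PI = 2.0 ** FRAC_PI_INDEX / math.pi                 # ~ 8/pi
INV_FRAC_PI, INV_FRAC_PI_LO = split_hi_lo(math.pi / 2.0 ** FRAC_PI_INDEX)

def reduce_arg(x):
    """Return (k, r) with x ~= k * pi/8 + r and r roughly in [-pi/16, pi/16]."""
    k = round(x * FRAC_PI)                                # k = E(x * 8/pi)
    r = (x - k * INV_FRAC_PI) - k * INV_FRAC_PI_LO        # two-step subtraction
    return k, r

k, r = reduce_arg(10.0)
print(k, r, k * (INV_FRAC_PI + INV_FRAC_PI_LO) + r)       # recombines to ~10.0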
Example #10
    def generate_scheme(self):
        # declaring target and instantiating optimization engine
        precision_ptr = self.get_input_precision(0)
        index_format = self.get_input_precision(2)

        dst = self.implementation.add_input_variable("dst", precision_ptr)
        src = self.implementation.add_input_variable("src", precision_ptr)
        n = self.implementation.add_input_variable("len", index_format)

        i = Variable("i", precision=index_format, var_type=Variable.Local)
        CU1 = Constant(1, precision=index_format)
        CU0 = Constant(0, precision=index_format)
        inc = i + CU1

        elt_input = TableLoad(src, i, precision=self.precision)

        local_exp = Variable("local_exp",
                             precision=self.precision,
                             var_type=Variable.Local)

        if self.use_libm_function:
            libm_exp_operator = FunctionOperator("expf", arity=1)
            libm_exp = FunctionObject("expf", [ML_Binary32], ML_Binary32,
                                      libm_exp_operator)

            elt_result = ReferenceAssign(local_exp, libm_exp(elt_input))
        else:
            exponential_args = ML_Exponential.get_default_args(
                precision=self.precision,
                libm_compliant=False,
                debug=False,
            )

            meta_exponential = ML_Exponential(exponential_args)
            exponential_scheme = meta_exponential.generate_scheme()

            elt_result = inline_function(
                exponential_scheme,
                local_exp,
                {meta_exponential.implementation.arg_list[0]: elt_input},
            )

        elt_acc = Variable("elt_acc",
                           precision=self.precision,
                           var_type=Variable.Local)

        exp_loop = Loop(
            ReferenceAssign(i, CU0),
            i < n,
            Statement(ReferenceAssign(local_exp, 0), elt_result,
                      TableStore(local_exp, dst, i, precision=ML_Void),
                      ReferenceAssign(elt_acc, elt_acc + local_exp),
                      ReferenceAssign(i, i + CU1)),
        )

        sum_rcp = Division(1,
                           elt_acc,
                           precision=self.precision,
                           tag="sum_rcp",
                           debug=debug_multi)

        div_loop = Loop(
            ReferenceAssign(i, CU0),
            i < n,
            Statement(
                TableStore(Multiplication(
                    TableLoad(dst, i, precision=self.precision), sum_rcp),
                           dst,
                           i,
                           precision=ML_Void), ReferenceAssign(i, inc)),
        )

        main_scheme = Statement(ReferenceAssign(elt_acc, 0), exp_loop, sum_rcp,
                                div_loop)

        return main_scheme
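Note: the scheme above is a two-pass normalization: exp_loop stores exp(src[i]) into dst while accumulating the sum in elt_acc, then div_loop rescales every entry by sum_rcp = 1/elt_acc. The plain-Python sketch below (illustration only, independent of metalibm) shows the same computation:

# plain-Python sketch of the two-pass scheme above (illustration only):
# pass 1 stores exp(x_i) and accumulates their sum, pass 2 scales by 1/sum
import math

def softmax_like(src):
    dst = [0.0] * len(src)
    elt_acc = 0.0
    for i, x in enumerate(src):          # exp_loop
        local_exp = math.exp(x)
        dst[i] = local_exp
        elt_acc += local_exp
    sum_rcp = 1.0 / elt_acc              # sum_rcp
    for i in range(len(dst)):            # div_loop
        dst[i] *= sum_rcp
    return dst

print(softmax_like([0.0, 1.0, 2.0]))     # entries sum to 1.0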
Example #11
  def generate_datafile_testbench(self, tc_list, io_map, input_signals, output_signals, time_step, test_fname="test.input"):
    """ Generate testbench with input and output data externalized in
        a data file """
    # textio function to read hexadecimal text
    def FCT_HexaRead_gen(input_format):
        legalized_input_format = input_format
        FCT_HexaRead = FunctionObject("hread", [HDL_LINE, legalized_input_format], ML_Void, FunctionOperator("hread", void_function=True, arity=2))
        return FCT_HexaRead
    # textio function to read binary text
    FCT_Read = FunctionObject("read", [HDL_LINE, ML_StdLogic], ML_Void, FunctionOperator("read", void_function=True, arity=2))
    input_line = Variable("input_line", precision=HDL_LINE, var_type=Variable.Local)

    # building ordered list of input and output signal names
    input_signal_list = [sname for sname in input_signals.keys()]
    input_statement = Statement()
    for input_name in input_signal_list:
        input_format = input_signals[input_name].precision
        input_var = Variable(
            "v_" + input_name,
            precision=input_format,
            var_type=Variable.Local)
        if input_format is ML_StdLogic:
            input_statement.add(FCT_Read(input_line, input_var))
        else:
            input_statement.add(FCT_HexaRead_gen(input_format)(input_line, input_var))
        input_statement.add(ReferenceAssign(input_signals[input_name], input_var))

    output_signal_list = [sname for sname in output_signals.keys()]
    output_statement = Statement()
    for output_name in output_signal_list:
        output_format = output_signals[output_name].precision
        output_var = Variable(
            "v_" + output_name,
            precision=output_format,
            var_type=Variable.Local)
        if output_format is ML_StdLogic:
            output_statement.add(FCT_Read(input_line, output_var))
        else:
            output_statement.add(FCT_HexaRead_gen(output_format)(input_line, output_var))

        output_signal = output_signals[output_name]
        #value_msg = get_output_value_msg(output_signal, output_value)
        test_pass_cond, check_statement = get_output_check_statement(output_signal, output_name, output_var)

        input_msg = multi_Concatenation(*tuple(sum([[" %s=" % input_tag, signal_str_conversion(input_signals[input_tag], input_signals[input_tag].precision)] for input_tag in input_signal_list], [])))

        output_statement.add(check_statement)
        assert_statement = Assert(
            test_pass_cond,
            multi_Concatenation(
                "unexpected value for inputs ",
                input_msg,
                " expecting :",
                signal_str_conversion(output_var, output_format),
                " got :",
                signal_str_conversion(output_signal, output_format),
               precision = ML_String
            ),
            severity=Assert.Failure
        )
        output_statement.add(assert_statement)

    self_component = self.implementation.get_component_object()
    self_instance = self_component(io_map = io_map, tag = "tested_entity")
    test_statement = Statement()

    DATA_FILE_NAME = test_fname

    with open(DATA_FILE_NAME, "w") as data_file:
        # dumping column tags
        data_file.write("# " + " ".join(input_signal_list + output_signal_list) + "\n")

        def get_raw_cst_string(cst_format, cst_value):
            size = int((cst_format.get_bit_size() + 3) / 4)
            return ("{:x}").format(cst_format.get_base_format().get_integer_coding(cst_value)).zfill(size)

        for input_values, output_values in tc_list:
            # TODO: generate test data file
            cst_list = []
            for input_name in input_signal_list:
                input_value = input_values[input_name]
                input_format = input_signals[input_name].get_precision()
                cst_list.append(get_raw_cst_string(input_format, input_value))

            for output_name in output_signal_list:
                output_value = output_values[output_name]
                output_format = output_signals[output_name].get_precision()
                cst_list.append(get_raw_cst_string(output_format, output_value))
            # dumping line into file
            data_file.write(" ".join(cst_list) + "\n")

    input_stream = Variable("data_file", precision=HDL_FILE, var_type=Variable.Local)
    file_status = Variable("file_status", precision=HDL_OPEN_FILE_STATUS, var_type=Variable.Local)
    FCT_EndFile = FunctionObject("endfile", [HDL_FILE], ML_Bool, FunctionOperator("endfile", arity=1)) 
    FCT_OpenFile = FunctionObject(
        "FILE_OPEN", [HDL_OPEN_FILE_STATUS, HDL_FILE, ML_String], ML_Void,
        FunctionOperator(
            "FILE_OPEN",
            arg_map={0: FO_Arg(0), 1: FO_Arg(1), 2: FO_Arg(2), 3: "READ_MODE"},
            void_function=True))
    FCT_ReadLine =  FunctionObject(
        "readline", [HDL_FILE, HDL_LINE], ML_Void,
        FunctionOperator("readline", void_function=True, arity=2))

    reset_statement = self.get_reset_statement(io_map, time_step)
    OPEN_OK = Constant("OPEN_OK", precision=HDL_OPEN_FILE_STATUS)

    testbench = CodeEntity("testbench")
    test_process = Process(
        reset_statement,
        FCT_OpenFile(file_status, input_stream, DATA_FILE_NAME),
        ConditionBlock(
            Comparison(file_status, OPEN_OK, specifier=Comparison.NotEqual),
          Assert(
            Constant(0, precision=ML_Bool),
            " \"failed to open file {}\"".format(DATA_FILE_NAME),
            severity=Assert.Failure
          )
        ),
        # consume legend line
        FCT_ReadLine(input_stream, input_line),
        WhileLoop(
            LogicalNot(FCT_EndFile(input_stream)),
            Statement(
                FCT_ReadLine(input_stream, input_line),
                input_statement,
                Wait(time_step * (self.stage_num + 2)),
                output_statement,
            ),
        ),
      # end of test
      Assert(
        Constant(0, precision = ML_Bool),
        " \"end of test, no error encountered \"",
        severity = Assert.Warning
      ),
      # infinite end loop
        WhileLoop(
            Constant(1, precision=ML_Bool),
            Statement(
                Wait(time_step * (self.stage_num + 2)),
            )
        )
    )

    testbench_scheme = Statement(
      self_instance,
      test_process
    )

    if self.pipelined:
        half_time_step = time_step / 2
        assert (half_time_step * 2) == time_step
        # adding clock process for pipelined bench
        clk_process = Process(
            Statement(
                ReferenceAssign(
                    io_map["clk"],
                    Constant(1, precision = ML_StdLogic)
                ),
                Wait(half_time_step),
                ReferenceAssign(
                    io_map["clk"],
                    Constant(0, precision = ML_StdLogic)
                ),
                Wait(half_time_step),
            )
        )
        testbench_scheme.push(clk_process)

    testbench.add_process(testbench_scheme)

    return [testbench]
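Note: get_raw_cst_string packs each value as a fixed-width hexadecimal word whose width is the bit size rounded up to whole nibbles, so every column of the data file has a constant width. The plain-Python sketch below mirrors that formatting; it takes a raw integer coding directly instead of going through the format object's get_integer_coding method:

# plain-Python sketch of the hexadecimal packing used by get_raw_cst_string
def raw_cst_string(bit_size, integer_coding):
    width = (bit_size + 3) // 4          # number of hex digits, rounded up
    return "{:x}".format(integer_coding).zfill(width)

# a 32-bit word always occupies 8 hex digits, a 10-bit field occupies 3
print(raw_cst_string(32, 0x3f800000))    # -> 3f800000
print(raw_cst_string(10, 0x5))           # -> 005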
Example #12
    def generate_bench_wrapper(self,
                               test_num=1,
                               loop_num=100000,
                               test_ranges=[Interval(-1.0, 1.0)],
                               debug=False):
        # interval from which the array length is chosen (randomly)
        index_range = self.test_index_range

        auto_test = CodeFunction("bench_wrapper", output_format=ML_Binary64)

        tested_function = self.implementation.get_function_object()
        function_name = self.implementation.get_name()

        failure_report_op = FunctionOperator("report_failure")
        failure_report_function = FunctionObject("report_failure", [], ML_Void,
                                                 failure_report_op)

        printf_success_op = FunctionOperator(
            "printf",
            arg_map={0: "\"test successful %s\\n\"" % function_name},
            void_function=True)
        printf_success_function = FunctionObject("printf", [], ML_Void,
                                                 printf_success_op)

        output_precision = FormatAttributeWrapper(self.precision, ["volatile"])

        test_total = test_num

        # number of arrays expected as inputs for tested_function
        NUM_INPUT_ARRAY = 1
        # position of the input array in tested_function operands (generally
        # equal to 1, as the 0-th input is often the destination array)
        INPUT_INDEX_OFFSET = 1

        # concatenating the standard test arrays at the beginning of the
        # randomly generated array
        TABLE_SIZE_VALUES = [
            len(std_table) for std_table in self.standard_test_cases
        ] + [
            random.randrange(index_range[0], index_range[1] + 1)
            for i in range(test_num)
        ]
        OFFSET_VALUES = [sum(TABLE_SIZE_VALUES[:i]) for i in range(test_total)]

        table_size_offset_array = generate_2d_table(
            test_total,
            2,
            ML_UInt32,
            self.uniquify_name("table_size_array"),
            value_gen=(lambda row_id:
                       (TABLE_SIZE_VALUES[row_id], OFFSET_VALUES[row_id])))

        INPUT_ARRAY_SIZE = sum(TABLE_SIZE_VALUES)

        # TODO/FIXME: implement proper input range depending on input index
        # assuming a single input array
        input_precisions = [self.get_input_precision(1).get_data_precision()]
        rng_map = [
            get_precision_rng(precision, inf(test_range), sup(test_range))
            for precision, test_range in zip(input_precisions, test_ranges)
        ]

        # generated table of inputs
        input_tables = [
            generate_1d_table(
                INPUT_ARRAY_SIZE,
                self.get_input_precision(INPUT_INDEX_OFFSET +
                                         table_id).get_data_precision(),
                self.uniquify_name("input_table_arg%d" % table_id),
                value_gen=(
                    lambda _: input_precisions[table_id].round_sollya_object(
                        rng_map[table_id].get_new_value(), sollya.RN)))
            for table_id in range(NUM_INPUT_ARRAY)
        ]

        # generate output_array
        output_array = generate_1d_table(
            INPUT_ARRAY_SIZE,
            output_precision,
            self.uniquify_name("output_array"),
            #value_gen=(lambda _: FP_QNaN(self.precision))
            value_gen=(lambda _: None),
            const=False,
            empty=True)

        # accumulate element number
        acc_num = Variable("acc_num",
                           precision=ML_Int64,
                           var_type=Variable.Local)

        def empty_post_statement_gen(input_tables, output_array,
                                     table_size_offset_array, array_offset,
                                     array_len, test_id):
            return Statement()

        test_loop = self.get_array_test_wrapper(test_total, tested_function,
                                                table_size_offset_array,
                                                input_tables, output_array,
                                                acc_num,
                                                empty_post_statement_gen)

        timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local)
        printf_timing_op = FunctionOperator(
            "printf",
            arg_map={
                0:
                "\"%s %%\"PRIi64\" elts computed in %%\"PRIi64\" nanoseconds => %%.3f CPE \\n\""
                % function_name,
                1:
                FO_Arg(0),
                2:
                FO_Arg(1),
                3:
                FO_Arg(2)
            },
            void_function=True)
        printf_timing_function = FunctionObject(
            "printf", [ML_Int64, ML_Int64, ML_Binary64], ML_Void,
            printf_timing_op)

        vj = Variable("j", precision=ML_Int32, var_type=Variable.Local)
        loop_num_cst = Constant(loop_num, precision=ML_Int32, tag="loop_num")
        loop_increment = 1

        # bench measure of clocks per element
        cpe_measure = Division(
            Conversion(timer, precision=ML_Binary64),
            Conversion(acc_num, precision=ML_Binary64),
            precision=ML_Binary64,
            tag="cpe_measure",
        )

        # common test scheme between scalar and vector functions
        test_scheme = Statement(
            self.processor.get_init_timestamp(),
            ReferenceAssign(timer, self.processor.get_current_timestamp()),
            ReferenceAssign(acc_num, 0),
            Loop(
                ReferenceAssign(vj, Constant(0, precision=ML_Int32)),
                vj < loop_num_cst,
                Statement(test_loop, ReferenceAssign(vj,
                                                     vj + loop_increment))),
            ReferenceAssign(
                timer,
                Subtraction(self.processor.get_current_timestamp(),
                            timer,
                            precision=ML_Int64)),
            printf_timing_function(
                Conversion(acc_num, precision=ML_Int64),
                timer,
                cpe_measure,
            ),
            Return(cpe_measure),
            # Return(Constant(0, precision = ML_Int32))
        )
        auto_test.set_scheme(test_scheme)
        return FunctionGroup([auto_test])
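Note: the bench wrapper above concatenates one sub-array per test into a single shared input table: TABLE_SIZE_VALUES holds each test's length, OFFSET_VALUES its starting offset (a prefix sum of the lengths), and the reported figure is a clocks-per-element measure timer / acc_num. The plain-Python sketch below (illustration only) reproduces the size/offset layout:

# plain-Python sketch of the size/offset table built above (illustration only):
# each test gets a (length, offset) pair, where offsets are prefix sums of the
# lengths, so all sub-arrays live back to back in one shared input array
def build_size_offset_table(table_size_values):
    offsets = [sum(table_size_values[:i]) for i in range(len(table_size_values))]
    return list(zip(table_size_values, offsets))

sizes = [4, 7, 3]
print(build_size_offset_table(sizes))   # -> [(4, 0), (7, 4), (3, 11)]
print(sum(sizes))                       # total INPUT_ARRAY_SIZE = 14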
Example #13
    def generate_array_check_loop(self, input_tables, output_array,
                                  table_size_offset_array, array_offset,
                                  array_len, test_id):
        # internal array iterator index
        vj = Variable("j", precision=ML_UInt32, var_type=Variable.Local)

        printf_input_function = self.get_printf_input_function()

        printf_error_template = "printf(\"max %s error is %s \\n\", %s)" % (
            self.function_name,
            self.precision.get_display_format().format_string,
            self.precision.get_display_format().pre_process_fct("{0}"))
        printf_error_op = TemplateOperatorFormat(printf_error_template,
                                                 arity=1,
                                                 void_function=True,
                                                 require_header=["stdio.h"])

        printf_error_function = FunctionObject("printf", [self.precision],
                                               ML_Void, printf_error_op)

        printf_max_op = FunctionOperator(
            "printf",
            arg_map={
                0:
                "\"max %s error is reached at input number %s \\n \"" %
                (self.function_name, "%d"),
                1:
                FO_Arg(0)
            },
            void_function=True,
            require_header=["stdio.h"])
        printf_max_function = FunctionObject("printf", [self.precision],
                                             ML_Void, printf_max_op)

        NUM_INPUT_ARRAY = len(input_tables)

        # generate the expected table for the whole multi-array
        expected_table = self.generate_expected_table(input_tables,
                                                      table_size_offset_array)

        # inputs for the (vj)-th entry of the sub-array
        local_inputs = tuple(
            TableLoad(input_tables[in_id], array_offset + vj)
            for in_id in range(NUM_INPUT_ARRAY))
        # expected values for the (vj)-th entry of the sub-array
        expected_values = [
            TableLoad(expected_table, array_offset + vj, i)
            for i in range(self.accuracy.get_num_output_value())
        ]
        # local result for the (vj)-th entry of the sub-array
        local_result = TableLoad(output_array, array_offset + vj)

        if self.break_error:
            return_statement_break = Statement(
                printf_input_function(*((vj, ) + local_inputs +
                                        (local_result, ))),
                self.accuracy.get_output_print_call(self.function_name,
                                                    expected_values))
        else:
            return_statement_break = Statement(
                printf_input_function(*((vj, ) + local_inputs +
                                        (local_result, ))),
                self.accuracy.get_output_print_call(self.function_name,
                                                    expected_values),
                Return(Constant(1, precision=ML_Int32)))

        # loop implementation checking the validity of the results
        # for the sub-array starting at array_offset
        check_array_loop = Loop(
            ReferenceAssign(vj, 0), vj < array_len,
            Statement(
                ConditionBlock(
                    self.accuracy.get_output_check_test(
                        local_result, expected_values),
                    return_statement_break),
                ReferenceAssign(vj, vj + 1),
            ))
        return check_array_loop
Example #14
 def FCT_HexaRead_gen(input_format):
     legalized_input_format = input_format
     FCT_HexaRead = FunctionObject("hread", [HDL_LINE, legalized_input_format], ML_Void, FunctionOperator("hread", void_function=True, arity=2))
     return FCT_HexaRead
Example #15
    def __init__(self,
                 precision=ML_Binary32,
                 abs_accuracy=S2**-24,
                 libm_compliant=True,
                 debug_flag=False,
                 fuse_fma=True,
                 fast_path_extract=True,
                 target=GenericProcessor(),
                 output_file="log1pf.c",
                 function_name="log1pf"):
        # declaring CodeFunction and retrieving input variable
        self.function_name = function_name
        self.precision = precision
        self.processor = target
        func_implementation = CodeFunction(self.function_name,
                                           output_format=self.precision)
        vx = func_implementation.add_input_variable("x", self.precision)

        sollya_precision = self.precision.sollya_object

        # debug utilities
        debugf = ML_Debug(display_format="%f")
        debuglf = ML_Debug(display_format="%lf")
        debugx = ML_Debug(display_format="%x")
        debuglx = ML_Debug(display_format="%\"PRIx64\"", )
        debugd = ML_Debug(display_format="%d",
                          pre_process=lambda v: "(int) %s" % v)
        debugld = ML_Debug(display_format="%ld")
        #debug_lftolx  = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v)
        debug_lftolx = ML_Debug(
            display_format="%\"PRIx64\" ev=%x",
            pre_process=lambda v:
            "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v)
        debug_ddtolx = ML_Debug(
            display_format="%\"PRIx64\" %\"PRIx64\"",
            pre_process=lambda v:
            "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" %
            (v, v))
        debug_dd = ML_Debug(display_format="{.hi=%lf, .lo=%lf}",
                            pre_process=lambda v: "%s.hi, %s.lo" % (v, v))

        # local overloading of RaiseReturn operation
        def ExpRaiseReturn(*args, **kwords):
            kwords["arg_value"] = vx
            kwords["function_name"] = self.function_name
            return RaiseReturn(*args, **kwords)

        log2_hi_value = round(
            log(2),
            self.precision.get_field_size() -
            (self.precision.get_exponent_size() + 1), sollya.RN)
        log2_lo_value = round(
            log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

        log2_hi = Constant(log2_hi_value, precision=self.precision)
        log2_lo = Constant(log2_lo_value, precision=self.precision)

        vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

        int_precision = ML_Int64 if self.precision is ML_Binary64 else ML_Int32

        # retrieving processor inverse approximation table
        dummy_var = Variable("dummy", precision=self.precision)
        dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
        inv_approx_table = self.processor.get_recursive_implementation(
            dummy_div_seed,
            language=None,
            table_getter=lambda self: self.approx_table_map)

        # table creation
        table_index_size = 7
        log_table = ML_Table(dimensions=[2**table_index_size, 2],
                             storage_precision=self.precision)
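        # each row i holds log(inv_value) split into a high word, rounded to a
        # reduced precision, and a low word carrying the remainder, so the pair
        # approximates log(inv_value) to roughly twice the working precision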
        log_table[0][0] = 0.0
        log_table[0][1] = 0.0
        for i in range(1, 2**table_index_size):
            #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
            inv_value = (1.0 + (inv_approx_table[i][0] / S2**9)) * S2**-1
            value_high = round(
                log(inv_value),
                self.precision.get_field_size() -
                (self.precision.get_exponent_size() + 1), sollya.RN)
            value_low = round(
                log(inv_value) - value_high, sollya_precision, sollya.RN)
            log_table[i][0] = value_high
            log_table[i][1] = value_low

        vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

        # case close to 0: ctz
        ctz_exp_limit = -7
        ctz_cond = vx_exp < ctz_exp_limit
        ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

        ctz_poly_degree = sup(
            guessdegree(
                log1p(sollya.x) / sollya.x, ctz_interval, S2**
                -(self.precision.get_field_size() + 1))) + 1
        ctz_poly_object = Polynomial.build_from_approximation(
            log1p(sollya.x) / sollya.x, ctz_poly_degree,
            [self.precision] * (ctz_poly_degree + 1), ctz_interval,
            sollya.absolute)

        print "generating polynomial evaluation scheme"
        ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            ctz_poly_object, vx, unified_precision=self.precision)
        ctz_poly.set_attributes(tag="ctz_poly", debug=debug_lftolx)

        ctz_result = vx * ctz_poly
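        # near zero, forming 1 + x would lose the low-order bits of x, so
        # log1p(x) is evaluated directly as x * P(x), with P approximating
        # log1p(x)/x over ctz_interval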

        neg_input = Comparison(vx,
                               -1,
                               likely=False,
                               specifier=Comparison.Less,
                               debug=debugd,
                               tag="neg_input")
        vx_nan_or_inf = Test(vx,
                             specifier=Test.IsInfOrNaN,
                             likely=False,
                             debug=debugd,
                             tag="nan_or_inf")
        vx_snan = Test(vx,
                       specifier=Test.IsSignalingNaN,
                       likely=False,
                       debug=debugd,
                       tag="snan")
        vx_inf = Test(vx,
                      specifier=Test.IsInfty,
                      likely=False,
                      debug=debugd,
                      tag="inf")
        vx_subnormal = Test(vx,
                            specifier=Test.IsSubnormal,
                            likely=False,
                            debug=debugd,
                            tag="vx_subnormal")

        log_function_code = CodeFunction(
            "new_log", [Variable("x", precision=ML_Binary64)],
            output_format=ML_Binary64)
        log_call_generator = FunctionOperator(
            log_function_code.get_name(),
            arity=1,
            output_precision=ML_Binary64,
            declare_prototype=log_function_code)
        newlog_function = FunctionObject(log_function_code.get_name(),
                                         (ML_Binary64, ), ML_Binary64,
                                         log_call_generator)

        # case away from 0.0
        pre_vxp1 = vx + 1.0
        pre_vxp1.set_attributes(tag="pre_vxp1", debug=debug_lftolx)
        pre_vxp1_exp = ExponentExtraction(pre_vxp1,
                                          tag="pre_vxp1_exp",
                                          debug=debugd)
        cm500 = Constant(-500, precision=ML_Int32)
        c0 = Constant(0, precision=ML_Int32)
        cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size() -
                                          2)
        scaling_factor_exp = Select(cond_scaling, cm500, c0)
        scaling_factor = ExponentInsertion(scaling_factor_exp,
                                           precision=self.precision,
                                           tag="scaling_factor")

        vxp1 = pre_vxp1 * scaling_factor
        vxp1.set_attributes(tag="vxp1", debug=debug_lftolx)
        vxp1_exp = ExponentExtraction(vxp1, tag="vxp1_exp", debug=debugd)

        vxp1_inv = DivisionSeed(vxp1,
                                precision=self.precision,
                                tag="vxp1_inv",
                                debug=debug_lftolx,
                                silent=True)

        vxp1_dirty_inv = ExponentInsertion(-vxp1_exp,
                                           precision=self.precision,
                                           tag="vxp1_dirty_inv",
                                           debug=debug_lftolx)

        table_index = BitLogicAnd(BitLogicRightShift(
            TypeCast(vxp1, precision=int_precision, debug=debuglx),
            self.precision.get_field_size() - 7,
            debug=debuglx),
                                  0x7f,
                                  tag="table_index",
                                  debug=debuglx)
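        # table_index is built from the 7 leading fraction bits of vxp1,
        # selecting one of the 2^7 entries of log_table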

        # argument reduction
        # TODO: detect if single operand inverse seed is supported by the targeted architecture
        pre_arg_red_index = TypeCast(BitLogicAnd(TypeCast(vxp1_inv,
                                                          precision=ML_UInt64),
                                                 Constant(-2,
                                                          precision=ML_UInt64),
                                                 precision=ML_UInt64),
                                     precision=self.precision,
                                     tag="pre_arg_red_index",
                                     debug=debug_lftolx)
        arg_red_index = Select(Equal(table_index, 0),
                               vxp1_dirty_inv,
                               pre_arg_red_index,
                               tag="arg_red_index",
                               debug=debug_lftolx)

        red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0,
                          (arg_red_index * vx - 1.0) + arg_red_index)
        #red_vxp1 = arg_red_index * vxp1 - 1.0
        red_vxp1.set_attributes(tag="red_vxp1", debug=debug_lftolx)

        log_inv_lo = TableLoad(log_table,
                               table_index,
                               1,
                               tag="log_inv_lo",
                               debug=debug_lftolx)
        log_inv_hi = TableLoad(log_table,
                               table_index,
                               0,
                               tag="log_inv_hi",
                               debug=debug_lftolx)

        inv_err = S2**-6  # TODO: link to target DivisionSeed precision

        print "building mathematical polynomial"
        approx_interval = Interval(-inv_err, inv_err)
        poly_degree = sup(
            guessdegree(
                log(1 + sollya.x) / sollya.x, approx_interval, S2**
                -(self.precision.get_field_size() + 1))) + 1
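        # guessdegree estimates the degree needed to reach an absolute error of
        # about 2^-(field_size+1) over approx_interval; the +1 adds one degree
        # of safety margin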
        global_poly_object = Polynomial.build_from_approximation(
            log(1 + sollya.x) / sollya.x, poly_degree,
            [self.precision] * (poly_degree + 1), approx_interval,
            sollya.absolute)
        poly_object = global_poly_object.sub_poly(start_index=1)

        print "generating polynomial evaluation scheme"
        _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object, red_vxp1, unified_precision=self.precision)
        _poly.set_attributes(tag="poly", debug=debug_lftolx)
        print(global_poly_object.get_sollya_object())

        vxp1_inv_exp = ExponentExtraction(vxp1_inv,
                                          tag="vxp1_inv_exp",
                                          debug=debugd)
        corr_exp = -vxp1_exp + scaling_factor_exp  # vxp1_inv_exp
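        # intended reconstruction, as suggested by the operations below:
        # log1p(x) ~= -corr_exp * log(2) - log(inv) + (r + r * _poly), with
        # r = red_vxp1 and log(inv) read from the table as a hi/lo pair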

        #poly = (red_vxp1) * (1 +  _poly)
        #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

        pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly +
                                    (-corr_exp * log2_lo - log_inv_lo))
        pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
        exact_log2_hi_exp = -corr_exp * log2_hi
        exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp",
                                         debug=debug_lftolx,
                                         prevent_optimization=True)
        #std_result =  exact_log2_hi_exp + pre_result

        exact_log2_lo_exp = -corr_exp * log2_lo
        exact_log2_lo_exp.set_attributes(
            tag="exact_log2_lo_exp",
            debug=debug_lftolx)  #, prevent_optimization = True)

        init = exact_log2_lo_exp - log_inv_lo
        init.set_attributes(tag="init",
                            debug=debug_lftolx,
                            prevent_optimization=True)
        fma0 = (red_vxp1 * _poly + init)  # - log_inv_lo)
        fma0.set_attributes(tag="fma0", debug=debug_lftolx)
        step0 = fma0
        step0.set_attributes(
            tag="step0", debug=debug_lftolx)  #, prevent_optimization = True)
        step1 = step0 + red_vxp1
        step1.set_attributes(tag="step1",
                             debug=debug_lftolx,
                             prevent_optimization=True)
        step2 = -log_inv_hi + step1
        step2.set_attributes(tag="step2",
                             debug=debug_lftolx,
                             prevent_optimization=True)
        std_result = exact_log2_hi_exp + step2
        std_result.set_attributes(tag="std_result",
                                  debug=debug_lftolx,
                                  prevent_optimization=True)

        # main scheme
        print "MDL scheme"
        pre_scheme = ConditionBlock(
            neg_input,
            Statement(ClearException(), Raise(ML_FPE_Invalid),
                      Return(FP_QNaN(self.precision))),
            ConditionBlock(
                vx_nan_or_inf,
                ConditionBlock(
                    vx_inf,
                    Statement(
                        ClearException(),
                        Return(FP_PlusInfty(self.precision)),
                    ),
                    Statement(ClearException(),
                              ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                              Return(FP_QNaN(self.precision)))),
                ConditionBlock(
                    vx_subnormal, Return(vx),
                    ConditionBlock(ctz_cond, Statement(Return(ctz_result), ),
                                   Statement(Return(std_result))))))
        scheme = pre_scheme

        #print scheme.get_str(depth = None, display_precision = True)

        opt_eng = OptimizationEngine(self.processor)

        # fusing FMA
        print "MDL fusing FMA"
        scheme = opt_eng.fuse_multiply_add(scheme, silence=True)

        print "MDL abstract scheme"
        opt_eng.instantiate_abstract_precision(scheme, None)

        #print scheme.get_str(depth = None, display_precision = True)

        print "MDL instantiated scheme"
        opt_eng.instantiate_precision(scheme, default_precision=ML_Binary32)

        print "subexpression sharing"
        opt_eng.subexpression_sharing(scheme)

        print "silencing operation"
        opt_eng.silence_fp_operations(scheme)

        # registering scheme as function implementation
        func_implementation.set_scheme(scheme)

        # check processor support
        opt_eng.check_processor_support(scheme)

        # factorizing fast path
        opt_eng.factorize_fast_path(scheme)
        #print scheme.get_str(depth = None, display_precision = True)

        cg = CCodeGenerator(self.processor,
                            declare_cst=False,
                            disable_debug=not debug_flag,
                            libm_compliant=libm_compliant)
        self.result = func_implementation.get_definition(cg,
                                                         C_Code,
                                                         static_cst=True)
        self.result.add_header("support_lib/ml_special_values.h")
        self.result.add_header("math.h")
        self.result.add_header("stdio.h")
        self.result.add_header("inttypes.h")
        #print self.result.get(cg)
        output_stream = open("%s.c" % func_implementation.get_name(), "w")
        output_stream.write(self.result.get(cg))
        output_stream.close()
Example #16
    def generate_bench(self, processor, test_num=1000, unroll_factor=10):
        """ generate performance bench for self.op_class """
        initial_inputs = [
            Constant(random.uniform(inf(self.init_interval),
                                    sup(self.init_interval)),
                     precision=precision)
            for i, precision in enumerate(self.input_precisions)
        ]

        var_inputs = [
            Variable("var_%d" % i,
                     precision=FormatAttributeWrapper(precision, ["volatile"]),
                     var_type=Variable.Local)
            for i, precision in enumerate(self.input_precisions)
        ]

        printf_timing_op = FunctionOperator(
            "printf",
            arg_map={
                0: "\"%s[%s] %%lld elts computed "\
                   "in %%lld cycles =>\\n     %%.3f CPE \\n\"" %
                (
                    self.bench_name,
                    self.output_precision.get_display_format()
                ),
                1: FO_Arg(0),
                2: FO_Arg(1),
                3: FO_Arg(2),
                4: FO_Arg(3)
            }, void_function=True
        )
        printf_timing_function = FunctionObject(
            "printf", [self.output_precision, ML_Int64, ML_Int64, ML_Binary64],
            ML_Void, printf_timing_op)
        timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local)

        void_function_op = FunctionOperator("(void)",
                                            arity=1,
                                            void_function=True)
        void_function = FunctionObject("(void)", [self.output_precision],
                                       ML_Void, void_function_op)

        # initialization of operation inputs
        init_assign = metaop.Statement()
        for var_input, init_value in zip(var_inputs, initial_inputs):
            init_assign.push(ReferenceAssign(var_input, init_value))

        # test loop
        loop_i = Variable("i", precision=ML_Int64, var_type=Variable.Local)
        test_num_cst = Constant(test_num // unroll_factor,
                                precision=ML_Int64,
                                tag="test_num")

        # Goal: build a chain of dependent operations to measure
        # elementary operation latency
        local_inputs = tuple(var_inputs)
        local_result = self.op_class(*local_inputs,
                                     precision=self.output_precision,
                                     unbreakable=True)
        for i in range(unroll_factor - 1):
            local_inputs = tuple([local_result] + var_inputs[1:])
            local_result = self.op_class(*local_inputs,
                                         precision=self.output_precision,
                                         unbreakable=True)
        # renormalisation
        local_result = self.renorm_function(local_result)

        # variable assignment to build the dependency chain
        var_assign = Statement()
        var_assign.push(ReferenceAssign(var_inputs[0], local_result))
        final_value = var_inputs[0]

        # loop increment value
        loop_increment = 1

        test_loop = Loop(
            ReferenceAssign(loop_i, Constant(0, precision=ML_Int32)),
            loop_i < test_num_cst,
            Statement(var_assign,
                      ReferenceAssign(loop_i, loop_i + loop_increment)),
        )

        # bench scheme
        test_scheme = Statement(
            ReferenceAssign(timer, processor.get_current_timestamp()),
            init_assign,
            test_loop,
            ReferenceAssign(
                timer,
                Subtraction(processor.get_current_timestamp(),
                            timer,
                            precision=ML_Int64)),
            # prevent intermediary variable simplification
            void_function(final_value),
            printf_timing_function(
                final_value, Constant(test_num, precision=ML_Int64), timer,
                Division(Conversion(timer, precision=ML_Binary64),
                         Constant(test_num, precision=ML_Binary64),
                         precision=ML_Binary64))
            # ,Return(Constant(0, precision = ML_Int32))
        )

        return test_scheme
Example #17
  def generate_scheme(self):
    vx = self.implementation.add_input_variable("x", self.precision) 
    sollya_precision = self.get_input_precision().sollya_object

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)


    log2_hi_value = round(log(2), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
    log2_lo_value = round(log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

    log2_hi = Constant(log2_hi_value, precision = self.precision)
    log2_lo = Constant(log2_lo_value, precision = self.precision)

    vx_exp  = ExponentExtraction(vx, tag = "vx_exp", debug = debugd)

    int_precision = self.precision.get_integer_format()

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision = self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision = self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(dummy_div_seed, language = None, table_getter = lambda self: self.approx_table_map)

    # table creation
    table_index_size = 7
    log_table = ML_NewTable(dimensions = [2**table_index_size, 2], storage_precision = self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = inv_approx_table[i] # (1.0 + (inv_approx_table[i] / S2**9) ) * S2**-1
        value_high = round(log(inv_value), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low


    vx_exp = ExponentExtraction(vx, tag = "vx_exp", debug = debugd)

    # case close to 0: ctz
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

    ctz_poly_degree = sup(guessdegree(log1p(sollya.x)/sollya.x, ctz_interval, S2**-(self.precision.get_field_size()+1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(log1p(sollya.x)/sollya.x, ctz_poly_degree, [self.precision]*(ctz_poly_degree+1), ctz_interval, sollya.absolute)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(ctz_poly_object, vx, unified_precision = self.precision)
    ctz_poly.set_attributes(tag = "ctz_poly", debug = debug_lftolx)

    ctz_result = vx * ctz_poly

    neg_input = Comparison(vx, -1, likely = False, specifier = Comparison.Less, debug = debugd, tag = "neg_input")
    vx_nan_or_inf = Test(vx, specifier = Test.IsInfOrNaN, likely = False, debug = debugd, tag = "nan_or_inf")
    vx_snan = Test(vx, specifier = Test.IsSignalingNaN, likely = False, debug = debugd, tag = "snan")
    vx_inf  = Test(vx, specifier = Test.IsInfty, likely = False, debug = debugd, tag = "inf")
    vx_subnormal = Test(vx, specifier = Test.IsSubnormal, likely = False, debug = debugd, tag = "vx_subnormal")
    
    log_function_code = CodeFunction("new_log", [Variable("x", precision = ML_Binary64)], output_format = ML_Binary64) 
    log_call_generator = FunctionOperator(log_function_code.get_name(), arity = 1, output_precision = ML_Binary64, declare_prototype = log_function_code)
    newlog_function = FunctionObject(log_function_code.get_name(), (ML_Binary64,), ML_Binary64, log_call_generator)


    # case away from 0.0
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag = "pre_vxp1", debug = debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag = "pre_vxp1_exp", debug = debugd)
    cm500 = Constant(-500, precision = ML_Int32)
    c0 = Constant(0, precision = ML_Int32)
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size()-2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(scaling_factor_exp, precision = self.precision, tag = "scaling_factor")

    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag = "vxp1", debug = debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag = "vxp1_exp", debug = debugd)

    vxp1_inv = ReciprocalSeed(vxp1, precision = self.precision, tag = "vxp1_inv", debug = debug_lftolx, silent = True)

    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp, precision = self.precision, tag = "vxp1_dirty_inv", debug = debug_lftolx)

    table_index = BitLogicAnd(BitLogicRightShift(TypeCast(vxp1, precision = int_precision, debug = debuglx), self.precision.get_field_size() - 7, debug = debuglx), 0x7f, tag = "table_index", debug = debuglx) 

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    pre_arg_red_index = TypeCast(BitLogicAnd(TypeCast(vxp1_inv, precision = ML_UInt64), Constant(-2, precision = ML_UInt64), precision = ML_UInt64), precision = self.precision, tag = "pre_arg_red_index", debug = debug_lftolx)
    arg_red_index = Select(Equal(table_index, 0), vxp1_dirty_inv, pre_arg_red_index, tag = "arg_red_index", debug = debug_lftolx)

    red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0, (arg_red_index * vx - 1.0) + arg_red_index)
    #red_vxp1 = arg_red_index * vxp1 - 1.0
    red_vxp1.set_attributes(tag = "red_vxp1", debug = debug_lftolx)

    log_inv_lo = TableLoad(log_table, table_index, 1, tag = "log_inv_lo", debug = debug_lftolx) 
    log_inv_hi = TableLoad(log_table, table_index, 0, tag = "log_inv_hi", debug = debug_lftolx)

    inv_err = S2**-6 # TODO: link to target DivisionSeed precision

    Log.report(Log.Info, "building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(guessdegree(log(1+sollya.x)/sollya.x, approx_interval, S2**-(self.precision.get_field_size()+1))) + 1
    global_poly_object = Polynomial.build_from_approximation(log(1+sollya.x)/sollya.x, poly_degree, [self.precision]*(poly_degree+1), approx_interval, sollya.absolute)
    poly_object = global_poly_object.sub_poly(start_index = 1)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, red_vxp1, unified_precision = self.precision)
    _poly.set_attributes(tag = "poly", debug = debug_lftolx)
    Log.report(Log.Info, global_poly_object.get_sollya_object())


    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag = "vxp1_inv_exp", debug = debugd)
    corr_exp = Conversion(-vxp1_exp + scaling_factor_exp, precision = self.precision)# vxp1_inv_exp
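    # the exponent correction is converted to the working floating-point
    # format before it scales the log2_hi / log2_lo constants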

    #poly = (red_vxp1) * (1 +  _poly)
    #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly + (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag = "pre_result", debug = debug_lftolx)
    exact_log2_hi_exp = - corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag = "exact_log2_hi_exp", debug = debug_lftolx, prevent_optimization = True)
    #std_result =  exact_log2_hi_exp + pre_result

    exact_log2_lo_exp = - corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(tag = "exact_log2_lo_exp", debug = debug_lftolx)#, prevent_optimization = True)
    
    init = exact_log2_lo_exp  - log_inv_lo
    init.set_attributes(tag = "init", debug = debug_lftolx, prevent_optimization = True)
    fma0 = (red_vxp1 * _poly + init) # - log_inv_lo)
    fma0.set_attributes(tag = "fma0", debug = debug_lftolx)
    step0 = fma0 
    step0.set_attributes(tag = "step0", debug = debug_lftolx) #, prevent_optimization = True)
    step1 = step0 + red_vxp1
    step1.set_attributes(tag = "step1", debug = debug_lftolx, prevent_optimization = True)
    step2 = -log_inv_hi + step1
    step2.set_attributes(tag = "step2", debug = debug_lftolx, prevent_optimization = True)
    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag = "std_result", debug = debug_lftolx, prevent_optimization = True)


    # main scheme
    Log.report(Log.Info, "MDL scheme")
    pre_scheme = ConditionBlock(neg_input,
        Statement(
            ClearException(),
            Raise(ML_FPE_Invalid),
            Return(FP_QNaN(self.precision))
        ),
        ConditionBlock(vx_nan_or_inf,
            ConditionBlock(vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(
                    ClearException(),
                    ConditionBlock(vx_snan,
                        Raise(ML_FPE_Invalid)
                    ),
                    Return(FP_QNaN(self.precision))
                )
            ),
            ConditionBlock(vx_subnormal,
                Return(vx),
                ConditionBlock(ctz_cond,
                    Statement(
                        Return(ctz_result),
                    ),
                    Statement(
                        Return(std_result)
                    )
                )
            )
        )
    )
    scheme = pre_scheme
    return scheme