def generate_test_wrapper(self, tensor_descriptors, input_tables, output_tables):
    """Build a FunctionGroup containing the test entry point for a tensor function.

    The wrapper runs the tensor check loop produced by
    self.generate_tensor_check_loop, prints a success banner when the loop
    completes and returns 0.

    :param tensor_descriptors: descriptors of the tensors under test
    :param input_tables: tables holding the test inputs
    :param output_tables: tables receiving the tested function's outputs
    :return: FunctionGroup wrapping the generated test_wrapper function
    """
    # function under test and its name (used in the success message)
    fut = self.implementation.get_function_object()
    fut_name = self.implementation.get_name()

    # external helper used to report check failures (resolved at link time)
    report_failure_op = FunctionOperator("report_failure")
    report_failure_fct = FunctionObject("report_failure", [], ML_Void,
                                        report_failure_op)

    # printf call emitting the success banner
    success_printf_op = FunctionOperator(
        "printf",
        arg_map={0: "\"test successful %s\\n\"" % fut_name},
        void_function=True,
        require_header=["stdio.h"])
    success_printf_fct = FunctionObject("printf", [], ML_Void,
                                        success_printf_op)

    # accumulate element number
    acc_num = Variable("acc_num", precision=ML_Int64, var_type=Variable.Local)

    # loop checking every produced output against the expected values
    check_loop = self.get_tensor_test_wrapper(
        fut, tensor_descriptors, input_tables, output_tables, acc_num,
        self.generate_tensor_check_loop)

    # common test scheme between scalar and vector functions
    wrapper = CodeFunction("test_wrapper", output_format=ML_Int32)
    wrapper.set_scheme(
        Statement(check_loop, success_printf_fct(),
                  Return(Constant(0, precision=ML_Int32))))
    return FunctionGroup([wrapper])
def externalize_call(self, optree, arg_list, tag = "foo", result_format = None):
    """Externalize the sub-graph @p optree into a free-standing meta-function.

    @param optree operation graph to be turned into a function body
    @param arg_list list of @p optree's parameters to become function arguments
    @param tag string seed used to generate a unique function name
    @param result_format hint for the function's return format (required when
           @p optree is not an arithmetic operation, e.g. already contains a
           Return node)
    @return CodeFunction object containing the function implementation (the
            function is also declared into self.name_factory)

    This method previously duplicated generate_function_from_optree almost
    line-for-line, but without its const-table de-duplication and with a
    broken `assert(cond and "msg")` idiom whose message could never be
    reported. Delegating keeps a single, corrected implementation.
    """
    return generate_function_from_optree(
        self.name_factory, optree, arg_list, tag, result_format)
def generate_function_from_optree(name_factory, optree, arg_list, tag="foo", result_format=None):
    """ Function which transform a sub-graph @p optree whose
        inputs are @p arg_list into a meta function
        @param optree operation graph to be incorporated as function body
        @param arg_list list of @p optree's parameters to be used as function arguments
        @param name_factory engine to generate unique function name and to register function
        @param tag string to be used as seed to generate function name
        @param result_format hint to indicate function's return format (if optree is not
            an arithmetic operation (e.g. it already contains a Return node, then
            @p result_format must be used to specify the function return format)

        @return CodeFunction object containing the function implementation
        (plus the function would have been declared into name_factory) """
    # determining return format
    return_format = optree.get_precision() if result_format is None else result_format
    # NOTE: the original `assert(cond and "msg")` form evaluated the message as
    # part of the condition, so a failure never displayed it; the two-operand
    # assert below reports the message properly.
    assert return_format is not None, "external call result format must be defined"
    function_name = name_factory.declare_free_function_name(tag)
    ext_function = CodeFunction(function_name, output_format=return_format)
    # creating argument copy
    arg_map = {}
    arg_index = 0
    for arg in arg_list:
        arg_tag = arg.get_tag(default="arg_%d" % arg_index)
        arg_index += 1
        arg_map[arg] = ext_function.add_input_variable(arg_tag, arg.get_precision())
    # extracting const table to make sure they are not duplicated
    table_set = extract_tables(optree)
    arg_map.update({table: table for table in table_set if table.const})
    # copying optree while swapping argument for variables
    optree_copy = optree.copy(copy_map=arg_map)
    # instantiating external function scheme: an arithmetic root node must be
    # wrapped into a Return, a statement-like root is used as-is
    if isinstance(optree, ML_ArithmeticOperation):
        function_optree = Statement(Return(optree_copy))
    else:
        function_optree = Statement(optree_copy)
    ext_function.set_scheme(function_optree)
    name_factory.declare_function(function_name, ext_function.get_function_object())
    return ext_function
def __init__(self, precision=ML_Binary32, abs_accuracy=S2**-24, libm_compliant=True, debug_flag=False, fuse_fma=True, fast_path_extract=True, target=GenericProcessor(), output_file="log1pf.c", function_name="log1pf"):
    """Build and emit a C implementation of log1p(x) = log(1 + x).

    The constructor builds the whole operation graph (special-case handling,
    close-to-zero polynomial path and table-based argument-reduction path),
    runs the optimization passes in a fixed order and writes the generated C
    source to "<function_name>.c".

    :param precision: target floating-point format of the generated function
    :param abs_accuracy: targeted absolute accuracy (currently unused below)
    :param libm_compliant: forwarded to the C code generator
    :param debug_flag: when True, keeps debug display statements in the output
    :param fuse_fma: unused in this constructor (FMA fusion is always applied)
    :param fast_path_extract: unused in this constructor
    :param target: processor description used for support checks and tables
    :param output_file: NOTE(review): unused — the output path is derived from
        the function name instead; confirm whether it should be honored
    :param function_name: name of the generated C function

    NOTE(review): `target=GenericProcessor()` is evaluated once at def time
    and shared across calls — acceptable only if GenericProcessor is
    stateless; confirm.
    """
    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    self.precision = precision
    self.processor = target
    func_implementation = CodeFunction(self.function_name, output_format=self.precision)
    vx = func_implementation.add_input_variable("x", self.precision)
    sollya_precision = self.precision.sollya_object

    # debug utilities: display-format descriptors attached to intermediate
    # nodes so the generated C code can print them when debug is enabled
    debugf = ML_Debug(display_format="%f")
    debuglf = ML_Debug(display_format="%lf")
    debugx = ML_Debug(display_format="%x")
    debuglx = ML_Debug(display_format="%\"PRIx64\"", )
    debugd = ML_Debug(display_format="%d", pre_process=lambda v: "(int) %s" % v)
    debugld = ML_Debug(display_format="%ld")
    #debug_lftolx = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v)
    debug_lftolx = ML_Debug(
        display_format="%\"PRIx64\" ev=%x",
        pre_process=lambda v: "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v)
    debug_ddtolx = ML_Debug(
        display_format="%\"PRIx64\" %\"PRIx64\"",
        pre_process=lambda v: "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" % (v, v))
    debug_dd = ML_Debug(display_format="{.hi=%lf, .lo=%lf}",
                        pre_process=lambda v: "%s.hi, %s.lo" % (v, v))

    # local overloading of RaiseReturn operation: binds the input value and
    # the function name so exception-raising returns carry proper context
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # split log(2) into a high part (rounded short so products are exact)
    # and a low correction part
    log2_hi_value = round(
        log(2),
        self.precision.get_field_size() - (self.precision.get_exponent_size() + 1),
        sollya.RN)
    log2_lo_value = round(log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)
    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    # integer format matching the floating-point format's bit width
    int_precision = ML_Int64 if self.precision is ML_Binary64 else ML_Int32

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    # table creation: 2^7 entries of (log(1/inv_value) hi, lo) pairs used by
    # the argument-reduction path
    table_index_size = 7
    log_table = ML_Table(dimensions=[2**table_index_size, 2], storage_precision=self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in xrange(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = (1.0 + (inv_approx_table[i][0] / S2**9)) * S2**-1
        # high part rounded short so that multiplications stay exact
        value_high = round(
            log(inv_value),
            self.precision.get_field_size() - (self.precision.get_exponent_size() + 1),
            sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    # NOTE(review): duplicate of the vx_exp node built above — harmless but
    # redundant; confirm before removing
    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    # case close to 0: ctz — direct polynomial approximation of
    # log1p(x)/x on a small interval around 0 (no argument reduction)
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)
    ctz_poly_degree = sup(
        guessdegree(
            log1p(sollya.x) / sollya.x, ctz_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(
        log1p(sollya.x) / sollya.x, ctz_poly_degree,
        [self.precision] * (ctz_poly_degree + 1), ctz_interval, sollya.absolute)
    print "generating polynomial evaluation scheme"
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        ctz_poly_object, vx, unified_precision=self.precision)
    ctz_poly.set_attributes(tag="ctz_poly", debug=debug_lftolx)
    ctz_result = vx * ctz_poly

    # special-case predicates on the input
    neg_input = Comparison(vx, -1, likely=False, specifier=Comparison.Less,
                           debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debugd, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False, debug=debugd, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debugd, tag="vx_subnormal")

    # prototype of an external "new_log" function (declared, not generated here)
    log_function_code = CodeFunction(
        "new_log", [Variable("x", precision=ML_Binary64)], output_format=ML_Binary64)
    log_call_generator = FunctionOperator(
        log_function_code.get_name(), arity=1, output_precision=ML_Binary64,
        declare_prototype=log_function_code)
    newlog_function = FunctionObject(log_function_code.get_name(), (ML_Binary64, ),
                                     ML_Binary64, log_call_generator)

    # case away from 0.0: compute vxp1 = vx + 1, scale it down when its
    # exponent is large to avoid overflow in later products
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag="pre_vxp1", debug=debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag="pre_vxp1_exp", debug=debugd)
    cm500 = Constant(-500, precision=ML_Int32)
    c0 = Constant(0, precision=ML_Int32)
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size() - 2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(scaling_factor_exp,
                                       precision=self.precision, tag="scaling_factor")
    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag="vxp1", debug=debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag="vxp1_exp", debug=debugd)
    # hardware inverse seed of vxp1, and a fallback "dirty" inverse built
    # from the exponent alone (used when the table index is 0)
    vxp1_inv = DivisionSeed(vxp1, precision=self.precision, tag="vxp1_inv",
                            debug=debug_lftolx, silent=True)
    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp, precision=self.precision,
                                       tag="vxp1_dirty_inv", debug=debug_lftolx)
    # top 7 mantissa bits of vxp1 select the log-table entry
    table_index = BitLogicAnd(BitLogicRightShift(
        TypeCast(vxp1, precision=int_precision, debug=debuglx),
        self.precision.get_field_size() - 7, debug=debuglx), 0x7f,
        tag="table_index", debug=debuglx)

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    # clear the seed's last mantissa bit so the reduction multiplier is exact
    pre_arg_red_index = TypeCast(BitLogicAnd(TypeCast(vxp1_inv, precision=ML_UInt64),
                                             Constant(-2, precision=ML_UInt64),
                                             precision=ML_UInt64),
                                 precision=self.precision,
                                 tag="pre_arg_red_index", debug=debug_lftolx)
    arg_red_index = Select(Equal(table_index, 0), vxp1_dirty_inv,
                           pre_arg_red_index, tag="arg_red_index", debug=debug_lftolx)
    # reduced argument: r * (1+x) - 1, expanded differently when scaling
    # applied to keep the computation exact
    red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0,
                      (arg_red_index * vx - 1.0) + arg_red_index)
    #red_vxp1 = arg_red_index * vxp1 - 1.0
    red_vxp1.set_attributes(tag="red_vxp1", debug=debug_lftolx)
    log_inv_lo = TableLoad(log_table, table_index, 1, tag="log_inv_lo", debug=debug_lftolx)
    log_inv_hi = TableLoad(log_table, table_index, 0, tag="log_inv_hi", debug=debug_lftolx)

    inv_err = S2**-6  # TODO: link to target DivisionSeed precision

    # polynomial approximation of log(1+x)/x on the reduced interval
    print "building mathematical polynomial"
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(
        guessdegree(
            log(1 + sollya.x) / sollya.x, approx_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    global_poly_object = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree,
        [self.precision] * (poly_degree + 1), approx_interval, sollya.absolute)
    # drop the degree-0 coefficient: it is applied separately as red_vxp1
    poly_object = global_poly_object.sub_poly(start_index=1)
    print "generating polynomial evaluation scheme"
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, red_vxp1, unified_precision=self.precision)
    _poly.set_attributes(tag="poly", debug=debug_lftolx)
    print global_poly_object.get_sollya_object()

    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag="vxp1_inv_exp", debug=debugd)
    # exponent correction: undo both the vxp1 normalization and the scaling
    corr_exp = -vxp1_exp + scaling_factor_exp  # vxp1_inv_exp

    #poly = (red_vxp1) * (1 + _poly)
    #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

    # reconstruction: log(1+x) = corr_exp*log2 - log(inv) + log(1+red)
    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly +
                                (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
    exact_log2_hi_exp = -corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp", debug=debug_lftolx,
                                     prevent_optimization=True)
    #std_result = exact_log2_hi_exp + pre_result
    exact_log2_lo_exp = -corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(tag="exact_log2_lo_exp", debug=debug_lftolx)  #, prevent_optimization = True)

    # ordered summation from smallest to largest magnitude; each step is
    # tagged and protected from re-association by prevent_optimization
    init = exact_log2_lo_exp - log_inv_lo
    init.set_attributes(tag="init", debug=debug_lftolx, prevent_optimization=True)
    fma0 = (red_vxp1 * _poly + init)  # - log_inv_lo)
    fma0.set_attributes(tag="fma0", debug=debug_lftolx)
    step0 = fma0
    step0.set_attributes(tag="step0", debug=debug_lftolx)  #, prevent_optimization = True)
    step1 = step0 + red_vxp1
    step1.set_attributes(tag="step1", debug=debug_lftolx, prevent_optimization=True)
    step2 = -log_inv_hi + step1
    step2.set_attributes(tag="step2", debug=debug_lftolx, prevent_optimization=True)
    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag="std_result", debug=debug_lftolx,
                              prevent_optimization=True)

    # main scheme: special cases first, then subnormal/ctz/standard paths
    print "MDL scheme"
    pre_scheme = ConditionBlock(
        neg_input,
        # x < -1: invalid operation, return qNaN
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                # NaN input: raise invalid only for signaling NaN
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                # subnormal x: log1p(x) ~= x
                Return(vx),
                ConditionBlock(ctz_cond,
                               Statement(Return(ctz_result), ),
                               Statement(Return(std_result))))))
    scheme = pre_scheme
    #print scheme.get_str(depth = None, display_precision = True)

    # optimization pipeline — pass order is significant
    opt_eng = OptimizationEngine(self.processor)
    # fusing FMA
    print "MDL fusing FMA"
    scheme = opt_eng.fuse_multiply_add(scheme, silence=True)
    print "MDL abstract scheme"
    opt_eng.instantiate_abstract_precision(scheme, None)
    #print scheme.get_str(depth = None, display_precision = True)
    print "MDL instantiated scheme"
    # NOTE(review): default precision is hard-coded to ML_Binary32 here even
    # though self.precision may be ML_Binary64 — confirm intent
    opt_eng.instantiate_precision(scheme, default_precision=ML_Binary32)
    print "subexpression sharing"
    opt_eng.subexpression_sharing(scheme)
    print "silencing operation"
    opt_eng.silence_fp_operations(scheme)
    # registering scheme as function implementation
    func_implementation.set_scheme(scheme)
    # check processor support
    opt_eng.check_processor_support(scheme)
    # factorizing fast path
    opt_eng.factorize_fast_path(scheme)
    #print scheme.get_str(depth = None, display_precision = True)

    # C code generation and output to "<function_name>.c"
    cg = CCodeGenerator(self.processor, declare_cst=False,
                        disable_debug=not debug_flag, libm_compliant=libm_compliant)
    self.result = func_implementation.get_definition(cg, C_Code, static_cst=True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")
    #print self.result.get(cg)
    output_stream = open("%s.c" % func_implementation.get_name(), "w")
    output_stream.write(self.result.get(cg))
    output_stream.close()
def __init__(self, precision=ML_Binary64, abs_accuracy=S2**-24, libm_compliant=True, debug_flag=False, fuse_fma=True, fast_path_extract=True, target=GenericProcessor(), output_file="log_fixed.c", function_name="log_fixed"):
    """Build and emit a C implementation skeleton for a fixed-point log.

    The generated scheme currently only extracts the raw sign/exponent field,
    counts its leading zeros and returns clz + exponent — presumably a
    work-in-progress normalization step for a fixed-point logarithm
    (TODO confirm; no logarithm evaluation is present yet).

    :param precision: target floating-point format of the generated function
    :param abs_accuracy: targeted absolute accuracy (unused below)
    :param libm_compliant: forwarded to the C code generator
    :param debug_flag: when True, keeps debug display statements in the output
    :param fuse_fma: when True, run the FMA-fusion optimization pass
    :param fast_path_extract: unused in this constructor
    :param target: processor description used for support checks
    :param output_file: NOTE(review): unused — output path is derived from the
        function name instead; confirm whether it should be honored
    :param function_name: name of the generated C function
    """
    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    self.precision = precision
    self.processor = target
    func_implementation = CodeFunction(self.function_name, output_format=self.precision)
    vx = func_implementation.add_input_variable("x", self.precision)
    sollya_precision = self.precision.sollya_object

    # debug utilities: display-format descriptors for debug printing
    debugf = ML_Debug(display_format="%f")
    debuglf = ML_Debug(display_format="%lf")
    debugx = ML_Debug(display_format="%x")
    debuglx = ML_Debug(display_format="%\"PRIx64\"", )
    debugd = ML_Debug(display_format="%d", pre_process=lambda v: "(int) %s" % v)
    debugld = ML_Debug(display_format="%ld")
    #debug_lftolx = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v)
    debug_lftolx = ML_Debug(
        display_format="%\"PRIx64\" ev=%x",
        pre_process=lambda v: "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v)
    debug_ddtolx = ML_Debug(
        display_format="%\"PRIx64\" %\"PRIx64\"",
        pre_process=lambda v: "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" % (v, v))
    debug_dd = ML_Debug(display_format="{.hi=%lf, .lo=%lf}",
                        pre_process=lambda v: "%s.hi, %s.lo" % (v, v))

    # raw (biased) sign+exponent field, its leading-zero count, and their sum
    vx_exp = RawSignExpExtraction(vx, tag="vx_exp", precision=ML_Int32, debug=debugd)
    vx_exp_u = Conversion(vx_exp, precision=ML_UInt32)
    vx_exp_u.set_precision(ML_UInt32)
    tt = CountLeadingZeros(vx_exp_u)
    tt_u = Conversion(tt, precision=ML_UInt32)
    t = tt_u + vx_exp_u
    scheme = Statement(Return(t))
    #print scheme.get_str(depth = None, display_precision = True)

    # optimization pipeline — pass order is significant
    opt_eng = OptimizationEngine(self.processor)
    # fusing FMA
    if fuse_fma:
        print "MDL fusing FMA"
        scheme = opt_eng.fuse_multiply_add(scheme, silence=True)
    print "MDL abstract scheme"
    opt_eng.instantiate_abstract_precision(scheme, None)
    #print scheme.get_str(depth = None, display_precision = True)
    print "MDL instantiated scheme"
    opt_eng.instantiate_precision(scheme, default_precision=self.precision)
    print "subexpression sharing"
    opt_eng.subexpression_sharing(scheme)
    print "silencing operation"
    opt_eng.silence_fp_operations(scheme)
    # registering scheme as function implementation
    func_implementation.set_scheme(scheme)
    # check processor support
    opt_eng.check_processor_support(scheme)
    #print scheme.get_str(depth = None, display_precision = True)
    # factorizing fast path
    opt_eng.factorize_fast_path(scheme)
    #print scheme.get_str(depth = None, display_precision = True)

    # C code generation and output to "<function_name>.c"
    cg = CCodeGenerator(self.processor, declare_cst=False,
                        disable_debug=not debug_flag, libm_compliant=libm_compliant)
    self.result = func_implementation.get_definition(cg, C_Code, static_cst=True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")
    #print self.result.get(cg)
    output_stream = open("%s.c" % func_implementation.get_name(), "w")
    output_stream.write(self.result.get(cg))
    output_stream.close()
def __init__(self, precision=ML_Binary32, abs_accuracy=S2**-24, libm_compliant=True, debug_flag=False, fuse_fma=True, fast_path_extract=True, target=GenericProcessor(), output_file="sinf.c", function_name="sinf"): # declaring CodeFunction and retrieving input variable self.function_name = function_name self.precision = precision self.processor = target func_implementation = CodeFunction(self.function_name, output_format=self.precision) vx = func_implementation.add_input_variable("x", self.precision) sollya_precision = self.precision.sollya_object # local overloading of RaiseReturn operation def ExpRaiseReturn(*args, **kwords): kwords["arg_value"] = vx kwords["function_name"] = self.function_name return RaiseReturn(*args, **kwords) test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, debug=True, tag="nan_or_inf") test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test") test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual, debug=True, tag="inf_sign") test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True, tag="is_signaling_nan") return_snan = Statement( ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(ML_Binary32))) int_precision = ML_Int64 if self.precision is ML_Binary64 else ML_Int32 inv_pi_value = 1 / pi # argument reduction mod_pi_x = NearestInteger(vx * inv_pi_value) red_vx = vx - mod_pi_x * pi approx_interval = Interval(0, pi / 2) poly_degree = sup( guessdegree( sin(sollya.x) / sollya.x, approx_interval, S2** -(self.precision.get_field_size() + 1))) + 1 global_poly_object = Polynomial.build_from_approximation( sin(sollya.x) / sollya.x, poly_degree, [self.precision] * (poly_degree + 1), approx_interval, sollya.absolute) poly_object = global_poly_object #.sub_poly(start_index = 1) print "generating polynomial evaluation scheme" _poly = PolynomialSchemeEvaluator.generate_horner_scheme( poly_object, red_vx, unified_precision=self.precision) _poly.set_attributes(tag="poly", debug=debug_lftolx) 
print global_poly_object.get_sollya_object() pre_result = vx * _poly result = pre_result result.set_attributes(tag="result", debug=debug_lftolx) # main scheme print "MDL scheme" scheme = Statement(Return(result)) #print scheme.get_str(depth = None, display_precision = True) opt_eng = OptimizationEngine(self.processor) # fusing FMA print "MDL fusing FMA" scheme = opt_eng.fuse_multiply_add(scheme, silence=True) print "MDL abstract scheme" opt_eng.instantiate_abstract_precision(scheme, None) #print scheme.get_str(depth = None, display_precision = True) print "MDL instantiated scheme" opt_eng.instantiate_precision(scheme, default_precision=ML_Binary32) print "subexpression sharing" opt_eng.subexpression_sharing(scheme) print "silencing operation" opt_eng.silence_fp_operations(scheme) # registering scheme as function implementation func_implementation.set_scheme(scheme) # check processor support opt_eng.check_processor_support(scheme) # factorizing fast path opt_eng.factorize_fast_path(scheme) #print scheme.get_str(depth = None, display_precision = True) cg = CCodeGenerator(self.processor, declare_cst=False, disable_debug=not debug_flag, libm_compliant=libm_compliant) self.result = func_implementation.get_definition(cg, C_Code, static_cst=True) #print self.result.get(cg) output_stream = open("%s.c" % func_implementation.get_name(), "w") output_stream.write(self.result.get(cg)) output_stream.close()
def generate_bench_wrapper(self, test_num=1, loop_num=100000, test_ranges=[Interval(-1.0, 1.0)], debug=False):
    """Build a FunctionGroup containing a benchmark entry point for an
    array-based function.

    The wrapper fills randomly-sized input arrays (prefixed by the standard
    test cases), runs the tested function loop_num times over them while
    timing with the processor's timestamp primitives, prints the measured
    clocks-per-element (CPE) and returns it as a double.

    :param test_num: number of randomly-sized arrays to benchmark
    :param loop_num: number of repetitions of the whole test loop
    :param test_ranges: per-input-array value ranges for random generation.
        NOTE(review): mutable default argument — shared across calls; it is
        only read here, but converting to a None-sentinel default would be
        safer.
    :param debug: unused in this implementation
    :return: FunctionGroup wrapping the generated bench_wrapper function
    """
    # interval where the array lenght is chosen from (randomly)
    index_range = self.test_index_range

    auto_test = CodeFunction("bench_wrapper", output_format=ML_Binary64)

    tested_function = self.implementation.get_function_object()
    function_name = self.implementation.get_name()

    failure_report_op = FunctionOperator("report_failure")
    failure_report_function = FunctionObject("report_failure", [], ML_Void,
                                             failure_report_op)

    printf_success_op = FunctionOperator(
        "printf",
        arg_map={0: "\"test successful %s\\n\"" % function_name},
        void_function=True)
    printf_success_function = FunctionObject("printf", [], ML_Void,
                                             printf_success_op)

    # output storage is marked volatile so the benchmarked stores are not
    # optimized away by the C compiler
    output_precision = FormatAttributeWrapper(self.precision, ["volatile"])

    test_total = test_num

    # number of arrays expected as inputs for tested_function
    NUM_INPUT_ARRAY = 1
    # position of the input array in tested_function operands (generally
    # equals to 1 as to 0-th input is often the destination array)
    INPUT_INDEX_OFFSET = 1

    # concatenating standard test array at the beginning of randomly
    # generated array
    TABLE_SIZE_VALUES = [
        len(std_table) for std_table in self.standard_test_cases
    ] + [
        random.randrange(index_range[0], index_range[1] + 1)
        for i in range(test_num)
    ]
    # per-test offset of each sub-array inside the flat input buffer
    OFFSET_VALUES = [sum(TABLE_SIZE_VALUES[:i]) for i in range(test_total)]

    table_size_offset_array = generate_2d_table(
        test_total, 2, ML_UInt32, self.uniquify_name("table_size_array"),
        value_gen=(lambda row_id: (TABLE_SIZE_VALUES[row_id], OFFSET_VALUES[row_id])))

    INPUT_ARRAY_SIZE = sum(TABLE_SIZE_VALUES)

    # TODO/FIXME: implement proper input range depending on input index
    # assuming a single input array
    input_precisions = [self.get_input_precision(1).get_data_precision()]
    rng_map = [
        get_precision_rng(precision, inf(test_range), sup(test_range))
        for precision, test_range in zip(input_precisions, test_ranges)
    ]

    # generated table of inputs
    # NOTE(review): the value_gen lambdas close over table_id late-bound, but
    # generate_1d_table presumably consumes value_gen eagerly within each
    # iteration, so each table sees its own table_id — confirm if
    # generate_1d_table ever becomes lazy
    input_tables = [
        generate_1d_table(
            INPUT_ARRAY_SIZE,
            self.get_input_precision(INPUT_INDEX_OFFSET + table_id).get_data_precision(),
            self.uniquify_name("input_table_arg%d" % table_id),
            value_gen=(
                lambda _: input_precisions[table_id].round_sollya_object(
                    rng_map[table_id].get_new_value(), sollya.RN)))
        for table_id in range(NUM_INPUT_ARRAY)
    ]

    # generate output_array (left empty; filled by the benchmarked function)
    output_array = generate_1d_table(
        INPUT_ARRAY_SIZE,
        output_precision,
        self.uniquify_name("output_array"),
        #value_gen=(lambda _: FP_QNaN(self.precision))
        value_gen=(lambda _: None),
        const=False,
        empty=True)

    # accumulate element number
    acc_num = Variable("acc_num", precision=ML_Int64, var_type=Variable.Local)

    # benchmark runs without per-element checking: post statement is empty
    def empty_post_statement_gen(input_tables, output_array,
                                 table_size_offset_array, array_offset,
                                 array_len, test_id):
        return Statement()

    test_loop = self.get_array_test_wrapper(test_total, tested_function,
                                            table_size_offset_array,
                                            input_tables, output_array,
                                            acc_num, empty_post_statement_gen)

    timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local)
    printf_timing_op = FunctionOperator(
        "printf",
        arg_map={
            0: "\"%s %%\"PRIi64\" elts computed in %%\"PRIi64\" nanoseconds => %%.3f CPE \\n\"" % function_name,
            1: FO_Arg(0),
            2: FO_Arg(1),
            3: FO_Arg(2)
        },
        void_function=True)
    printf_timing_function = FunctionObject(
        "printf", [ML_Int64, ML_Int64, ML_Binary64], ML_Void, printf_timing_op)

    vj = Variable("j", precision=ML_Int32, var_type=Variable.Local)
    loop_num_cst = Constant(loop_num, precision=ML_Int32, tag="loop_num")
    loop_increment = 1

    # bench measure of clock per element
    cpe_measure = Division(
        Conversion(timer, precision=ML_Binary64),
        Conversion(acc_num, precision=ML_Binary64),
        precision=ML_Binary64,
        tag="cpe_measure",
    )

    # common test scheme between scalar and vector functions:
    # start timer, run the test loop loop_num times, stop timer,
    # print and return the CPE measure
    test_scheme = Statement(
        self.processor.get_init_timestamp(),
        ReferenceAssign(timer, self.processor.get_current_timestamp()),
        ReferenceAssign(acc_num, 0),
        Loop(
            ReferenceAssign(vj, Constant(0, precision=ML_Int32)),
            vj < loop_num_cst,
            Statement(test_loop, ReferenceAssign(vj, vj + loop_increment))),
        ReferenceAssign(
            timer,
            Subtraction(self.processor.get_current_timestamp(), timer,
                        precision=ML_Int64)),
        printf_timing_function(
            Conversion(acc_num, precision=ML_Int64),
            timer,
            cpe_measure,
        ),
        Return(cpe_measure),
        # Return(Constant(0, precision = ML_Int32))
    )

    auto_test.set_scheme(test_scheme)
    return FunctionGroup([auto_test])
def __init__(self, precision=ML_Binary32, abs_accuracy=S2**-24, libm_compliant=True, debug_flag=False, fuse_fma=True, num_iter=3, fast_path_extract=True, target=GenericProcessor(), output_file="__divsf3.c", function_name="__divsf3"): # declaring CodeFunction and retrieving input variable self.precision = precision self.function_name = function_name exp_implementation = CodeFunction(self.function_name, output_format=precision) vx = exp_implementation.add_input_variable("x", precision) vy = exp_implementation.add_input_variable("y", precision) processor = target class NR_Iteration(object): def __init__(self, approx, divisor, force_fma=False): self.approx = approx self.divisor = divisor self.force_fma = force_fma if force_fma: self.error = FusedMultiplyAdd( divisor, approx, 1.0, specifier=FusedMultiplyAdd.SubtractNegate) self.new_approx = FusedMultiplyAdd( self.error, self.approx, self.approx, specifier=FusedMultiplyAdd.Standard) else: self.error = 1 - divisor * approx self.new_approx = self.approx + self.error * self.approx def get_new_approx(self): return self.new_approx def get_hint_rules(self, gcg, gappa_code, exact): divisor = self.divisor.get_handle().get_node() approx = self.approx.get_handle().get_node() new_approx = self.new_approx.get_handle().get_node() Attributes.set_default_precision(ML_Exact) if self.force_fma: rule0 = FusedMultiplyAdd( divisor, approx, 1.0, specifier=FusedMultiplyAdd.SubtractNegate) else: rule0 = 1.0 - divisor * approx rule1 = 1.0 - divisor * (approx - exact) - 1.0 rule2 = new_approx - exact subrule = approx * (2 - divisor * approx) rule3 = (new_approx - subrule ) - (approx - exact) * (approx - exact) * divisor if self.force_fma: new_error = FusedMultiplyAdd( divisor, approx, 1.0, specifier=FusedMultiplyAdd.SubtractNegate) rule4 = FusedMultiplyAdd(new_error, approx, approx) else: rule4 = approx + (1 - divisor * approx) * approx Attributes.unset_default_precision() # registering hints gcg.add_hint(gappa_code, rule0, rule1) 
gcg.add_hint(gappa_code, rule2, rule3) gcg.add_hint(gappa_code, subrule, rule4) debugf = ML_Debug(display_format="%f") debuglf = ML_Debug(display_format="%lf") debugx = ML_Debug(display_format="%x") debuglx = ML_Debug(display_format="%lx") debugd = ML_Debug(display_format="%d") #debug_lftolx = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v) debug_lftolx = ML_Debug( display_format="%\"PRIx64\" ev=%x", pre_process=lambda v: "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v) debug_ddtolx = ML_Debug( display_format="%\"PRIx64\" %\"PRIx64\"", pre_process=lambda v: "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" % (v, v)) debug_dd = ML_Debug(display_format="{.hi=%lf, .lo=%lf}", pre_process=lambda v: "%s.hi, %s.lo" % (v, v)) ex = Max(Min(ExponentExtraction(vx), 1020), -1020, tag="ex", debug=debugd) ey = Max(Min(ExponentExtraction(vy), 1020), -1020, tag="ey", debug=debugd) exact_ex = ExponentExtraction(vx, tag="exact_ex") exact_ey = ExponentExtraction(vy, tag="exact_ey") Attributes.set_default_rounding_mode(ML_RoundToNearest) Attributes.set_default_silent(True) # computing the inverse square root init_approx = None scaling_factor_x = ExponentInsertion(-ex, tag="sfx_ei") scaling_factor_y = ExponentInsertion(-ey, tag="sfy_ei") scaled_vx = vx * scaling_factor_x scaled_vy = vy * scaling_factor_y scaled_vx.set_attributes(debug=debug_lftolx, tag="scaled_vx") scaled_vy.set_attributes(debug=debug_lftolx, tag="scaled_vy") scaled_vx.set_precision(ML_Binary64) scaled_vy.set_precision(ML_Binary64) # forcing vx precision to make processor support test init_approx_precision = DivisionSeed(scaled_vx, scaled_vy, precision=self.precision, tag="seed", debug=debug_lftolx) if not processor.is_supported_operation(init_approx_precision): if self.precision != ML_Binary32: px = Conversion( scaled_vx, precision=ML_Binary32, tag="px", debug=debugf) if self.precision != ML_Binary32 else vx py = Conversion( scaled_vy, 
precision=ML_Binary32, tag="py", debug=debugf) if self.precision != ML_Binary32 else vy init_approx_fp32 = Conversion(DivisionSeed( px, py, precision=ML_Binary32, tag="seed", debug=debugf), precision=self.precision, tag="seed_ext", debug=debug_lftolx) if not processor.is_supported_operation(init_approx_fp32): Log.report( Log.Error, "The target %s does not implement inverse square root seed" % processor) else: init_approx = init_approx_fp32 else: Log.report( Log.Error, "The target %s does not implement inverse square root seed" % processor) else: init_approx = init_approx_precision current_approx_std = init_approx # correctly-rounded inverse computation num_iteration = num_iter Attributes.unset_default_rounding_mode() Attributes.unset_default_silent() def compute_div(_init_approx, _vx=None, _vy=None, scale_result=None): inv_iteration_list = [] Attributes.set_default_rounding_mode(ML_RoundToNearest) Attributes.set_default_silent(True) _current_approx = _init_approx for i in range(num_iteration): new_iteration = NR_Iteration( _current_approx, _vy, force_fma=False if (i != num_iteration - 1) else True) inv_iteration_list.append(new_iteration) _current_approx = new_iteration.get_new_approx() _current_approx.set_attributes(tag="iter_%d" % i, debug=debug_lftolx) def dividend_mult(div_approx, inv_approx, dividend, divisor, index, force_fma=False): #yerr = dividend - div_approx * divisor yerr = FMSN(div_approx, divisor, dividend) yerr.set_attributes(tag="yerr%d" % index, debug=debug_lftolx) #new_div = div_approx + yerr * inv_approx new_div = FMA(yerr, inv_approx, div_approx) new_div.set_attributes(tag="new_div%d" % index, debug=debug_lftolx) return new_div # multiplication correction iteration # to get correctly rounded full division _current_approx.set_attributes(tag="final_approx", debug=debug_lftolx) current_div_approx = _vx * _current_approx num_dividend_mult_iteration = 1 for i in range(num_dividend_mult_iteration): current_div_approx = 
dividend_mult(current_div_approx, _current_approx, _vx, _vy, i) # last iteration yerr_last = FMSN(current_div_approx, _vy, _vx) #, clearprevious = True) Attributes.unset_default_rounding_mode() Attributes.unset_default_silent() last_div_approx = FMA(yerr_last, _current_approx, current_div_approx, rounding_mode=ML_GlobalRoundMode) yerr_last.set_attributes(tag="yerr_last", debug=debug_lftolx) pre_result = last_div_approx pre_result.set_attributes(tag="unscaled_div_result", debug=debug_lftolx) if scale_result != None: #result = pre_result * ExponentInsertion(ex) * ExponentInsertion(-ey) scale_factor_0 = Max(Min(scale_result, 950), -950, tag="scale_factor_0", debug=debugd) scale_factor_1 = Max(Min(scale_result - scale_factor_0, 950), -950, tag="scale_factor_1", debug=debugd) scale_factor_2 = scale_result - (scale_factor_1 + scale_factor_0) scale_factor_2.set_attributes(debug=debugd, tag="scale_factor_2") result = ((pre_result * ExponentInsertion(scale_factor_0)) * ExponentInsertion(scale_factor_1) ) * ExponentInsertion(scale_factor_2) else: result = pre_result result.set_attributes(tag="result", debug=debug_lftolx) ext_pre_result = FMA(yerr_last, _current_approx, current_div_approx, precision=ML_DoubleDouble, tag="ext_pre_result", debug=debug_ddtolx) subnormal_pre_result = SpecificOperation( ext_pre_result, ex - ey, precision=self.precision, specifier=SpecificOperation.Subnormalize, tag="subnormal_pre_result", debug=debug_lftolx) sub_scale_factor = ex - ey sub_scale_factor_0 = Max(Min(sub_scale_factor, 950), -950, tag="sub_scale_factor_0", debug=debugd) sub_scale_factor_1 = Max(Min(sub_scale_factor - sub_scale_factor_0, 950), -950, tag="sub_scale_factor_1", debug=debugd) sub_scale_factor_2 = sub_scale_factor - (sub_scale_factor_1 + sub_scale_factor_0) sub_scale_factor_2.set_attributes(debug=debugd, tag="sub_scale_factor_2") #subnormal_result = (subnormal_pre_result * ExponentInsertion(ex, tag ="sr_ex_ei")) * ExponentInsertion(-ey, tag = "sr_ey_ei") subnormal_result = ( 
subnormal_pre_result * ExponentInsertion(sub_scale_factor_0)) * ExponentInsertion( sub_scale_factor_1, tag="sr_ey_ei") * ExponentInsertion(sub_scale_factor_2) subnormal_result.set_attributes(debug=debug_lftolx, tag="subnormal_result") return result, subnormal_result, _current_approx, inv_iteration_list def bit_match(fp_optree, bit_id, likely=False, **kwords): return NotEqual(BitLogicAnd( TypeCast(fp_optree, precision=ML_Int64), 1 << bit_id), 0, likely=likely, **kwords) def extract_and_inject_sign(sign_source, sign_dest, int_precision=ML_Int64, fp_precision=self.precision, **kwords): int_sign_dest = sign_dest if isinstance( sign_dest.get_precision(), ML_Fixed_Format) else TypeCast( sign_dest, precision=int_precision) return TypeCast(BitLogicOr( BitLogicAnd(TypeCast(sign_source, precision=int_precision), 1 << (self.precision.bit_size - 1)), int_sign_dest), precision=fp_precision) x_zero = Test(vx, specifier=Test.IsZero, likely=False) y_zero = Test(vy, specifier=Test.IsZero, likely=False) comp_sign = Test(vx, vy, specifier=Test.CompSign, tag="comp_sign", debug=debuglx) y_nan = Test(vy, specifier=Test.IsNaN, likely=False) x_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False) y_snan = Test(vy, specifier=Test.IsSignalingNaN, likely=False) x_inf = Test(vx, specifier=Test.IsInfty, likely=False, tag="x_inf") y_inf = Test(vy, specifier=Test.IsInfty, likely=False, tag="y_inf", debug=debugd) scheme = None gappa_vx, gappa_vy = None, None gappa_init_approx = None gappa_current_approx = None if isinstance(processor, K1B_Processor): print "K1B specific generation" gappa_vx = vx gappa_vy = vy fast_init_approx = DivisionSeed(vx, vy, precision=self.precision, tag="fast_init_approx", debug=debug_lftolx) slow_init_approx = DivisionSeed(scaled_vx, scaled_vy, precision=self.precision, tag="slow_init_approx", debug=debug_lftolx) gappa_init_approx = fast_init_approx specific_case = bit_match(fast_init_approx, 0, tag="b0_specific_case_bit", debug=debugd) y_subnormal_or_zero = 
bit_match(fast_init_approx, 1, tag="b1_y_sub_or_zero", debug=debugd) x_subnormal_or_zero = bit_match(fast_init_approx, 2, tag="b2_x_sub_or_zero", debug=debugd) y_inf_or_nan = bit_match(fast_init_approx, 3, tag="b3_y_inf_or_nan", debug=debugd) inv_underflow = bit_match(fast_init_approx, 4, tag="b4_inv_underflow", debug=debugd) x_inf_or_nan = bit_match(fast_init_approx, 5, tag="b5_x_inf_or_nan", debug=debugd) mult_error_underflow = bit_match(fast_init_approx, 6, tag="b6_mult_error_underflow", debug=debugd) mult_dividend_underflow = bit_match( fast_init_approx, 7, tag="b7_mult_dividend_underflow", debug=debugd) mult_dividend_overflow = bit_match(fast_init_approx, 8, tag="b8_mult_dividend_overflow", debug=debugd) direct_result_flag = bit_match(fast_init_approx, 9, tag="b9_direct_result_flag", debug=debugd) div_overflow = bit_match(fast_init_approx, 10, tag="b10_div_overflow", debug=debugd) # bit11/eb large = bit_match(fast_init_approx, 11) # bit12 = bit_match(fast_init_approx, 11) #slow_result, slow_result_subnormal, _, _ = compute_div(slow_init_approx, scaled_vx, scaled_vy, scale_result = (ExponentInsertion(ex, tag = "eiy_sr"), ExponentInsertion(-ey, tag ="eiy_sr"))) slow_result, slow_result_subnormal, _, _ = compute_div( slow_init_approx, scaled_vx, scaled_vy, scale_result=ex - ey) fast_result, fast_result_subnormal, fast_current_approx, inv_iteration_list = compute_div( fast_init_approx, vx, vy, scale_result=None) gappa_current_approx = fast_current_approx pre_scheme = ConditionBlock( NotEqual(specific_case, 0, tag="specific_case", likely=True, debug=debugd), Return(fast_result), ConditionBlock( Equal(direct_result_flag, 0, tag="direct_result_case"), Return(fast_init_approx), ConditionBlock( x_subnormal_or_zero | y_subnormal_or_zero | inv_underflow | mult_error_underflow | mult_dividend_overflow | mult_dividend_underflow, ConditionBlock( x_zero | y_zero, Return(fast_init_approx), ConditionBlock( Test(slow_result, specifier=Test.IsSubnormal), 
Return(slow_result_subnormal), Return(slow_result)), ), ConditionBlock( x_inf_or_nan, Return(fast_init_approx), ConditionBlock( y_inf_or_nan, Return(fast_init_approx), ConditionBlock( NotEqual(div_overflow, 0, tag="div_overflow_case"), Return( RoundedSignedOverflow( fast_init_approx, tag="signed_inf")), #Return(extract_and_inject_sign(fast_init_approx, FP_PlusInfty(self.precision) , tag = "signed_inf")), Return(FP_SNaN(self.precision)))))))) scheme = Statement(fast_result, pre_scheme) else: print "generic generation" x_inf_or_nan = Test(vx, specifier=Test.IsInfOrNaN, likely=False) y_inf_or_nan = Test(vy, specifier=Test.IsInfOrNaN, likely=False, tag="y_inf_or_nan", debug=debugd) result, subnormal_result, gappa_current_approx, inv_iteration_list = compute_div( current_approx_std, scaled_vx, scaled_vy, scale_result=(ExponentInsertion(ex), ExponentInsertion(-ey))) gappa_vx = scaled_vx gappa_vy = scaled_vy gappa_init_approx = init_approx # x inf and y inf pre_scheme = ConditionBlock( x_inf_or_nan, ConditionBlock( x_inf, ConditionBlock( y_inf_or_nan, Statement( ConditionBlock(y_snan, Raise(ML_FPE_Invalid)), Return(FP_QNaN(self.precision)), ), ConditionBlock(comp_sign, Return(FP_MinusInfty(self.precision)), Return(FP_PlusInfty(self.precision)))), Statement(ConditionBlock(x_snan, Raise(ML_FPE_Invalid)), Return(FP_QNaN(self.precision)))), ConditionBlock( x_zero, ConditionBlock( y_zero | y_nan, Statement( ConditionBlock(y_snan, Raise(ML_FPE_Invalid)), Return(FP_QNaN(self.precision))), Return(vx)), ConditionBlock( y_inf_or_nan, ConditionBlock( y_inf, Return( Select(comp_sign, FP_MinusZero(self.precision), FP_PlusZero(self.precision))), Statement( ConditionBlock(y_snan, Raise(ML_FPE_Invalid)), Return(FP_QNaN(self.precision)))), ConditionBlock( y_zero, Statement( Raise(ML_FPE_DivideByZero), ConditionBlock( comp_sign, Return(FP_MinusInfty(self.precision)), Return(FP_PlusInfty(self.precision)))), ConditionBlock( Test(result, specifier=Test.IsSubnormal, likely=False), Statement( 
ConditionBlock( Comparison( yerr_last, 0, specifier=Comparison.NotEqual, likely=True), Statement( Raise(ML_FPE_Inexact, ML_FPE_Underflow))), Return(subnormal_result), ), Statement( ConditionBlock( Comparison( yerr_last, 0, specifier=Comparison.NotEqual, likely=True), Raise(ML_FPE_Inexact)), Return(result))))))) rnd_mode = GetRndMode() scheme = Statement(rnd_mode, SetRndMode(ML_RoundToNearest), yerr_last, SetRndMode(rnd_mode), pre_result, ClearException(), result, pre_scheme) opt_eng = OptimizationEngine(processor) # fusing FMA if fuse_fma: print "MDL fusing FMA" scheme = opt_eng.fuse_multiply_add(scheme, silence=True) print "MDL abstract scheme" opt_eng.instantiate_abstract_precision(scheme, None) print "MDL instantiated scheme" opt_eng.instantiate_precision(scheme, default_precision=self.precision) print "subexpression sharing" opt_eng.subexpression_sharing(scheme) #print "silencing operation" #opt_eng.silence_fp_operations(scheme) # registering scheme as function implementation exp_implementation.set_scheme(scheme) #print scheme.get_str(depth = None, display_precision = True) # check processor support print "checking processor support" opt_eng.check_processor_support(scheme) # factorizing fast path #opt_eng.factorize_fast_path(scheme) print "Gappa script generation" cg = CCodeGenerator(processor, declare_cst=False, disable_debug=not debug_flag, libm_compliant=libm_compliant) self.result = exp_implementation.get_definition(cg, C_Code, static_cst=True) self.result.add_header("math.h") self.result.add_header("stdio.h") self.result.add_header("inttypes.h") self.result.add_header("support_lib/ml_special_values.h") output_stream = open(output_file, "w") output_stream.write(self.result.get(cg)) output_stream.close() seed_var = Variable("seed", precision=self.precision, interval=Interval(0.5, 1)) cg_eval_error_copy_map = { gappa_init_approx.get_handle().get_node(): seed_var, gappa_vx.get_handle().get_node(): Variable("x", precision=self.precision, interval=Interval(1, 
2)), gappa_vy.get_handle().get_node(): Variable("y", precision=self.precision, interval=Interval(1, 2)), } G1 = Constant(1, precision=ML_Exact) exact = G1 / gappa_vy exact.set_precision(ML_Exact) exact.set_tag("div_exact") gappa_goal = gappa_current_approx.get_handle().get_node() - exact gappa_goal.set_precision(ML_Exact) gappacg = GappaCodeGenerator(target, declare_cst=False, disable_debug=True) gappa_code = gappacg.get_interval_code(gappa_goal, cg_eval_error_copy_map) new_exact_node = exact.get_handle().get_node() for nr in inv_iteration_list: nr.get_hint_rules(gappacg, gappa_code, new_exact_node) seed_wrt_exact = seed_var - new_exact_node seed_wrt_exact.set_precision(ML_Exact) gappacg.add_hypothesis(gappa_code, seed_wrt_exact, Interval(-S2**-7, S2**-7)) try: eval_error = execute_gappa_script_extract( gappa_code.get(gappacg))["goal"] print "eval_error: ", eval_error except: print "error during gappa run"
def __init__(self, precision = ML_Binary32, abs_accuracy = S2**-24, libm_compliant = True, debug_flag = False, fuse_fma = True, num_iter = 3, fast_path_extract = True, target = GenericProcessor(), output_file = "__divsf3.c", function_name = "__divsf3"):
    """Build and emit a correctly-rounded floating-point division routine.

    Constructs a metalibm operation graph implementing x / y via a
    Newton-Raphson reciprocal refinement seeded by DivisionSeed, wraps it
    with IEEE special-case handling (inf/NaN/zero/subnormal), runs the
    optimization pipeline, writes the generated C code to *output_file*,
    and finally evaluates the reciprocal-approximation error with Gappa.

    Parameters (as used by the visible code):
      precision      -- working floating-point format of inputs and result
      abs_accuracy   -- NOTE(review): accepted but never read in this body
      libm_compliant -- forwarded to CCodeGenerator
      debug_flag     -- enables debug output in the generated C code
      fuse_fma       -- if set, runs the fuse_multiply_add optimization pass
      num_iter       -- number of Newton-Raphson reciprocal iterations
      fast_path_extract -- NOTE(review): accepted but never read in this body
      target         -- processor description used for optimization / codegen
      output_file    -- path the generated C source is written to
      function_name  -- name of the emitted C function
    """
    # declaring CodeFunction and retrieving input variable
    self.precision = precision
    self.function_name = function_name
    exp_implementation = CodeFunction(self.function_name, output_format = precision)
    vx = exp_implementation.add_input_variable("x", precision)
    vy = exp_implementation.add_input_variable("y", precision)

    class NR_Iteration(object):
        # One Newton-Raphson reciprocal refinement step:
        #   error      = 1 - divisor * approx
        #   new_approx = approx + error * approx
        # With force_fma the same step is expressed through explicit
        # FusedMultiplyAdd nodes (SubtractNegate computes 1 - d*a).
        def __init__(self, approx, divisor, force_fma = False):
            self.approx = approx
            self.divisor = divisor
            self.force_fma = force_fma
            if force_fma:
                self.error = FusedMultiplyAdd(divisor, approx, 1.0, specifier = FusedMultiplyAdd.SubtractNegate)
                self.new_approx = FusedMultiplyAdd(self.error, self.approx, self.approx, specifier = FusedMultiplyAdd.Standard)
            else:
                self.error = 1 - divisor * approx
                self.new_approx = self.approx + self.error * self.approx

        def get_new_approx(self):
            # Refined reciprocal approximation produced by this step.
            return self.new_approx

        def get_hint_rules(self, gcg, gappa_code, exact):
            # Registers Gappa rewriting hints relating this iteration's
            # nodes to the exact reciprocal, so the Gappa proof can bound
            # the error of new_approx. All hint expressions are built in
            # ML_Exact precision.
            divisor = self.divisor.get_handle().get_node()
            approx = self.approx.get_handle().get_node()
            new_approx = self.new_approx.get_handle().get_node()
            Attributes.set_default_precision(ML_Exact)
            if self.force_fma:
                rule0 = FusedMultiplyAdd(divisor, approx, 1.0, specifier = FusedMultiplyAdd.SubtractNegate)
            else:
                rule0 = 1.0 - divisor * approx
            rule1 = 1.0 - divisor * (approx - exact) - 1.0
            rule2 = new_approx - exact
            subrule = approx * (2 - divisor * approx)
            # NR error is quadratic: new - exact ~ (approx - exact)^2 * divisor
            rule3 = (new_approx - subrule) - (approx - exact) * (approx - exact) * divisor
            if self.force_fma:
                new_error = FusedMultiplyAdd(divisor, approx, 1.0, specifier = FusedMultiplyAdd.SubtractNegate)
                rule4 = FusedMultiplyAdd(new_error, approx, approx)
            else:
                rule4 = approx + (1 - divisor * approx) * approx
            Attributes.unset_default_precision()
            # registering hints
            gcg.add_hint(gappa_code, rule0, rule1)
            gcg.add_hint(gappa_code, rule2, rule3)
            gcg.add_hint(gappa_code, subrule, rule4)

    # debug display helpers (format strings for the generated C printfs)
    debugf = ML_Debug(display_format = "%f")
    debuglf = ML_Debug(display_format = "%lf")
    debugx = ML_Debug(display_format = "%x")
    debuglx = ML_Debug(display_format = "%lx")
    debugd = ML_Debug(display_format = "%d")
    debug_lftolx = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v)
    debug_ddtolx = ML_Debug(display_format = "%\"PRIx64\" %\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" % (v, v))
    debug_dd = ML_Debug(display_format = "{.hi=%lf, .lo=%lf}", pre_process = lambda v: "%s.hi, %s.lo" % (v, v))

    # exponent extraction (clamped to 1020) and scaling of both operands
    # towards [1, 2) so the seed/NR iteration works on normalized values
    ex = Min(ExponentExtraction(vx, tag = "ex", debug = debugd), 1020)
    ey = Min(ExponentExtraction(vy, tag = "ey", debug = debugd), 1020)

    scaling_factor_x = ExponentInsertion(-ex) #ConditionalAllocation(Abs(ex) > 100, -ex, 0)
    scaling_factor_y = ExponentInsertion(-ey) #ConditionalAllocation(Abs(ey) > 100, -ey, 0)

    scaled_vx = vx * scaling_factor_x
    scaled_vy = vy * scaling_factor_y
    scaled_vx.set_attributes(debug = debug_lftolx, tag = "scaled_vx")
    scaled_vy.set_attributes(debug = debug_lftolx, tag = "scaled_vy")

    # seed is computed in ML_Binary32; for wider precisions the operands
    # are first converted down, and the seed is converted back up
    px = Conversion(scaled_vx, precision = ML_Binary32, tag = "px", debug=debugf) if self.precision != ML_Binary32 else vx
    py = Conversion(scaled_vy, precision = ML_Binary32, tag = "py", debug=debugf) if self.precision != ML_Binary32 else vy

    pre_init_approx = DivisionSeed(px, py, precision = ML_Binary32, tag = "seed", debug = debugf)
    init_approx = Conversion(pre_init_approx, precision = self.precision, tag = "seedd", debug = debug_lftolx) if self.precision != ML_Binary32 else pre_init_approx

    current_approx = init_approx
    # correctly-rounded inverse computation
    num_iteration = num_iter

    inv_iteration_list = []

    # NR iterations are built under round-to-nearest with silent exceptions
    Attributes.set_default_rounding_mode(ML_RoundToNearest)
    Attributes.set_default_silent(True)

    # only the last iteration uses explicit FMA nodes
    for i in range(num_iteration):
        new_iteration = NR_Iteration(current_approx, scaled_vy, force_fma = False if (i != num_iteration - 1) else True)
        inv_iteration_list.append(new_iteration)
        current_approx = new_iteration.get_new_approx()
        current_approx.set_attributes(tag = "iter_%d" % i, debug = debug_lftolx)

    def dividend_mult(div_approx, inv_approx, dividend, divisor, index, force_fma = False):
        # One correction step on the quotient approximation:
        # refine div_approx towards dividend/divisor using the reciprocal
        # approximation inv_approx.
        yerr = dividend - div_approx * divisor
        #yerr = FMSN(div_approx, divisor, dividend)
        yerr.set_attributes(tag = "yerr%d" % index, debug = debug_lftolx)
        new_div = div_approx + yerr * inv_approx
        #new_div = FMA(yerr, inv_approx, div_approx)
        new_div.set_attributes(tag = "new_div%d" % index, debug = debug_lftolx)
        return new_div

    # multiplication correction iteration
    # to get correctly rounded full division
    current_approx.set_attributes(tag = "final_approx", debug = debug_lftolx)
    current_div_approx = scaled_vx * current_approx
    num_dividend_mult_iteration = 1
    for i in range(num_dividend_mult_iteration):
        current_div_approx = dividend_mult(current_div_approx, current_approx, scaled_vx, scaled_vy, i)

    # last iteration
    yerr_last = FMSN(current_div_approx, scaled_vy, scaled_vx) #, clearprevious = True)
    # restore the caller's default rounding/silent attributes: the final
    # FMA below must round in the ambient mode
    Attributes.unset_default_rounding_mode()
    Attributes.unset_default_silent()

    last_div_approx = FMA(yerr_last, current_approx, current_div_approx)

    yerr_last.set_attributes(tag = "yerr_last", debug = debug_lftolx)

    pre_result = last_div_approx
    pre_result.set_attributes(tag = "unscaled_div_result", debug = debug_lftolx)
    # undo the operand scaling: multiply by 2^ex * 2^-ey
    result = pre_result * ExponentInsertion(ex) * ExponentInsertion(-ey)
    result.set_attributes(tag = "result", debug = debug_lftolx)

    # special-value predicates on the raw (unscaled) inputs
    x_inf_or_nan = Test(vx, specifier = Test.IsInfOrNaN, likely = False)
    y_inf_or_nan = Test(vy, specifier = Test.IsInfOrNaN, likely = False, tag = "y_inf_or_nan", debug = debugd)
    comp_sign = Test(vx, vy, specifier = Test.CompSign, tag = "comp_sign", debug = debuglx )
    x_zero = Test(vx, specifier = Test.IsZero, likely = False)
    y_zero = Test(vy, specifier = Test.IsZero, likely = False)
    y_nan = Test(vy, specifier = Test.IsNaN, likely = False)
    x_snan = Test(vx, specifier = Test.IsSignalingNaN, likely = False)
    y_snan = Test(vy, specifier = Test.IsSignalingNaN, likely = False)
    x_inf = Test(vx, specifier = Test.IsInfty, likely = False, tag = "x_inf")
    y_inf = Test(vy, specifier = Test.IsInfty, likely = False, tag = "y_inf", debug = debugd)

    # determining an extended precision
    ext_precision_map = {
        ML_Binary32: ML_Binary64,
        ML_Binary64: ML_DoubleDouble,
    }
    ext_precision = ext_precision_map[self.precision]

    # extended-precision version of the final FMA, used to produce a
    # correctly subnormalized result on the underflow path
    ext_pre_result = FMA(yerr_last, current_approx, current_div_approx, precision = ext_precision, tag = "ext_pre_result", debug = debug_ddtolx)

    subnormal_result = None
    if isinstance(ext_precision, ML_Compound_FP_Format):
        # compound format (e.g. double-double): dedicated Subnormalize op
        subnormal_pre_result = SpecificOperation(ext_pre_result, ex - ey, precision = self.precision, specifier = SpecificOperation.Subnormalize, tag = "subnormal_pre_result", debug = debug_lftolx)
        subnormal_result = (subnormal_pre_result * ExponentInsertion(ex)) * ExponentInsertion(-ey)
    else:
        # native wider format: rescale in extended precision, then convert
        subnormal_result = Conversion(ext_pre_result * ExponentInsertion(ex - ey, tag = "final_scaling_factor", precision = ext_precision), precision = self.precision)

    # x inf and y inf
    # Special-case decision tree (IEEE-754 style division semantics as
    # encoded below): inf/NaN operands, signed zeros, divide-by-zero,
    # then the subnormal-vs-normal result paths with inexact/underflow
    # flag raising driven by yerr_last != 0.
    pre_scheme = ConditionBlock(x_inf_or_nan,
        ConditionBlock(x_inf,
            ConditionBlock(y_inf_or_nan,
                Statement(
                    ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                    Return(FP_QNaN(self.precision)),
                ),
                ConditionBlock(comp_sign, Return(FP_MinusInfty(self.precision)), Return(FP_PlusInfty(self.precision)))
            ),
            Statement(
                ConditionBlock(x_snan, Raise(ML_FPE_Invalid)),
                Return(FP_QNaN(self.precision))
            )
        ),
        ConditionBlock(x_zero,
            ConditionBlock(y_zero | y_nan,
                Statement(
                    ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                    Return(FP_QNaN(self.precision))
                ),
                Return(vx)
            ),
            ConditionBlock(y_inf_or_nan,
                ConditionBlock(y_inf,
                    Return(Select(comp_sign, FP_MinusZero(self.precision), FP_PlusZero(self.precision))),
                    Statement(
                        ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                        Return(FP_QNaN(self.precision))
                    )
                ),
                ConditionBlock(y_zero,
                    Statement(
                        Raise(ML_FPE_DivideByZero),
                        ConditionBlock(comp_sign,
                            Return(FP_MinusInfty(self.precision)),
                            Return(FP_PlusInfty(self.precision))
                        )
                    ),
                    ConditionBlock(Test(result, specifier = Test.IsSubnormal, likely = False),
                        Statement(
                            ConditionBlock(Comparison(yerr_last, 0, specifier = Comparison.NotEqual, likely = True),
                                Statement(Raise(ML_FPE_Inexact, ML_FPE_Underflow))
                            ),
                            Return(subnormal_result),
                        ),
                        Statement(
                            ConditionBlock(Comparison(yerr_last, 0, specifier = Comparison.NotEqual, likely = True),
                                Raise(ML_FPE_Inexact)
                            ),
                            Return(result)
                        )
                    )
                )
            )
        )
    )

    # the residual/result are evaluated under round-to-nearest, then the
    # caller's rounding mode is restored before the final result rounding
    rnd_mode = GetRndMode()
    scheme = Statement(rnd_mode, SetRndMode(ML_RoundToNearest), yerr_last, SetRndMode(rnd_mode), pre_result, ClearException(), result, pre_scheme)

    processor = target
    opt_eng = OptimizationEngine(processor)

    # fusing FMA
    if fuse_fma:
        print "MDL fusing FMA"
        scheme = opt_eng.fuse_multiply_add(scheme, silence = True)

    print "MDL abstract scheme"
    opt_eng.instantiate_abstract_precision(scheme, None)

    print "MDL instantiated scheme"
    opt_eng.instantiate_precision(scheme, default_precision = self.precision)

    print "subexpression sharing"
    opt_eng.subexpression_sharing(scheme)

    #print "silencing operation"
    #opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    exp_implementation.set_scheme(scheme)
    #print scheme.get_str(depth = None, display_precision = True)

    # check processor support
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    #opt_eng.factorize_fast_path(scheme)

    # C code emission to output_file
    cg = CCodeGenerator(processor, declare_cst = False, disable_debug = not debug_flag, libm_compliant = libm_compliant)
    self.result = exp_implementation.get_definition(cg, C_Code, static_cst = True)
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")
    self.result.add_header("support_lib/ml_special_values.h")
    output_stream = open(output_file, "w")
    output_stream.write(self.result.get(cg))
    output_stream.close()

    # Gappa error evaluation: substitute symbolic interval variables for
    # the seed and the scaled operands (operands assumed in [1, 2] after
    # scaling, seed in [0.5, 1])
    seed_var = Variable("seed", precision = self.precision, interval = Interval(0.5, 1))
    cg_eval_error_copy_map = {
        init_approx.get_handle().get_node(): seed_var,
        scaled_vx.get_handle().get_node(): Variable("x", precision = self.precision, interval = Interval(1, 2)),
        scaled_vy.get_handle().get_node(): Variable("y", precision = self.precision, interval = Interval(1, 2)),
    }
    G1 = Constant(1, precision = ML_Exact)
    exact = G1 / scaled_vy
    exact.set_precision(ML_Exact)
    exact.set_tag("div_exact")
    # goal: error of the final reciprocal approximation w.r.t. 1/y
    gappa_goal = current_approx.get_handle().get_node() - exact
    gappa_goal.set_precision(ML_Exact)
    gappacg = GappaCodeGenerator(target, declare_cst = False, disable_debug = True)
    gappa_code = gappacg.get_interval_code(gappa_goal, cg_eval_error_copy_map)

    new_exact_node = exact.get_handle().get_node()

    # per-iteration rewriting hints (see NR_Iteration.get_hint_rules)
    for nr in inv_iteration_list:
        nr.get_hint_rules(gappacg, gappa_code, new_exact_node)

    # hypothesis: the hardware seed is within 2^-7 of the exact reciprocal
    # TODO confirm against the target's DivisionSeed accuracy
    seed_wrt_exact = seed_var - new_exact_node
    seed_wrt_exact.set_precision(ML_Exact)
    gappacg.add_hypothesis(gappa_code, seed_wrt_exact, Interval(-S2**-7, S2**-7))

    eval_error = execute_gappa_script_extract(gappa_code.get(gappacg))["goal"]
    print "eval_error: ", eval_error
def generate_scheme(self):
    """Build and return the metalibm operation graph for log1p(x).

    The scheme uses a table-driven argument reduction on vxp1 = x + 1
    (reciprocal seed truncated to index a 2^7-entry log table) followed by
    a Horner polynomial evaluation, with dedicated paths for inputs close
    to zero (direct polynomial on x), subnormal inputs (returned as-is),
    negative inputs below -1 (invalid -> QNaN) and inf/NaN handling.

    Returns: the root operation node (a ConditionBlock) of the scheme.
    """
    vx = self.implementation.add_input_variable("x", self.precision)
    sollya_precision = self.get_input_precision().sollya_object

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # log(2) split into a high part (truncated to leave headroom for
    # exact multiplication by the exponent) and a low compensation part
    log2_hi_value = round(log(2), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
    log2_lo_value = round(log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

    log2_hi = Constant(log2_hi_value, precision = self.precision)
    log2_lo = Constant(log2_lo_value, precision = self.precision)

    vx_exp = ExponentExtraction(vx, tag = "vx_exp", debug = debugd)

    int_precision = self.precision.get_integer_format()

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision = self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision = self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(dummy_div_seed, language = None, table_getter = lambda self: self.approx_table_map)

    # table creation
    # log_table[i] = (hi, lo) parts of log(inv_approx_table[i]);
    # entry 0 is reserved (0.0, 0.0) for the dirty-inverse fallback
    table_index_size = 7
    log_table = ML_NewTable(dimensions = [2**table_index_size, 2], storage_precision = self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = inv_approx_table[i] # (1.0 + (inv_approx_table[i] / S2**9) ) * S2**-1
        value_high = round(log(inv_value), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    # NOTE(review): duplicate of the vx_exp extraction above
    vx_exp = ExponentExtraction(vx, tag = "vx_exp", debug = debugd)

    # case close to 0: ctz
    # for |x| < 2^-7, evaluate log1p(x)/x directly as a polynomial in x
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

    ctz_poly_degree = sup(guessdegree(log1p(sollya.x)/sollya.x, ctz_interval, S2**-(self.precision.get_field_size()+1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(log1p(sollya.x)/sollya.x, ctz_poly_degree, [self.precision]*(ctz_poly_degree+1), ctz_interval, sollya.absolute)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(ctz_poly_object, vx, unified_precision = self.precision)
    ctz_poly.set_attributes(tag = "ctz_poly", debug = debug_lftolx)

    ctz_result = vx * ctz_poly

    # special-input predicates
    neg_input = Comparison(vx, -1, likely = False, specifier = Comparison.Less, debug = debugd, tag = "neg_input")
    vx_nan_or_inf = Test(vx, specifier = Test.IsInfOrNaN, likely = False, debug = debugd, tag = "nan_or_inf")
    vx_snan = Test(vx, specifier = Test.IsSignalingNaN, likely = False, debug = debugd, tag = "snan")
    vx_inf = Test(vx, specifier = Test.IsInfty, likely = False, debug = debugd, tag = "inf")
    vx_subnormal = Test(vx, specifier = Test.IsSubnormal, likely = False, debug = debugd, tag = "vx_subnormal")

    # prototype/operator for an external "new_log" helper
    # NOTE(review): newlog_function is built but not used in this body
    log_function_code = CodeFunction("new_log", [Variable("x", precision = ML_Binary64)], output_format = ML_Binary64)
    log_call_generator = FunctionOperator(log_function_code.get_name(), arity = 1, output_precision = ML_Binary64, declare_prototype = log_function_code)
    newlog_function = FunctionObject(log_function_code.get_name(), (ML_Binary64,), ML_Binary64, log_call_generator)

    # case away from 0.0
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag = "pre_vxp1", debug = debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag = "pre_vxp1_exp", debug = debugd)
    # very large x + 1 is pre-scaled by 2^-500 to avoid overflow in the
    # reduction; the scaling is compensated in corr_exp below
    cm500 = Constant(-500, precision = ML_Int32)
    c0 = Constant(0, precision = ML_Int32)
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size()-2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(scaling_factor_exp, precision = self.precision, tag = "scaling_factor")

    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag = "vxp1", debug = debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag = "vxp1_exp", debug = debugd)

    vxp1_inv = ReciprocalSeed(vxp1, precision = self.precision, tag = "vxp1_inv", debug = debug_lftolx, silent = True)

    # fallback "dirty" inverse 2^-e(vxp1), used when the table index is 0
    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp, precision = self.precision, tag = "vxp1_dirty_inv", debug = debug_lftolx)

    # top 7 mantissa bits of vxp1 select the log-table entry
    table_index = BitLogicAnd(BitLogicRightShift(TypeCast(vxp1, precision = int_precision, debug = debuglx), self.precision.get_field_size() - 7, debug = debuglx), 0x7f, tag = "table_index", debug = debuglx)

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    # clear the seed's lowest mantissa bit so the reduced argument is exact
    pre_arg_red_index = TypeCast(BitLogicAnd(TypeCast(vxp1_inv, precision = ML_UInt64), Constant(-2, precision = ML_UInt64), precision = ML_UInt64), precision = self.precision, tag = "pre_arg_red_index", debug = debug_lftolx)

    arg_red_index = Select(Equal(table_index, 0), vxp1_dirty_inv, pre_arg_red_index, tag = "arg_red_index", debug = debug_lftolx)

    # reduced argument r = inv * (x + 1) - 1; the unscaled branch expands
    # it as (inv * x - 1) + inv to keep the computation exact
    red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0, (arg_red_index * vx - 1.0) + arg_red_index)
    #red_vxp1 = arg_red_index * vxp1 - 1.0
    red_vxp1.set_attributes(tag = "red_vxp1", debug = debug_lftolx)

    log_inv_lo = TableLoad(log_table, table_index, 1, tag = "log_inv_lo", debug = debug_lftolx)
    log_inv_hi = TableLoad(log_table, table_index, 0, tag = "log_inv_hi", debug = debug_lftolx)

    inv_err = S2**-6 # TODO: link to target DivisionSeed precision

    Log.report(Log.Info, "building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(guessdegree(log(1+sollya.x)/sollya.x, approx_interval, S2**-(self.precision.get_field_size()+1))) + 1
    global_poly_object = Polynomial.build_from_approximation(log(1+sollya.x)/sollya.x, poly_degree, [self.precision]*(poly_degree+1), approx_interval, sollya.absolute)
    # drop the degree-0 term: it is added back explicitly as red_vxp1
    poly_object = global_poly_object.sub_poly(start_index = 1)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, red_vxp1, unified_precision = self.precision)
    _poly.set_attributes(tag = "poly", debug = debug_lftolx)
    Log.report(Log.Info, global_poly_object.get_sollya_object())

    # exponent correction: log1p(x) = log(vxp1) - e * log(2), with e the
    # combined binary exponent and pre-scaling compensation
    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag = "vxp1_inv_exp", debug = debugd)
    corr_exp = Conversion(-vxp1_exp + scaling_factor_exp, precision = self.precision)# vxp1_inv_exp

    #poly = (red_vxp1) * (1 + _poly)
    #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly + (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag = "pre_result", debug = debug_lftolx)
    exact_log2_hi_exp = - corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag = "exact_log2_hi_exp", debug = debug_lftolx, prevent_optimization = True)
    #std_result = exact_log2_hi_exp + pre_result

    exact_log2_lo_exp = - corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(tag = "exact_log2_lo_exp", debug = debug_lftolx)#, prevent_optimization = True)

    # final summation, ordered smallest-to-largest terms and pinned with
    # prevent_optimization to preserve the accuracy-critical ordering
    init = exact_log2_lo_exp - log_inv_lo
    init.set_attributes(tag = "init", debug = debug_lftolx, prevent_optimization = True)
    fma0 = (red_vxp1 * _poly + init) # - log_inv_lo)
    fma0.set_attributes(tag = "fma0", debug = debug_lftolx)
    step0 = fma0
    step0.set_attributes(tag = "step0", debug = debug_lftolx) #, prevent_optimization = True)
    step1 = step0 + red_vxp1
    step1.set_attributes(tag = "step1", debug = debug_lftolx, prevent_optimization = True)
    step2 = -log_inv_hi + step1
    step2.set_attributes(tag = "step2", debug = debug_lftolx, prevent_optimization = True)
    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag = "std_result", debug = debug_lftolx, prevent_optimization = True)

    # main scheme
    # x < -1 -> invalid/QNaN; +inf -> +inf; NaN -> QNaN (invalid if sNaN);
    # subnormal x -> x; |x| < 2^-7 -> ctz path; otherwise -> table path
    Log.report(Log.Info, "MDL scheme")
    pre_scheme = ConditionBlock(neg_input,
        Statement(
            ClearException(),
            Raise(ML_FPE_Invalid),
            Return(FP_QNaN(self.precision))
        ),
        ConditionBlock(vx_nan_or_inf,
            ConditionBlock(vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(
                    ClearException(),
                    ConditionBlock(vx_snan,
                        Raise(ML_FPE_Invalid)
                    ),
                    Return(FP_QNaN(self.precision))
                )
            ),
            ConditionBlock(vx_subnormal,
                Return(vx),
                ConditionBlock(ctz_cond,
                    Statement(
                        Return(ctz_result),
                    ),
                    Statement(
                        Return(std_result)
                    )
                )
            )
        )
    )
    scheme = pre_scheme
    return scheme