def generate_tensor_check_loop(self, tensor_descriptors, input_tables, output_tables):
    """Build the statement which checks every output tensor entry by entry.

    For each output tensor descriptor one loop is generated. The loop walks
    the index ``j`` from 0 up to the descriptor bounding size, evaluates
    ``self.accuracy.get_output_check_test`` on the computed entry and its
    expected value(s) and, when that test fires, executes an error
    statement which prints the faulty entry and the expected values.

    When ``self.break_error`` is false the error statement also returns the
    constant 1 from the generated function; when it is true the generated
    loop carries on after printing.

    :param tensor_descriptors: pair
        (input tensor descriptor list, output tensor descriptor list)
    :param input_tables: input tables, forwarded to
        ``self.generate_expected_table``
    :param output_tables: tables holding the results to check, indexed like
        the output tensor descriptor list
    :return: a Statement containing one check loop per output tensor
    """
    # unpack tensor descriptors tuple (the input descriptors are not needed
    # here, but the unpacking still enforces the pair shape)
    _, output_tensor_descriptor_list = tensor_descriptors

    # internal array iterator index
    vj = Variable("j", precision=ML_UInt32, var_type=Variable.Local)

    # NOTE(review): the error printer is derived from the first output
    # descriptor only and reused for every output tensor -- confirm this is
    # intended when outputs have different formats
    printf_error_detail_function = self.get_printf_error_detail_fct(
        output_tensor_descriptor_list[0])

    # generate the expected table for the whole multi-array
    expected_tables = self.generate_expected_table(tensor_descriptors, input_tables)

    # global statement to list all checks
    check_statement = Statement()

    # implement check for each output tensor
    for out_id, out_td in enumerate(output_tensor_descriptor_list):
        # expected values for the (vj)-th entry of the sub-array
        expected_values = [
            TableLoad(expected_tables[out_id], vj, i)
            for i in range(self.accuracy.get_num_output_value())
        ]
        # local result for the (vj)-th entry of the sub-array
        local_result = TableLoad(output_tables[out_id], vj)

        array_len = out_td.get_bounding_size()

        # statements executed when the check test detects a mismatch
        # (the original break_error branch referenced an undefined
        # `output_values` name; both branches must print expected_values)
        error_operations = [
            printf_error_detail_function(vj, local_result),
            self.accuracy.get_output_print_call(
                self.function_name, expected_values),
        ]
        if not self.break_error:
            error_operations.append(Return(Constant(1, precision=ML_Int32)))
        return_statement_break = Statement(*error_operations)

        check_array_loop = Loop(
            ReferenceAssign(vj, 0),
            vj < array_len,
            Statement(
                ConditionBlock(
                    self.accuracy.get_output_check_test(
                        local_result, expected_values),
                    return_statement_break),
                ReferenceAssign(vj, vj + 1),
            ))
        check_statement.add(check_array_loop)
    return check_statement
def simplify_inverse(optree, processor):
    """Turn a seed node into a direct load from the processor seed table.

    A throw-away DivisionSeed node is built only to ask ``processor`` for
    the approximation table it associates with that operation. The result
    is a TableLoad indexed by the table's own index function applied to the
    first input of ``optree``, wrapped into a Conversion when the table
    storage format differs from the format of ``optree``.

    :param optree: node to simplify (its first input is the seed operand)
    :param processor: target object providing
        ``get_recursive_implementation``
    :return: the TableLoad node, or a Conversion of it
    """
    result_format = optree.get_precision()

    # probe node used solely to look up the approximation table
    probe_operand = Variable("dummy_var_seed", precision=result_format)
    probe_seed = DivisionSeed(probe_operand, precision=result_format)
    approx_table = processor.get_recursive_implementation(
        probe_seed,
        language=None,
        table_getter=lambda self: self.approx_table_map)

    operand = optree.inputs[0]
    column_zero = Constant(0, precision=ML_Int32)

    if result_format == approx_table.get_storage_precision():
        # same format: the loaded value can be used as is
        return TableLoad(
            approx_table,
            approx_table.get_index_function()(operand),
            column_zero,
            precision=result_format)

    # different format: load in the storage format, then convert
    stored_value = TableLoad(
        approx_table,
        approx_table.get_index_function()(operand),
        column_zero,
        precision=approx_table.get_storage_precision())
    return Conversion(stored_value, precision=result_format)
def legalize_invsqrt_seed(optree):
    """Legalize a ReciprocalSquareRootSeed node into explicit operations.

    The seed is evaluated in ML_Binary32 as the product of three factors:
    a value loaded from ``invsqrt_approx_table``, a power of two built from
    the negated, halved exponent of the input, and a correction selected on
    the parity of that exponent. The input is converted to ML_Binary32
    first, and the result converted back, when the node format differs.

    :param optree: ReciprocalSquareRootSeed node to legalize
    :return: the node computing the seed, in the format of ``optree``
    """
    assert isinstance(optree, ReciprocalSquareRootSeed)
    result_format = optree.get_precision()
    seed_format = ML_Binary32

    # input = 1.m_hi-m_lo * 2^e
    # approx = 2^(-int(e/2)) * approx_insqrt(1.m_hi) * (e % 2 ? 1.0 : ~2**-0.5)
    seed_input = optree.get_input(0)
    needs_conversion = result_format != seed_format
    if needs_conversion:
        seed_input = Conversion(seed_input, precision=ML_Binary32)

    # TODO: fix integer precision selection
    #       as we are in a late code generation stage, every node's precision
    #       must be set
    exponent = ExponentExtraction(
        seed_input, tag="op_exp", debug=debug_multi, precision=ML_Int32)
    negated_exponent = Negation(exponent, precision=ML_Int32)
    half_negated_exponent = Division(
        negated_exponent, Constant(2, precision=ML_Int32), precision=ML_Int32)
    power_of_two = ExponentInsertion(
        half_negated_exponent, tag="approx_exp", debug=debug_multi,
        precision=seed_format)

    # parity-driven correction: 1.0 for parity 0, 2**0.5 for parity -1,
    # 2**-0.5 otherwise
    parity = Modulo(
        exponent, Constant(2, precision=ML_Int32), precision=ML_Int32)
    parity_is_zero = Equal(parity, Constant(0, precision=ML_Int32))
    unit_factor = Constant(1.0, precision=seed_format)
    odd_factor = Select(
        Equal(parity, Constant(-1, precision=ML_Int32)),
        Constant(S2**0.5, precision=seed_format),
        Constant(S2**-0.5, precision=seed_format),
        precision=seed_format)
    parity_correction = Select(
        parity_is_zero, unit_factor, odd_factor,
        precision=seed_format, tag="approx_exp_correction", debug=debug_multi)

    lookup_index = invsqrt_approx_table.get_index_function()(seed_input)
    lookup_index.set_attributes(tag="invsqrt_index", debug=debug_multi)

    tabulated_seed = TableLoad(
        invsqrt_approx_table, lookup_index, precision=seed_format)
    scaling = Multiplication(
        parity_correction, power_of_two, precision=seed_format)
    seed = Multiplication(
        tabulated_seed, scaling,
        tag="invsqrt_approx", debug=debug_multi, precision=seed_format)

    if needs_conversion:
        return Conversion(seed, precision=result_format)
    return seed
def generate_scheme(self):
    """Build a scheme returning a BFloat16 table entry converted to binary32.

    A table of ``self.table_size`` BFloat16 entries is filled with the
    successive powers of 1.1, the entry indexed by the function input ``x``
    is loaded, converted to ML_Binary32 and returned.

    :return: the Return node of the scheme
    """
    # declaring function input variable
    vx = self.implementation.add_input_variable(
        "x", self.get_input_precision(0))

    entry_count = self.table_size
    bf16_params = ML_NewTable(
        dimensions=[entry_count], storage_precision=BFloat16)
    for entry_index in range(entry_count):
        bf16_params[entry_index] = 1.1**entry_index

    loaded_entry = TableLoad(bf16_params, vx)
    converted_entry = Conversion(
        loaded_entry, precision=ML_Binary32, tag="conv_vx", debug=debug_multi)

    return Return(converted_entry, precision=self.precision, debug=debug_multi)
def legalize_reciprocal_seed(optree):
    """Legalize a ReciprocalSeed node into explicit operations.

    The seed is evaluated in the storage format of
    ``generic_inv_approx_table`` as the product of a value loaded from that
    table, a power of two built from the negated exponent of the input, and
    a CopySign node built from the input and the constant 1.0. The input is
    converted to the table format first, and the result converted back,
    when the node format differs from it.

    :param optree: ReciprocalSeed node to legalize
    :return: the node computing the seed, in the format of ``optree``
    """
    assert isinstance(optree, ReciprocalSeed)
    result_format = optree.get_precision()
    operand = optree.get_input(0)

    table_format = generic_inv_approx_table.get_storage_precision()
    must_convert = result_format != table_format
    if must_convert:
        operand = Conversion(operand, precision=table_format)
        work_format = table_format
    else:
        work_format = result_format

    # input  = s * 1.m * 2^e
    # approx = (table value for the input) * 2^(-e) * sign factor
    # TODO: fix integer precision selection
    #       as we are in a late code generation stage, every node's precision
    #       must be set
    integer_format = work_format.get_integer_format()

    # presumably 1.0 carrying the sign of the input -- confirm against the
    # CopySign operand convention
    sign_factor = CopySign(
        operand, Constant(1.0, precision=work_format), precision=work_format)

    exponent = ExponentExtraction(
        operand, tag="op_exp", debug=debug_multi, precision=integer_format)
    negated_exponent = Negation(exponent, precision=integer_format)
    power_of_two = ExponentInsertion(
        negated_exponent, tag="approx_exp", debug=debug_multi,
        precision=work_format)

    lookup_index = generic_inv_approx_table.get_index_function()(operand)
    lookup_index.set_attributes(tag="inv_index", debug=debug_multi)

    tabulated_seed = TableLoad(
        generic_inv_approx_table, lookup_index, precision=work_format)
    scaling = Multiplication(power_of_two, sign_factor, precision=work_format)
    seed = Multiplication(
        tabulated_seed, scaling,
        tag="inv_approx", debug=debug_multi, precision=work_format)

    if must_convert:
        return Conversion(seed, precision=result_format)
    return seed
def expand_accessor(accessor):
    """Expand an accessor node into a table load or a table store.

    A ReadAccessor becomes a TableLoad, a WriteAccessor becomes a
    TableStore of its expanded value expression. In both cases the tensor
    base buffer is addressed through the linearized offset computed by the
    tensor descriptor from the accessor index expression.

    :param accessor: ReadAccessor or WriteAccessor node
    :return: the matching TableLoad / TableStore node
    :raises NotImplementedError: for any other kind of node
    """
    if isinstance(accessor, ReadAccessor):
        # check dimensionality: the number of sub-indexes in ReadAccessor's
        # index_expr must match the dimensionality of ReadAccessor's
        # tensor descriptor
        read_tensor = accessor.tensor
        return TableLoad(
            read_tensor.base_buffer,
            read_tensor.descriptor.generate_linearized_offset(
                accessor.index_expr),
            precision=accessor.value_format)

    if isinstance(accessor, WriteAccessor):
        stored_value = expand_kernel_expr(accessor.value_expr)
        written_tensor = accessor.tensor
        return TableStore(
            stored_value,
            written_tensor.base_buffer,
            written_tensor.descriptor.generate_linearized_offset(
                accessor.index_expr),
            precision=ML_Void,
        )

    raise NotImplementedError
def piecewise_approximation(function,
                            variable,
                            precision,
                            bound_low=-1.0,
                            bound_high=1.0,
                            num_intervals=16,
                            max_degree=2,
                            error_threshold=S2**-24,
                            odd=False,
                            even=False):
    """ Generate a piecewise approximation

        The interval [bound_low, bound_high] is split into num_intervals
        sub-intervals of equal size. One polynomial per sub-interval is
        tabulated (max_degree + 1 coefficients per row) and the returned
        scheme selects the row from the value of variable before evaluating
        the polynomial with a Horner scheme.

        :param function: function to be approximated
        :type function: SollyaObject
        :param variable: input variable
        :type variable: Variable
        :param precision: variable's format
        :type precision: ML_Format
        :param bound_low: lower bound for the approximation interval
        :param bound_high: upper bound for the approximation interval
        :param num_intervals: number of sub-interval / sub-division of the
            main interval
        :param max_degree: maximum degree for an approximation on any
            sub-interval; when None the largest per-interval degree yielded
            by piecewise_approximation_degree_generator is used
        :param error_threshold: error bound for an approximation on any
            sub-interval
        :param odd: restrict the monomial list to every other degree
            starting from 0, only used on a sub-interval whose lower bound
            is a zero of function (where function(x) / x is approximated)
        :param even: restrict the monomial list to every other degree
            starting from 1, only used in that same special case

        :return: pair (scheme, error) where scheme is a graph node for an
            approximation scheme of function evaluated at variable, and
            error is the maximum approximation error encountered
        :rtype: tuple(ML_Operation, SollyaObject)
    """
    degree_generator = piecewise_approximation_degree_generator(
        function, bound_low, bound_high,
        num_intervals=num_intervals,
        error_threshold=error_threshold,
    )
    degree_list = list(degree_generator)

    # if max_degree is None then we determine it locally
    if max_degree is None:
        max_degree = max(degree_list)

    # table to store coefficients of the approximation on each segment
    coeff_table = ML_NewTable(
        dimensions=[num_intervals, max_degree + 1],
        storage_precision=precision,
        tag="coeff_table",
        const=True  # by default all approximation coeff table are const
    )

    error_function = lambda p, f, ai, mod, t: sollya.dirtyinfnorm(p - f, ai)
    max_approx_error = 0.0
    interval_size = (bound_high - bound_low) / num_intervals

    for i in range(num_intervals):
        subint_low = bound_low + i * interval_size
        subint_high = bound_low + (i + 1) * interval_size

        # the function is translated so that the polynomial variable is the
        # offset from the lower bound of the sub-interval
        local_function = function(sollya.x + subint_low)
        local_interval = Interval(-interval_size, interval_size)

        local_degree = degree_list[i]
        if local_degree > max_degree:
            Log.report(
                Log.Warning,
                "local_degree {} exceeds max_degree bound ({}) in piecewise_approximation",
                local_degree, max_degree)
        # as max_degree defines the size of the table we can use
        # it as the degree for each sub-interval polynomial
        # as there is nothing to gain (yet) by using a smaller polynomial
        degree = max_degree  # min(max_degree, local_degree)

        if function(subint_low) == 0.0:
            # if the lower bound is a zero to the function, we
            # need to force value=0 for the constant coefficient
            # and extend the approximation interval
            local_poly_degree_list = list(
                range(1 if even else 0, degree + 1, 2 if odd or even else 1))
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                function(sollya.x) / sollya.x,
                local_poly_degree_list,
                [precision] * len(local_poly_degree_list),
                Interval(-subint_high * 0.95, subint_high),
                sollya.absolute,
                error_function=error_function)
            # multiply by sollya.x
            poly_object = poly_object.sub_poly(offset=-1)
        else:
            try:
                poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                    local_function,
                    degree,
                    [precision] * (degree + 1),
                    local_interval,
                    sollya.absolute,
                    error_function=error_function)
            except SollyaError as err:
                # try to see if function is constant on the interval (possible
                # failure cause for fpminmax)
                cst_value = precision.round_sollya_object(
                    function(subint_low), sollya.RN)
                accuracy = error_threshold
                diff_with_cst_range = sollya.supnorm(
                    cst_value, local_function, local_interval,
                    sollya.absolute, accuracy)
                diff_with_cst = sup(abs(diff_with_cst_range))
                if diff_with_cst < error_threshold:
                    Log.report(Log.Info, "constant polynomial detected")
                    poly_object = Polynomial(
                        [function(subint_low)] + [0] * degree)
                    approx_error = diff_with_cst
                else:
                    # the original used `Log.error`, a spelling used nowhere
                    # else in this file (levels are capitalised: Warning,
                    # Info, Debug, Verbose); report through Log.Error so
                    # the failure is raised with its diagnostic message
                    Log.report(
                        Log.Error,
                        "degree: {} for index {}, diff_with_cst={} (vs error_threshold={}) ",
                        degree, i, diff_with_cst, error_threshold,
                        error=err)

        # every row has max_degree + 1 entries, missing monomials are zeros
        for ci in range(max_degree + 1):
            if ci in poly_object.coeff_map:
                coeff_table[i][ci] = poly_object.coeff_map[ci]
            else:
                coeff_table[i][ci] = 0.0

        if approx_error > error_threshold:
            Log.report(
                Log.Warning,
                "piecewise_approximation on index {} exceeds error threshold: {} > {}",
                i, approx_error, error_threshold)
        max_approx_error = max(max_approx_error, abs(approx_error))

    # computing offset
    diff = Subtraction(variable,
                       Constant(bound_low, precision=precision),
                       tag="diff",
                       debug=debug_multi,
                       precision=precision)
    int_prec = precision.get_integer_format()

    # delta = bound_high - bound_low
    delta_ratio = Constant(num_intervals / (bound_high - bound_low),
                           precision=precision)
    # computing table index
    # index = nearestint(diff / delta * <num_intervals>)
    # the index is clamped to [0, num_intervals - 1]
    index = Max(0,
                Min(
                    NearestInteger(
                        Multiplication(diff, delta_ratio, precision=precision),
                        precision=int_prec,
                    ),
                    num_intervals - 1),
                tag="index",
                debug=debug_multi,
                precision=int_prec)
    # polynomial variable: distance between the input and the start of the
    # selected sub-interval
    poly_var = Subtraction(diff,
                           Multiplication(
                               Conversion(index, precision=precision),
                               Constant(interval_size, precision=precision)),
                           precision=precision,
                           tag="poly_var",
                           debug=debug_multi)
    # generating indexed polynomial
    coeffs = [(ci, TableLoad(coeff_table, index, ci))
              for ci in range(max_degree + 1)][::-1]
    poly_scheme = PolynomialSchemeEvaluator.generate_horner_scheme2(
        coeffs, poly_var, precision, {}, precision)
    return poly_scheme, max_approx_error
def generic_poly_split(offset_fct, indexing, target_eps, coeff_precision, vx):
    """ generate the meta approximation for @p offset_fct over several
        intervals defined by @p indexing object

        For each sub-interval, a polynomial approximation with
        maximal_error @p target_eps is tabulated, and evaluated using format
        @p coeff_precision.
        The input variable is @p vx

        @param offset_fct callable mapping a sub-interval offset to the
               function to approximate on that sub-interval
        @param indexing object providing split_num,
               get_offseted_sub_list(), get_offseted_sub_interval() and
               get_index_node()
        @param target_eps error target used to guess the polynomial degree
               on each sub-interval
        @param coeff_precision format of the tabulated coefficients,
               offsets and of the evaluation
        @param vx input variable node
        @return the node evaluating the selected polynomial at
                (vx - offset) with a Horner scheme built from FMA nodes
    """
    # computing degree for a different polynomial approximation on each
    # sub-interval
    poly_degree_list = [
        int(sup(guessdegree(offset_fct(offset), sub_interval, target_eps)))
        for offset, sub_interval in indexing.get_offseted_sub_list()
    ]
    poly_max_degree = max(poly_degree_list)

    # tabulating polynomial coefficients on split_num sub-interval of interval
    poly_table = ML_NewTable(
        dimensions=[indexing.split_num, poly_max_degree + 1],
        storage_precision=coeff_precision,
        const=True)
    offset_table = ML_NewTable(dimensions=[indexing.split_num],
                               storage_precision=coeff_precision,
                               const=True)
    max_error = 0.0

    for sub_index in range(indexing.split_num):
        poly_degree = poly_degree_list[sub_index]
        offset, approx_interval = indexing.get_offseted_sub_interval(sub_index)
        offset_table[sub_index] = offset
        if poly_degree == 0:
            # managing constant approximation separately since it seems
            # to break sollya
            local_approx = coeff_precision.round_sollya_object(
                offset_fct(offset)(inf(approx_interval)))
            poly_table[sub_index][0] = local_approx
            for monomial_index in range(1, poly_max_degree + 1):
                poly_table[sub_index][monomial_index] = 0
            approx_error = sollya.infnorm(
                offset_fct(offset) - local_approx, approx_interval)
        else:
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                offset_fct(offset), poly_degree,
                [coeff_precision] * (poly_degree + 1),
                approx_interval, sollya.relative)

            # rows are padded with zeros up to the maximal degree
            for monomial_index in range(poly_max_degree + 1):
                if monomial_index <= poly_degree:
                    poly_table[sub_index][monomial_index] = \
                        poly_object.coeff_map[monomial_index]
                else:
                    poly_table[sub_index][monomial_index] = 0
        max_error = max(approx_error, max_error)

    Log.report(Log.Debug, "max approx error is {}", max_error)

    # indexing function: derive index from input @p vx value
    poly_index = indexing.get_index_node(vx)
    poly_index.set_attributes(tag="poly_index", debug=debug_multi)

    # building polynomial evaluation scheme
    offset = TableLoad(offset_table, poly_index,
                       precision=coeff_precision,
                       tag="offset", debug=debug_multi)
    # Horner accumulator, initialised with the highest-degree coefficient
    poly = TableLoad(poly_table, poly_index, poly_max_degree,
                     precision=coeff_precision,
                     tag="poly_init", debug=debug_multi)
    red_vx = Subtraction(vx, offset,
                         precision=vx.precision,
                         tag="red_vx", debug=debug_multi)
    # The accumulator already holds the coefficient of degree
    # poly_max_degree, so the Horner steps must start at the next lower
    # degree; starting at poly_max_degree (as the previous code did) applied
    # the leading coefficient twice and evaluated
    # c_n * x**(n+1) + c_n * x**n + ... instead of the tabulated polynomial.
    for monomial_index in range(poly_max_degree - 1, -1, -1):
        coeff = TableLoad(poly_table, poly_index, monomial_index,
                          precision=coeff_precision,
                          tag="poly_%d" % monomial_index,
                          debug=debug_multi)
        # poly <- red_vx * poly + coeff
        poly = FMA(red_vx, poly, coeff, precision=coeff_precision)

    return poly
def generate_scheme(self):
    """Build the fixed-point sine / cosine evaluation scheme.

    The input is converted to a fixed-point format, split into a high part
    (used to index a fused table of tabulated cosine and sine values) and a
    low part (the argument of the polynomial corrections). Degree-2
    coefficients for cos(x) and sin(x)/x are computed, and the final scheme
    is delegated to ``self.generate_cos_scheme`` or
    ``self.generate_sin_scheme`` depending on ``self.cos_output``.

    :return: a Statement returning the result converted to the output
        precision
    """
    # declaring CodeFunction and retrieving input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    table_size_log = self.table_size_log
    integer_size = 31
    # NOTE(review): integer_precision is assigned but not used below
    integer_precision = ML_Int32

    # the scaling is derived from the largest magnitude of the first
    # input interval
    max_bound = sup(abs(self.input_intervals[0]))
    max_bound_log = int(ceil(log2(max_bound)))
    Log.report(Log.Info, "max_bound_log=%s " % max_bound_log)
    scaling_power = integer_size - max_bound_log
    Log.report(Log.Info, "scaling power: %s " % scaling_power)

    # format of the tabulated cosine / sine values
    # (presumably 1 integer bit and 30 fractional bits -- confirm against
    # the ML_Custom_FixedPoint_Format argument order)
    storage_precision = ML_Custom_FixedPoint_Format(1, 30, signed=True)

    Log.report(Log.Info, "tabulating cosine and sine")
    # cosine and sine fused table
    fused_table = ML_NewTable(
        dimensions=[2**table_size_log, 2],
        storage_precision=storage_precision,
        tag="fast_lib_shared_table")  # self.uniquify_name("cossin_table"))
    # filling table
    for i in range(2**table_size_log):
        # table entry i corresponds to the point i * 2^(max_bound_log - table_size_log)
        local_x = i / S2**table_size_log * S2**max_bound_log

        cos_local = cos(
            local_x
        )  # nearestint(cos(local_x) * S2**storage_precision.get_frac_size())

        sin_local = sin(
            local_x
        )  # nearestint(sin(local_x) * S2**storage_precision.get_frac_size())

        # column 0 holds the cosine, column 1 the sine
        fused_table[i][0] = cos_local
        fused_table[i][1] = sin_local

    # argument reduction evaluation scheme
    # scaling_factor = Constant(S2**scaling_power, precision = self.precision)

    red_vx_precision = ML_Custom_FixedPoint_Format(31 - scaling_power,
                                                   scaling_power,
                                                   signed=True)
    Log.report(
        Log.Verbose, "red_vx_precision.get_c_bit_size()=%d" %
        red_vx_precision.get_c_bit_size())
    # red_vx = NearestInteger(vx * scaling_factor, precision = integer_precision)
    red_vx = Conversion(vx,
                        precision=red_vx_precision,
                        tag="red_vx",
                        debug=debug_fixed32)

    computation_precision = red_vx_precision  # self.precision
    # NOTE(review): output_precision is only used by the log message below;
    # the final Conversion calls self.get_output_precision() again
    output_precision = self.get_output_precision()
    Log.report(Log.Info,
               "computation_precision is %s" % computation_precision)
    Log.report(Log.Info, "storage_precision     is %s" % storage_precision)
    Log.report(Log.Info, "output_precision      is %s" % output_precision)

    # mask with the (table_size_log + 1) most significant bits of a 32-bit
    # word set: it isolates the high part of the reduced argument
    hi_mask_value = 2**32 - 2**(32 - table_size_log - 1)
    hi_mask = Constant(hi_mask_value, precision=ML_Int32)
    Log.report(Log.Info, "hi_mask=0x%x" % hi_mask_value)

    red_vx_hi_int = BitLogicAnd(TypeCast(red_vx, precision=ML_Int32),
                                hi_mask,
                                precision=ML_Int32,
                                tag="red_vx_hi_int",
                                debug=debugd)
    red_vx_hi = TypeCast(red_vx_hi_int,
                         precision=red_vx_precision,
                         tag="red_vx_hi",
                         debug=debug_fixed32)
    # low part: what remains of the reduced argument once the masked high
    # part has been removed
    red_vx_lo = red_vx - red_vx_hi
    red_vx_lo.set_attributes(precision=red_vx_precision,
                             tag="red_vx_lo",
                             debug=debug_fixed32)
    # the table index is obtained by shifting the integer view of the
    # reduced argument to the right
    table_index = BitLogicRightShift(TypeCast(red_vx, precision=ML_Int32),
                                     scaling_power -
                                     (table_size_log - max_bound_log),
                                     precision=ML_Int32,
                                     tag="table_index",
                                     debug=debugd)

    tabulated_cos = TableLoad(fused_table,
                              table_index,
                              0,
                              tag="tab_cos",
                              precision=storage_precision,
                              debug=debug_fixed32)
    tabulated_sin = TableLoad(fused_table,
                              table_index,
                              1,
                              tag="tab_sin",
                              precision=storage_precision,
                              debug=debug_fixed32)

    # error metric handed to the polynomial builder; only the polynomial p,
    # the function f and the interval ai are used
    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    Log.report(Log.Info, "building polynomial approximation for cosine")
    # cosine polynomial approximation
    poly_interval = Interval(0, S2**(max_bound_log - table_size_log))
    Log.report(Log.Info, "poly_interval=%s " % poly_interval)
    # NOTE(review): cos_poly_degree is assigned but not used below, the
    # monomial list [0, 2] is hard-coded in the call
    cos_poly_degree = 2  # int(sup(guessdegree(cos(x), poly_interval, accuracy_goal)))

    Log.report(Log.Verbose, "cosine polynomial approximation")
    cos_poly_object, cos_approx_error = Polynomial.build_from_approximation_with_error(
        cos(sollya.x), [0, 2],
        [0] + [computation_precision.get_bit_size()],
        poly_interval,
        sollya.absolute,
        error_function=error_function)
    #cos_eval_scheme = PolynomialSchemeEvaluator.generate_horner_scheme(cos_poly_object, red_vx_lo, unified_precision = computation_precision)
    Log.report(Log.Info, "cos_approx_error=%e" % cos_approx_error)
    cos_coeff_list = cos_poly_object.get_ordered_coeff_list()
    # NOTE(review): coeff_C0 is extracted but not used below
    coeff_C0 = cos_coeff_list[0][1]
    coeff_C2 = Constant(cos_coeff_list[1][1],
                        precision=ML_Custom_FixedPoint_Format(-1,
                                                              32,
                                                              signed=True))

    Log.report(Log.Info, "building polynomial approximation for sine")

    # sine polynomial approximation
    sin_poly_degree = 2  # int(sup(guessdegree(sin(x)/x, poly_interval, accuracy_goal)))
    Log.report(Log.Info, "sine poly degree: %e" % sin_poly_degree)
    Log.report(Log.Verbose, "sine polynomial approximation")
    # NOTE(review): the format list below has sin_poly_degree + 2 entries
    # for 2 monomials, whereas the cosine call passes 2 entries -- confirm
    # how the builder treats the extra entries
    sin_poly_object, sin_approx_error = Polynomial.build_from_approximation_with_error(
        sin(sollya.x) / sollya.x, [0, 2], [0] +
        [computation_precision.get_bit_size()] * (sin_poly_degree + 1),
        poly_interval,
        sollya.absolute,
        error_function=error_function)
    sin_coeff_list = sin_poly_object.get_ordered_coeff_list()
    # NOTE(review): coeff_S0 is extracted but not used below
    coeff_S0 = sin_coeff_list[0][1]
    coeff_S2 = Constant(sin_coeff_list[1][1],
                        precision=ML_Custom_FixedPoint_Format(-1,
                                                              32,
                                                              signed=True))

    # scheme selection between sine and cosine
    if self.cos_output:
        scheme = self.generate_cos_scheme(computation_precision,
                                          tabulated_cos, tabulated_sin,
                                          coeff_S2, coeff_C2, red_vx_lo)
    else:
        scheme = self.generate_sin_scheme(computation_precision,
                                          tabulated_cos, tabulated_sin,
                                          coeff_S2, coeff_C2, red_vx_lo)

    result = Conversion(scheme, precision=self.get_output_precision())

    Log.report(
        Log.Verbose, "result operation tree :\n %s " % result.get_str(
            display_precision=True, depth=None, memoization_map={}))
    scheme = Statement(Return(result))

    return scheme
def generate_scheme(self):
    """Build a scheme applying an elementary function to every array entry.

    The generated function takes a destination pointer ``dst``, a source
    pointer ``src`` and a length ``len``. The main loop processes
    ``self.multi_elt_num`` element(s) per iteration; when that number is
    greater than 1 an epilog loop processes the remaining entries one at a
    time.

    The per-element computation is either a call to the external function
    named by ``self.use_libm_function`` or an inlined / vectorized scheme
    built from ``self.function_ctor``.

    :return: the main Loop, or a Statement chaining the main loop and the
        epilog loop
    """
    # declaring target and instantiating optimization engine
    precision_ptr = self.get_input_precision(0)
    index_format = self.get_input_precision(2)
    multi_elt_num = self.multi_elt_num

    dst = self.implementation.add_input_variable("dst", precision_ptr)
    src = self.implementation.add_input_variable("src", precision_ptr)
    n = self.implementation.add_input_variable("len", index_format)

    i = Variable("i", precision=index_format, var_type=Variable.Local)
    CU0 = Constant(0, precision=index_format)

    element_format = self.precision

    # list reset here and extended with the scalar callback function
    # produced by the vectorization branch below
    self.function_list = []

    if multi_elt_num > 1:
        # several elements per iteration: switch to the vector format
        element_format = VECTOR_TYPE_MAP[self.precision][multi_elt_num]

    elt_input = TableLoad(src, i, precision=element_format)

    local_exp = Variable("local_exp",
                         precision=element_format,
                         var_type=Variable.Local)

    if self.use_libm_function:
        # NOTE(review): the external function is declared with ML_Binary32
        # input and output whatever self.precision is -- confirm
        libm_fct_operator = FunctionOperator(self.use_libm_function,
                                             arity=1)
        libm_fct = FunctionObject(self.use_libm_function, [ML_Binary32],
                                  ML_Binary32, libm_fct_operator)

        if multi_elt_num > 1:
            # the scalar function is called on each vector lane and the
            # results are re-assembled into a vector
            result_list = [
                libm_fct(
                    VectorElementSelection(elt_input,
                                           Constant(elt_id,
                                                    precision=ML_Integer),
                                           precision=self.precision))
                for elt_id in range(multi_elt_num)
            ]
            result = VectorAssembling(*result_list,
                                      precision=element_format)
        else:
            result = libm_fct(elt_input)
        elt_result = ReferenceAssign(local_exp, result)
    else:
        if multi_elt_num > 1:
            # NOTE(review): scalar_result is created but not used below
            scalar_result = Variable("scalar_result",
                                     precision=self.precision,
                                     var_type=Variable.Local)
            fct_ctor_args = self.function_ctor.get_default_args(
                precision=self.precision,
                libm_compliant=False,
            )

            meta_function = self.function_ctor(fct_ctor_args)
            exponential_scheme = meta_function.generate_scheme()

            # instanciating required passes for typing
            pass_inst_abstract_prec = PassInstantiateAbstractPrecision(
                self.processor)
            pass_inst_prec = PassInstantiatePrecision(
                self.processor, default_precision=None)

            # exectuting format instanciation passes on optree
            exponential_scheme = pass_inst_abstract_prec.execute_on_optree(
                exponential_scheme)
            exponential_scheme = pass_inst_prec.execute_on_optree(
                exponential_scheme)

            vectorizer = StaticVectorizer()

            # extracting scalar argument from meta_exponential meta function
            scalar_input = meta_function.implementation.arg_list[0]

            # vectorize scalar scheme
            vector_result, vec_arg_list, vector_scheme, scalar_callback, scalar_callback_fct = vectorize_function_scheme(
                vectorizer, self.get_main_code_object(),
                exponential_scheme, element_format.get_scalar_format(),
                [scalar_input], multi_elt_num)

            elt_result = inline_function(vector_scheme, vector_result,
                                         {vec_arg_list[0]: elt_input})

            # from here on local_exp designates the vectorized result
            # variable rather than the "local_exp" variable declared above
            local_exp = vector_result

            self.function_list.append(scalar_callback_fct)
            # the scalar callback is reused by the epilog loop
            libm_fct = scalar_callback

        else:
            scalar_input = elt_input
            scalar_result = local_exp

            elt_result = generate_inline_fct_scheme(
                self.function_ctor, scalar_result, [scalar_input], {
                    "precision": self.precision,
                    "libm_compliant": False
                })

    CU1 = Constant(1, precision=index_format)

    local_exp_init_value = Constant(0, precision=self.precision)

    if multi_elt_num > 1:
        local_exp_init_value = Constant([0] * multi_elt_num,
                                        precision=element_format)
        # the main loop stops at the largest multiple of multi_elt_num
        # not exceeding n
        remain_n = Modulo(n, multi_elt_num, precision=index_format)
        iter_n = n - remain_n
        CU_ELTNUM = Constant(multi_elt_num, precision=index_format)
        inc = i + CU_ELTNUM
    else:
        remain_n = None
        iter_n = n
        inc = i + CU1

    # main loop processing multi_elt_num element(s) per iteration
    main_loop = Loop(
        ReferenceAssign(i, CU0),
        i < iter_n,
        Statement(ReferenceAssign(local_exp, local_exp_init_value),
                  elt_result,
                  TableStore(local_exp, dst, i, precision=ML_Void),
                  ReferenceAssign(i, inc)),
    )
    # epilog to process remaining item (when the length is not a multiple
    # of multi_elt_num)
    if not remain_n is None:
        # TODO/FIXME: try alternative method for processing epilog
        #             by using full vector length and mask
        # the epilog has an empty init statement: it resumes from the value
        # of i left by the main loop
        epilog_loop = Loop(
            Statement(), i < n,
            Statement(
                TableStore(libm_fct(
                    TableLoad(src, i, precision=self.precision)),
                           dst,
                           i,
                           precision=ML_Void),
                ReferenceAssign(i, i + CU1),
            ))
        main_loop = Statement(main_loop, epilog_loop)

    return main_loop
def generate_scheme(self):
    """Build a two-pass scheme: exponential of each entry, then normalization.

    The generated function takes a destination pointer ``dst``, a source
    pointer ``src`` and a length ``len``. A first loop stores the
    exponential of every source entry into ``dst`` while accumulating their
    sum; a second loop multiplies every ``dst`` entry by the reciprocal of
    that sum.

    The exponential is either a call to ``expf`` (when
    ``self.use_libm_function`` is set) or the inlined ML_Exponential scheme.

    :return: the Statement chaining accumulator reset, both loops and the
        reciprocal node
    """
    # declaring target and instantiating optimization engine
    pointer_format = self.get_input_precision(0)
    index_format = self.get_input_precision(2)

    dst = self.implementation.add_input_variable("dst", pointer_format)
    src = self.implementation.add_input_variable("src", pointer_format)
    n = self.implementation.add_input_variable("len", index_format)

    i = Variable("i", precision=index_format, var_type=Variable.Local)
    one = Constant(1, precision=index_format)
    zero = Constant(0, precision=index_format)

    # increment node used by the normalization loop
    next_index = i + one

    source_entry = TableLoad(src, i, precision=self.precision)

    local_exp = Variable("local_exp",
                         precision=self.precision,
                         var_type=Variable.Local)

    if self.use_libm_function:
        expf_operator = FunctionOperator("expf", arity=1)
        expf_function = FunctionObject(
            "expf", [ML_Binary32], ML_Binary32, expf_operator)
        compute_exp = ReferenceAssign(local_exp, expf_function(source_entry))
    else:
        exp_args = ML_Exponential.get_default_args(
            precision=self.precision,
            libm_compliant=False,
            debug=False,
        )
        exp_meta_function = ML_Exponential(exp_args)
        exp_scheme = exp_meta_function.generate_scheme()
        # the scalar scheme is inlined with its argument bound to the
        # current source entry
        compute_exp = inline_function(
            exp_scheme,
            local_exp,
            {exp_meta_function.implementation.arg_list[0]: source_entry},
        )

    elt_acc = Variable("elt_acc",
                       precision=self.precision,
                       var_type=Variable.Local)

    # first pass: dst[i] = exp(src[i]) and elt_acc += exp(src[i])
    exp_loop_body = Statement(
        ReferenceAssign(local_exp, 0),
        compute_exp,
        TableStore(local_exp, dst, i, precision=ML_Void),
        ReferenceAssign(elt_acc, elt_acc + local_exp),
        ReferenceAssign(i, i + one))
    exp_loop = Loop(
        ReferenceAssign(i, zero),
        i < n,
        exp_loop_body,
    )

    sum_rcp = Division(1,
                       elt_acc,
                       precision=self.precision,
                       tag="sum_rcp",
                       debug=debug_multi)

    # second pass: dst[i] = dst[i] * (1 / elt_acc)
    scaled_entry = Multiplication(
        TableLoad(dst, i, precision=self.precision), sum_rcp)
    div_loop_body = Statement(
        TableStore(scaled_entry, dst, i, precision=ML_Void),
        ReferenceAssign(i, next_index))
    div_loop = Loop(
        ReferenceAssign(i, zero),
        i < n,
        div_loop_body,
    )

    return Statement(ReferenceAssign(elt_acc, 0), exp_loop, sum_rcp, div_loop)
def get_array_test_wrapper(self,
                           test_num,
                           tested_function,
                           table_size_offset_array,
                           input_tables,
                           output_array,
                           acc_num,
                           post_statement_generator,
                           NUM_INPUT_ARRAY=1):
    """ generate a test loop for multi-array tests

        Each iteration loads the length of the current sub-array, calls the
        tested function on the sub-array (output pointer, input pointers,
        length), executes the post statement and adds the sub-array length
        to acc_num.

        @param test_num number of elementary array tests to be executed
        @param tested_function FunctionObject to be tested
        @param table_size_offset_array ML_NewTable object containing
                (table-size, offset) pairs for multi-array testing
        @param input_tables list of ML_NewTable containing multi-array
                test inputs
        @param output_array ML_NewTable containing multi-array test outputs
        @param acc_num variable accumulating the number of processed
                entries
        @param post_statement_generator is generator used to generate
                a statement executed at the end of the test of one of the
                arrays of the multi-test. It expects 6 arguments:
                (input_tables, output_array, table_size_offset_array,
                 array_offset, array_len, test_id)
        @param NUM_INPUT_ARRAY number of input tables passed to the tested
                function
        @return a Statement containing the test loop
    """
    test_id = Variable("test_id", precision=ML_Int32, var_type=Variable.Local)
    test_num_cst = Constant(test_num, precision=ML_Int32, tag="test_num")

    array_len = Variable("len", precision=ML_UInt32, var_type=Variable.Local)

    # column 1 of the (size, offset) table: start of the current sub-array
    array_offset = TableLoad(table_size_offset_array, test_id, 1)

    def shifted_pointer(table_addr, offset):
        """ build the address of the entry located offset after table_addr """
        return Addition(
            table_addr, offset,
            precision=table_addr.get_precision_as_pointer_format())

    input_pointers = tuple(
        shifted_pointer(input_tables[table_index], array_offset)
        for table_index in range(NUM_INPUT_ARRAY))
    output_pointer = shifted_pointer(output_array, array_offset)
    function_call = tested_function(output_pointer, *input_pointers, array_len)

    post_statement = post_statement_generator(
        input_tables, output_array, table_size_offset_array,
        array_offset, array_len, test_id)

    # column 0 of the (size, offset) table: length of the current sub-array
    load_length = ReferenceAssign(
        array_len, TableLoad(table_size_offset_array, test_id, 0))
    accumulate_length = ReferenceAssign(
        acc_num,
        acc_num + Conversion(array_len, precision=acc_num.precision))
    next_test = ReferenceAssign(test_id, test_id + 1)

    test_loop = Loop(
        ReferenceAssign(test_id, Constant(0, precision=ML_Int32)),
        test_id < test_num_cst,
        Statement(
            load_length,
            function_call,
            post_statement,
            accumulate_length,
            next_test,
        ),
    )

    # adding functional test_loop to test statement
    test_statement = Statement()
    test_statement.add(test_loop)
    return test_statement
def generate_array_check_loop(self, input_tables, output_array,
                              table_size_offset_array, array_offset,
                              array_len, test_id):
    """Build the loop checking one sub-array of a multi-array test.

    The loop walks the index ``j`` from 0 up to ``array_len``, evaluates
    ``self.accuracy.get_output_check_test`` on the computed entry and its
    expected value(s) and, when that test fires, executes an error
    statement which prints the inputs, the result and the expected values.

    When ``self.break_error`` is false the error statement also returns the
    constant 1 from the generated function; when it is true the generated
    loop carries on after printing.

    :param input_tables: tables containing the multi-array test inputs
    :param output_array: table containing the multi-array test outputs
    :param table_size_offset_array: table of (size, offset) pairs,
        forwarded to ``self.generate_expected_table``
    :param array_offset: node giving the start of the sub-array
    :param array_len: node giving the length of the sub-array
    :param test_id: index of the sub-array (not used by this method)
    :return: the check Loop
    """
    # internal array iterator index
    vj = Variable("j", precision=ML_UInt32, var_type=Variable.Local)

    printf_input_function = self.get_printf_input_function()

    NUM_INPUT_ARRAY = len(input_tables)

    # generate the expected table for the whole multi-array
    expected_table = self.generate_expected_table(input_tables,
                                                  table_size_offset_array)

    # inputs for the (vj)-th entry of the sub-array
    local_inputs = tuple(
        TableLoad(input_tables[in_id], array_offset + vj)
        for in_id in range(NUM_INPUT_ARRAY))
    # expected values for the (vj)-th entry of the sub-array
    expected_values = [
        TableLoad(expected_table, array_offset + vj, i)
        for i in range(self.accuracy.get_num_output_value())
    ]
    # local result for the (vj)-th entry of the sub-array
    local_result = TableLoad(output_array, array_offset + vj)

    # statements executed when the check test detects a mismatch
    # (the original break_error branch referenced an undefined
    # `output_values` name; both branches must print expected_values)
    error_operations = [
        printf_input_function(*((vj, ) + local_inputs + (local_result, ))),
        self.accuracy.get_output_print_call(self.function_name,
                                            expected_values),
    ]
    if not self.break_error:
        error_operations.append(Return(Constant(1, precision=ML_Int32)))
    return_statement_break = Statement(*error_operations)

    # loop implementation to check sub-array array_offset
    # results validity
    check_array_loop = Loop(
        ReferenceAssign(vj, 0),
        vj < array_len,
        Statement(
            ConditionBlock(
                self.accuracy.get_output_check_test(
                    local_result, expected_values),
                return_statement_break),
            ReferenceAssign(vj, vj + 1),
        ))
    return check_array_loop
def piecewise_approximation(function,
                            variable,
                            precision,
                            bound_low=-1.0,
                            bound_high=1.0,
                            num_intervals=16,
                            max_degree=2,
                            error_threshold=sollya.S2**-24):
    """ Generate a piecewise approximation

        The interval [bound_low, bound_high] is split into num_intervals
        sub-intervals of equal size. One polynomial per sub-interval is
        tabulated (max_degree + 1 coefficients per row) and the returned
        scheme selects the row from the value of variable before evaluating
        the polynomial with a Horner scheme.

        :param function: function to be approximated (called on sollya
            expressions)
        :param variable: input variable node
        :param precision: format of the variable, of the coefficients and
            of the evaluation
        :param bound_low: lower bound for the approximation interval
        :param bound_high: upper bound for the approximation interval
        :param num_intervals: number of sub-intervals
        :param max_degree: maximum degree for an approximation on any
            sub-interval, also the number of tabulated coefficients minus 1
        :param error_threshold: error target used to guess the degree
            required on each sub-interval

        :return: pair (scheme, error) where scheme is a graph node for an
            approximation scheme of function evaluated at variable, and
            error is the maximum approximation error encountered
    """
    # table to store coefficients of the approximation on each segment
    coeff_table = ML_NewTable(dimensions=[num_intervals, max_degree + 1],
                              storage_precision=precision,
                              tag="coeff_table")

    error_function = lambda p, f, ai, mod, t: sollya.dirtyinfnorm(p - f, ai)
    max_approx_error = 0.0
    interval_size = (bound_high - bound_low) / num_intervals

    for i in range(num_intervals):
        subint_low = bound_low + i * interval_size
        subint_high = bound_low + (i + 1) * interval_size

        # the function is translated so that the polynomial variable is the
        # offset from the lower bound of the sub-interval
        local_function = function(sollya.x + subint_low)
        local_interval = Interval(-interval_size, interval_size)

        # guessdegree yields an interval of candidate degrees: its upper
        # bound is converted to a plain integer before being compared with
        # max_degree and used to build ranges / list sizes
        local_degree = int(sollya.sup(
            sollya.guessdegree(local_function, local_interval,
                               error_threshold)))
        degree = min(max_degree, local_degree)

        if function(subint_low) == 0.0:
            # if the lower bound is a zero to the function, we
            # need to force value=0 for the constant coefficient
            # and extend the approximation interval
            degree_list = list(range(1, degree + 1))
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                function(sollya.x),
                degree_list,
                [precision] * len(degree_list),
                Interval(-subint_high, subint_high),
                sollya.absolute,
                error_function=error_function)
        else:
            try:
                poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                    local_function,
                    degree,
                    [precision] * (degree + 1),
                    local_interval,
                    sollya.absolute,
                    error_function=error_function)
            except SollyaError:
                print("degree: {}".format(degree))
                raise

        # Every row is filled up to max_degree (missing monomials are
        # zeros): the per-interval degree may be lower than max_degree and
        # the evaluation scheme below reads max_degree + 1 coefficients
        # whatever the selected row is.
        for ci in range(max_degree + 1):
            if ci in poly_object.coeff_map:
                coeff_table[i][ci] = poly_object.coeff_map[ci]
            else:
                coeff_table[i][ci] = 0.0
        max_approx_error = max(max_approx_error, abs(approx_error))

    # computing offset
    diff = Subtraction(variable,
                       Constant(bound_low, precision=precision),
                       tag="diff",
                       precision=precision)
    # delta = bound_high - bound_low
    delta_ratio = Constant(num_intervals / (bound_high - bound_low),
                           precision=precision)
    # computing table index
    # index = nearestint(diff / delta * <num_intervals>)
    # the index is clamped to [0, num_intervals - 1]
    index = Max(0,
                Min(
                    NearestInteger(Multiplication(diff,
                                                  delta_ratio,
                                                  precision=precision),
                                   precision=ML_Int32),
                    num_intervals - 1),
                tag="index",
                debug=True,
                precision=ML_Int32)
    # polynomial variable: distance between the input and the start of the
    # selected sub-interval
    poly_var = Subtraction(diff,
                           Multiplication(
                               Conversion(index, precision=precision),
                               Constant(interval_size, precision=precision)),
                           precision=precision,
                           tag="poly_var",
                           debug=True)
    # generating indexed polynomial
    # The coefficient count comes from max_degree (the table width) and not
    # from the `degree` left over by the last loop iteration, which could
    # drop high-order coefficients of other sub-intervals.
    coeffs = [(ci, TableLoad(coeff_table, index, ci))
              for ci in range(max_degree + 1)][::-1]
    poly_scheme = PolynomialSchemeEvaluator.generate_horner_scheme2(
        coeffs, poly_var, precision, {}, precision)
    return poly_scheme, max_approx_error