def test_ref_assign(self):
    """ test behavior of StaticVectorizer on predicated ReferenceAssign """
    va = Variable("a")
    vb = Variable("b")
    vc = Variable("c")
    scheme = Statement(
        ReferenceAssign(va, Constant(3)),
        ConditionBlock(
            (va > vb).modify_attributes(likely=True),
            Statement(
                ReferenceAssign(vb, va),
                ReferenceAssign(va, Constant(11)),
                Return(va)),
        ),
        ReferenceAssign(va, Constant(7)),
        Return(vb))
    vectorized_path = StaticVectorizer().extract_vectorizable_path(
        scheme, fallback_policy)
    linearized_most_likely_path = instanciate_variable(
        vectorized_path.linearized_optree, vectorized_path.variable_mapping)
    test_result = (isinstance(linearized_most_likely_path, Constant)
                   and linearized_most_likely_path.get_value() == 11)
    if not test_result:
        print("test UT_StaticVectorizer failure")
        print("scheme: {}".format(scheme.get_str()))
        print("linearized_most_likely_path: {}".format(
            linearized_most_likely_path))
    self.assertTrue(test_result)
def generate_scheme(self):
    size_format = ML_Int32

    # Matrix storage
    in_storage = self.implementation.add_input_variable(
        "buffer_in", ML_Pointer_Format(self.precision))
    kernel_storage = self.implementation.add_input_variable(
        "buffer_kernel", ML_Pointer_Format(self.precision))
    out_storage = self.implementation.add_input_variable(
        "buffer_out", ML_Pointer_Format(self.precision))

    # Matrix sizes
    w = self.implementation.add_input_variable("w", size_format)
    h = self.implementation.add_input_variable("h", size_format)

    # input is a (w x h) matrix in row-major order
    tIn = Tensor(in_storage,
                 TensorDescriptor([w, h], [1, w], self.precision))
    # kernel strides accumulate the products of the previous dimensions
    # (innermost dimension has stride 1)
    kernel_strides = [1]
    for previous_dim in self.kernel_size[:-1]:
        kernel_strides.append(previous_dim * kernel_strides[-1])
    print("kernel_strides: {}".format(kernel_strides))
    tKernel = Tensor(
        kernel_storage,
        TensorDescriptor(self.kernel_size, kernel_strides, self.precision))
    # output is a (w x h) matrix in row-major order
    tOut = Tensor(out_storage,
                  TensorDescriptor([w, h], [1, w], self.precision))

    index_format = ML_Int32

    # main NDRange description
    i = Variable("i", precision=index_format, var_type=Variable.Local)
    j = Variable("j", precision=index_format, var_type=Variable.Local)
    k_w = Variable("k_w", precision=index_format, var_type=Variable.Local)
    k_h = Variable("k_h", precision=index_format, var_type=Variable.Local)
    result = NDRange(
        [IterRange(i, 0, w - 1), IterRange(j, 0, h - 1)],
        WriteAccessor(
            tOut, [i, j],
            Sum(
                Sum(
                    Multiplication(
                        ReadAccessor(tIn, [i + k_w, j - k_h],
                                     self.precision),
                        ReadAccessor(tKernel, [k_w, k_h], self.precision)),
                    IterRange(k_w, -(self.kernel_size[0] - 1) // 2,
                              (self.kernel_size[0] - 1) // 2),
                    precision=self.precision),
                IterRange(k_h, -(self.kernel_size[1] - 1) // 2,
                          (self.kernel_size[1] - 1) // 2),
                precision=self.precision)))

    mdl_scheme = expand_ndrange(result)
    print("mdl_scheme:\n{}".format(mdl_scheme.get_str(depth=None)))
    return Statement(mdl_scheme, Return())
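# A pure-Python model of the centered 2D convolution the NDRange above
# describes (illustrative sketch only: plain lists stand in for the
# Tensor/ReadAccessor machinery, out-of-range taps are skipped, and the
# kernel is assumed stored with its center shifted to non-negative
# indices; the metalibm accessors themselves do none of this handling).
def ref_conv2d(data, w, h, kernel, kw, kh):
    """data/out are flat row-major lists of length w*h (strides [1, w]);
    kernel is a flat list of length kw*kh with odd kw, kh."""
    out = [0.0] * (w * h)
    cw, ch = (kw - 1) // 2, (kh - 1) // 2
    for i in range(w):
        for j in range(h):
            acc = 0.0
            for k_h in range(-ch, ch + 1):
                for k_w in range(-cw, cw + 1):
                    # mirrors ReadAccessor(tIn, [i + k_w, j - k_h])
                    x, y = i + k_w, j - k_h
                    if 0 <= x < w and 0 <= y < h:
                        acc += data[x + y * w] * \
                               kernel[(k_w + cw) + (k_h + ch) * kw]
            out[i + j * w] = acc
    return out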
def generate_scheme(self):
    size_format = ML_Int32

    # Matrix storage
    A_storage = self.implementation.add_input_variable(
        "buffer_a", ML_Pointer_Format(self.precision))
    B_storage = self.implementation.add_input_variable(
        "buffer_b", ML_Pointer_Format(self.precision))
    C_storage = self.implementation.add_input_variable(
        "buffer_c", ML_Pointer_Format(self.precision))

    # Matrix sizes
    n = self.implementation.add_input_variable("n", size_format)
    m = self.implementation.add_input_variable("m", size_format)
    p = self.implementation.add_input_variable("p", size_format)

    # A is a (n x p) matrix in row-major order
    tA = Tensor(A_storage, TensorDescriptor([p, n], [1, p], self.precision))
    # B is a (p x m) matrix in row-major order
    tB = Tensor(B_storage, TensorDescriptor([m, p], [1, m], self.precision))
    # C is a (n x m) matrix in row-major order
    tC = Tensor(C_storage, TensorDescriptor([m, n], [1, m], self.precision))

    index_format = ML_Int32

    i = Variable("i", precision=index_format, var_type=Variable.Local)
    j = Variable("j", precision=index_format, var_type=Variable.Local)
    k = Variable("k", precision=index_format, var_type=Variable.Local)
    result = NDRange(
        [IterRange(j, 0, m - 1), IterRange(i, 0, n - 1)],
        WriteAccessor(
            tC, [j, i],
            Sum(
                Multiplication(
                    ReadAccessor(tA, [k, i], self.precision),
                    ReadAccessor(tB, [j, k], self.precision),
                    precision=self.precision),
                IterRange(k, 0, p - 1),
                precision=self.precision)))

    if self.vectorize:
        mdl_scheme = expand_ndrange(vectorize_ndrange(result, j, 4))
    else:
        mdl_scheme = expand_ndrange(
            exchange_loop_order(tile_ndrange(result, {j: 2, i: 2}), [1, 0]))
    print("mdl_scheme:\n{}".format(
        mdl_scheme.get_str(depth=None, display_precision=True)))
    return Statement(
        mdl_scheme,
        Return()
    )
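# A plain-Python reference of the same row-major matrix product, to make
# the accessor/stride convention explicit (illustrative sketch only):
def ref_matmul(A, B, n, m, p):
    """A is n*p, B is p*m, C is n*m, all flat row-major lists.
    ReadAccessor(tA, [k, i]) resolves to A[k*1 + i*p] == A[i][k];
    ReadAccessor(tB, [j, k]) resolves to B[j*1 + k*m] == B[k][j];
    WriteAccessor(tC, [j, i]) resolves to C[j*1 + i*m] == C[i][j]."""
    C = [0.0] * (n * m)
    for j in range(m):
        for i in range(n):
            acc = 0.0
            for k in range(p):
                acc += A[k + i * p] * B[j + k * m]
            C[j + i * m] = acc
    return C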
def __init__(self, register_id, register_format, reg_tag, var_tag=None, **kw):
    """ the register tag is stored as the inner Variable's name and the
        original variable's name is stored in self.var_tag """
    #reg_tag = "unnamed-reg" if reg_tag is None else reg_tag
    # indirection toward the register's tag (if _reg_tag's value is None,
    # then the register tag is undefined)
    self._reg_tag = reg_tag
    Variable.__init__(self, self.reg_tag, precision=register_format, **kw)
    self.var_tag = var_tag
    self.register_id = register_id
def generate_test_wrapper(self, tensor_descriptors, input_tables,
                          output_tables):
    auto_test = CodeFunction("test_wrapper", output_format=ML_Int32)

    tested_function = self.implementation.get_function_object()
    function_name = self.implementation.get_name()

    failure_report_op = FunctionOperator("report_failure")
    failure_report_function = FunctionObject("report_failure", [], ML_Void,
                                             failure_report_op)

    printf_success_op = FunctionOperator(
        "printf",
        arg_map={0: "\"test successful %s\\n\"" % function_name},
        void_function=True,
        require_header=["stdio.h"])
    printf_success_function = FunctionObject("printf", [], ML_Void,
                                             printf_success_op)

    # accumulate element number
    acc_num = Variable("acc_num", precision=ML_Int64,
                       var_type=Variable.Local)

    test_loop = self.get_tensor_test_wrapper(
        tested_function, tensor_descriptors, input_tables, output_tables,
        acc_num, self.generate_tensor_check_loop)

    # common test scheme between scalar and vector functions
    test_scheme = Statement(test_loop, printf_success_function(),
                            Return(Constant(0, precision=ML_Int32)))
    auto_test.set_scheme(test_scheme)
    return FunctionGroup([auto_test])
def tile_ndrange(ndrange, tile, index_format=ML_Int32):
    """ transform ndrange so that it iterates over sub-tiles of size tile
        rather than over single elements (a new NDRange is returned).
        tile is a dict(var_index -> tile_dim) """
    # The transformation is performed by replacing each range involving
    # one of the variables from tile by a range whose step is the tile's
    # dimension, and then adding a sub-iterange using a sub-alias for the
    # tile's variable whose range is [0; tile's dimension - 1]
    new_var_range_list = []
    var_transform_map = {}
    kernel_var_range_list = []
    # transform var_range_list
    for iter_range in ndrange.var_range_list:
        var_index = iter_range.var_index
        if var_index in tile:
            tile_dim = tile[var_index]
            new_iter_range = IterRange(var_index, iter_range.first_index,
                                       iter_range.last_index,
                                       index_step=tile_dim)
            new_var_range_list.append(new_iter_range)
            sub_var = Variable("sub_%s" % var_index.get_tag(),
                               precision=index_format,
                               var_type=Variable.Local)
            sub_var_range = IterRange(sub_var, var_index,
                                      var_index + tile_dim - 1)
            kernel_var_range_list.append(sub_var_range)
            var_transform_map[iter_range.var_index] = sub_var_range
        else:
            new_var_range_list.append(iter_range)
    # tile kernel
    new_kernel = substitute_var(ndrange.kernel, var_transform_map)
    sub_ndrange = NDRange(kernel_var_range_list, new_kernel)
    return NDRange(new_var_range_list, sub_ndrange)
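# What the transformation amounts to on a plain loop nest, for one tiled
# variable with tile dimension T (illustrative model, assuming the range
# length is a multiple of T since tile_ndrange does not peel remainders):
def tiled_loop(first, last, T, body):
    for i in range(first, last + 1, T):   # outer range, step = tile dim
        for sub_i in range(i, i + T):     # sub-iterange [i; i + T - 1]
            body(sub_i)                   # kernel with i replaced by sub_i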
def generate_inline_scheme(self, vx):
    """ generate a pair <variable, scheme> where scheme is the operation
        graph computing self's function on vx and variable is the result
        variable """
    result_var = Variable("r", precision=self.get_precision(),
                          var_type=Variable.Local)
    scalar_scheme = self.generate_scalar_scheme(vx)
    result_scheme = inline_function(scalar_scheme, result_var, {vx: vx})
    return result_var, result_scheme
def generate_tensor_check_loop(self, tensor_descriptors, input_tables,
                               output_tables):
    # unpack tensor descriptors tuple
    (input_tensor_descriptor_list,
     output_tensor_descriptor_list) = tensor_descriptors
    # internal array iterator index
    vj = Variable("j", precision=ML_UInt32, var_type=Variable.Local)

    printf_error_detail_function = self.get_printf_error_detail_fct(
        output_tensor_descriptor_list[0])

    NUM_INPUT_ARRAY = len(input_tables)

    # generate the expected table for the whole multi-array
    expected_tables = self.generate_expected_table(tensor_descriptors,
                                                   input_tables)

    # global statement gathering all checks
    check_statement = Statement()

    # implement a check for each output tensor
    for out_id, out_td in enumerate(output_tensor_descriptor_list):
        # expected values for the (vj)-th entry of the sub-array
        expected_values = [
            TableLoad(expected_tables[out_id], vj, i)
            for i in range(self.accuracy.get_num_output_value())
        ]
        # local result for the (vj)-th entry of the sub-array
        local_result = TableLoad(output_tables[out_id], vj)
        array_len = out_td.get_bounding_size()

        if self.break_error:
            return_statement_break = Statement(
                printf_error_detail_function(*((vj,) + (local_result,))),
                self.accuracy.get_output_print_call(self.function_name,
                                                    expected_values))
        else:
            return_statement_break = Statement(
                printf_error_detail_function(*((vj,) + (local_result,))),
                self.accuracy.get_output_print_call(self.function_name,
                                                    expected_values),
                Return(Constant(1, precision=ML_Int32)))

        check_array_loop = Loop(
            ReferenceAssign(vj, 0), vj < array_len,
            Statement(
                ConditionBlock(
                    self.accuracy.get_output_check_test(
                        local_result, expected_values),
                    return_statement_break),
                ReferenceAssign(vj, vj + 1),
            ))
        check_statement.add(check_array_loop)
    return check_statement
def rec_bb_processing(bb):
    """ perform variable renaming in the basic block @p bb and recursively
        in bb's children in the dominator tree """
    Log.report(LOG_LEVEL_GEN_BB_VERBOSE, "processing bb {}", bb)
    # because a node can be duplicated between its declaration and its use
    # in a subsequent operation in the same basic block, we must make sure
    # it is processed only once by update_used_var for a given <var>.
    # Thus for each <var> we store a memoization_map of processed nodes
    updated_used_var_memoization_map = {}

    def get_mem_map(var):
        """ return the updated_used_var memoization_map associated
            to @p var """
        if not var in updated_used_var_memoization_map:
            updated_used_var_memoization_map[var] = {}
        return updated_used_var_memoization_map[var]

    for op in bb.get_inputs():
        if op in memoization_map:
            continue
        else:
            memoization_map[op] = None
        Log.report(LOG_LEVEL_GEN_BB_VERBOSE, "processing op {}", op)
        if not isinstance(op, PhiNode):
            for var in get_var_used_by_non_phi(op):
                Log.report(LOG_LEVEL_GEN_BB_VERBOSE,
                           "processing var {} used by non-phi node", var)
                updating_reaching_def(bbg, reaching_def, var, op)
                Log.report(LOG_LEVEL_GEN_BB_VERBOSE,
                           "updating var from {} to {} used by non-phi node",
                           var, reaching_def[var])
                local_mem_map = get_mem_map(var)
                update_used_var(op, var, reaching_def[var],
                                memoization_map=local_mem_map)
                # to avoid multiple updates we merge the output memoization
                # table into the table of the destination variable, so the
                # last time the destination variable is considered for
                # update it will discard all updates made during this BB
                # processing
                get_mem_map(reaching_def[var]).update(local_mem_map)
        for var in get_var_def_by_op(op):
            updating_reaching_def(bbg, reaching_def, var, op)
            vp = Variable("%s_%d" % (var.get_tag(), new_var_index(var)),
                          precision=var.get_precision())
            # TODO: tag
            update_def_var(op, var, vp)
            reaching_def[vp] = reaching_def[var]
            reaching_def[var] = vp
            bbg.variable_defs[vp] = op

    Log.report(LOG_LEVEL_GEN_BB_VERBOSE, "processing phi in successor")
    for phi in get_phi_list_in_bb_successor(bb):
        for index, var, var_bb in get_indexed_var_used_by_phi(phi):
            Log.report(LOG_LEVEL_GEN_BB_VERBOSE,
                       "processing operand #{} of phi: {}, var_bb is {}",
                       index, var, var_bb)
            if not isinstance(var_bb, EmptyOperand):
                continue
            # updating_reaching_def(bbg, reaching_def, var, phi)
            update_indexed_used_var_in_phi(phi, index, var,
                                           reaching_def[var], bb)
            break

    # finally traverse the sub-tree
    if bb in bbg.dominator_tree:
        for child in bbg.dominator_tree[bb]:
            rec_bb_processing(child)
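# The renaming discipline above, reduced to straight-line code (one basic
# block, no phis): every definition of v gets a fresh version v_k, and
# every later use reads the latest version. Illustrative sketch only:
def rename_straightline(ops):
    """ops is a list of (dest, [srcs]) tuples; returns the renamed list."""
    version = {}
    def fresh(v):
        version[v] = version.get(v, -1) + 1
        return "%s_%d" % (v, version[v])
    def current(v):
        return "%s_%d" % (v, version[v]) if v in version else v
    renamed = []
    for dest, srcs in ops:
        new_srcs = [current(s) for s in srcs]    # uses read the reaching def
        renamed.append((fresh(dest), new_srcs))  # defs create a new version
    return renamed

# rename_straightline([("a", []), ("b", ["a"]), ("a", ["a", "b"])])
# -> [("a_0", []), ("b_0", ["a_0"]), ("a_1", ["a_0", "b_0"])]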
def simplify_inverse(optree, processor):
    dummy_var = Variable("dummy_var_seed", precision=optree.get_precision())
    dummy_div_seed = DivisionSeed(dummy_var,
                                  precision=optree.get_precision())
    inv_approx_table = processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    seed_input = optree.inputs[0]
    c0 = Constant(0, precision=ML_Int32)

    if optree.get_precision() == inv_approx_table.get_storage_precision():
        return TableLoad(inv_approx_table,
                         inv_approx_table.get_index_function()(seed_input),
                         c0, precision=optree.get_precision())
    else:
        return Conversion(
            TableLoad(inv_approx_table,
                      inv_approx_table.get_index_function()(seed_input),
                      c0,
                      precision=inv_approx_table.get_storage_precision()),
            precision=optree.get_precision())
def generate_scalar_scheme(self, vx, vy):
    div = Division(vx, vy, precision=self.precision)
    div_if = Trunc(div, precision=self.precision)
    rem = Variable("rem", var_type=Variable.Local,
                   precision=self.precision)
    qi = Variable("qi", var_type=Variable.Local,
                  precision=self.precision)
    qi_bound = Constant(S2**self.precision.get_mantissa_size())
    init_rem = FusedMultiplyAdd(-div_if, vy, vx)

    # factorizing 1 / vy would save time, but NOTE that it would make
    # rem / vy approximate
    # shared_rcp = Division(1, vy, precision=self.precision)

    iterative_fmod = Loop(
        Statement(
            ReferenceAssign(rem, init_rem),
            ReferenceAssign(qi, div_if),
        ),
        Abs(qi) > qi_bound,
        Statement(
            ReferenceAssign(
                qi,
                #Trunc(shared_rcp * rem, precision=self.precision)
                Trunc(rem / vy, precision=self.precision)),
            ReferenceAssign(rem, FMA(-qi, vy, rem))))
    scheme = Statement(
        rem,
        # shared_rcp,
        iterative_fmod,
        ConditionBlock(
            # if rem's sign and vx's sign mismatch
            (rem * vx < 0.0).modify_attributes(tag="update_cond",
                                               debug=debug_multi),
            Return(rem + vy),
            Return(rem),
        ))
    return scheme
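# A floating-point model of the scheme above (illustrative only): the
# quotient is truncated, the remainder is updated with an FMA step, and
# the loop repeats while the partial quotient is too large to be exact;
# the final sign test mirrors the ConditionBlock. math.fma only exists
# from Python 3.13, hence the fallback.
import math

def ref_fmod(x, y, mantissa_size=53):
    fma = getattr(math, "fma", lambda a, b, c: a * b + c)
    qi_bound = 2.0**mantissa_size
    qi = math.trunc(x / y)
    rem = fma(-qi, y, x)
    while abs(qi) > qi_bound:
        qi = math.trunc(rem / y)
        rem = fma(-qi, y, rem)
    # if rem's sign and x's sign mismatch, fold rem back into range
    return rem + y if rem * x < 0.0 else rem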
def instanciate_fct_call(node, precision):
    """ replace a FunctionCall node by the actual function scheme """
    vx_list = [
        node.get_input(i) for i in range(node.get_function_object().arity)
    ]
    func_name = node.get_function_object().name
    fct_ctor, fct_args, fct_range_function = FUNCTION_MAP[func_name]
    var_result = Variable("local_result", precision=precision,
                          var_type=Variable.Local)
    local_args = {"precision": precision, "libm_compliant": False}
    local_args.update(fct_args)
    fct_scheme = generate_inline_fct_scheme(fct_ctor, var_result, vx_list,
                                            local_args)
    return var_result, fct_scheme
def get_tensor_test_wrapper(self, tested_function, tensor_descriptors,
                            input_tables, output_tables, acc_num,
                            post_statement_generator, NUM_INPUT_ARRAY=1):
    """ generate a test loop for multi-array tests
        @param tested_function FunctionObject to be tested
        @param tensor_descriptors pair of (input, output) tensor
               descriptor lists
        @param input_tables ML_NewTable objects containing multi-array
               test inputs
        @param output_tables ML_NewTable objects containing multi-array
               test outputs
        @param acc_num Variable accumulating the number of tested elements
        @param post_statement_generator generator used to build a
               statement executed at the end of the test of one of the
               arrays of the multi-test """
    array_len = Variable("len", precision=ML_UInt32,
                         var_type=Variable.Local)

    def pointer_add(table_addr, offset):
        pointer_format = table_addr.get_precision_as_pointer_format()
        return Addition(table_addr, offset, precision=pointer_format)

    array_inputs = tuple(input_tables[in_id]
                         for in_id in range(NUM_INPUT_ARRAY))
    function_call = tested_function(*(self.get_ordered_arg_tuple(
        tensor_descriptors, input_tables, output_tables)))

    post_statement = post_statement_generator(tensor_descriptors,
                                              input_tables, output_tables)

    test_statement = Statement(
        function_call,
        post_statement,
    )
    return test_statement
def expand_kernel_expr(kernel, iterator_format=ML_Int32):
    """ Expand a kernel expression into the corresponding MDL graph """
    if isinstance(kernel, NDRange):
        return expand_ndrange(kernel)
    elif isinstance(kernel, Sum):
        var_iter = kernel.index_iter_range.var_index
        # TODO/FIXME to be uniquified
        acc = Variable("acc", var_type=Variable.Local,
                       precision=kernel.precision)
        # TODO/FIXME implement proper acc init
        if kernel.precision.is_vector_format():
            C0 = Constant([0] * kernel.precision.get_vector_size(),
                          precision=kernel.precision)
        else:
            C0 = Constant(0, precision=kernel.precision)
        scheme = Loop(
            Statement(
                ReferenceAssign(var_iter,
                                kernel.index_iter_range.first_index),
                ReferenceAssign(acc, C0)),
            var_iter <= kernel.index_iter_range.last_index,
            Statement(
                ReferenceAssign(
                    acc,
                    Addition(acc,
                             expand_kernel_expr(kernel.elt_operation),
                             precision=kernel.precision)),
                # loop iterator increment
                ReferenceAssign(
                    var_iter,
                    var_iter + kernel.index_iter_range.index_step)))
        return PlaceHolder(acc, scheme)
    elif isinstance(kernel, (ReadAccessor, WriteAccessor)):
        return expand_accessor(kernel)
    elif is_leaf_node(kernel):
        return kernel
    else:
        # vanilla metalibm ops are left unmodified (except for the
        # recursive expansion of their inputs)
        for index, op in enumerate(kernel.inputs):
            new_op = expand_kernel_expr(op)
            kernel.set_input(index, new_op)
        return kernel
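# The Loop built for a Sum node corresponds to this plain accumulation
# pattern (illustrative model; first/last/step stand for the IterRange
# fields and elt(i) for the expanded element operation):
def ref_sum(first, last, step, elt):
    acc = 0          # proper init of acc is still a FIXME above
    i = first
    while i <= last:
        acc += elt(i)
        i += step
    return acc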
def vectorize_function_scheme(vectorizer, name_factory, scalar_scheme,
                              scalar_output_format, scalar_arg_list,
                              vector_size, sub_vector_size=None):
    """ Use a vectorization engine @p vectorizer to vectorize the
        sub-graph @p scalar_scheme, that is transforming its inputs and
        outputs from scalar to vector and performing the required
        internal path duplication """

    sub_vector_size = vector_size if sub_vector_size is None \
                      else sub_vector_size

    vec_arg_list, vector_scheme, vector_mask = \
        vectorizer.vectorize_scheme(scalar_scheme, scalar_arg_list,
                                    vector_size, sub_vector_size)

    vector_output_format = vectorize_format(scalar_output_format,
                                            vector_size)

    vec_res = Variable("vec_res", precision=vector_output_format,
                       var_type=Variable.Local)

    vector_mask.set_attributes(tag="vector_mask", debug=debug_multi)

    callback_name = "scalar_callback"
    scalar_callback_fct = generate_function_from_optree(
        name_factory, scalar_scheme, scalar_arg_list, callback_name,
        scalar_output_format)
    scalar_callback = scalar_callback_fct.get_function_object()

    if no_scalar_fallback_required(vector_mask):
        # no lane can diverge: return the vector result directly
        function_scheme = Statement(
            Return(vector_scheme, precision=vector_output_format))
    else:
        # some lanes may require the scalar fallback: wrap the vector
        # scheme with a masked dispatch to the scalar callback
        function_scheme = generate_c_vector_wrapper(
            vector_size, vec_arg_list, vector_scheme, vector_mask,
            vec_res, scalar_callback)

    return vec_res, vec_arg_list, function_scheme, scalar_callback, \
           scalar_callback_fct
def generate_scheme(self):
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info,
               "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        if self.libm_compliant:
            return RaiseReturn(*args, precision=self.precision, **kwords)
        else:
            return Return(kwords["return_value"], precision=self.precision)

    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=debug_multi, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=debug_multi,
                    tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual,
                               debug=debug_multi, tag="inf_sign")

    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN,
                              debug=debug_multi, tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid,
                       return_value=FP_QNaN(self.precision)))

    # return in case of infinity input
    infty_return = Statement(
        ConditionBlock(
            test_positive,
            Return(FP_PlusInfty(self.precision), precision=self.precision),
            Return(FP_PlusZero(self.precision),
                   precision=self.precision)))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(
            test_signaling_nan, return_snan,
            Return(FP_QNaN(self.precision), precision=self.precision)),
        infty_return)
    # return in case of standard (non-special) input

    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = sollya.ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx, exp_overflow_bound,
                                     likely=False,
                                     specifier=Comparison.Greater)
    early_overflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2**precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx, exp_underflow_bound,
                                      likely=False,
                                      specifier=Comparison.Less)
    early_underflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Underflow,
                       return_value=FP_PlusZero(self.precision)))

    # constant computation
    invlog2 = self.precision.round_sollya_object(1 / log(2), sollya.RN)

    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)),
                          sollya.ceil(sup(interval_fk)))

    log2_hi_precision = self.precision.get_field_size() - (
        sollya.ceil(log2(sup(abs(interval_k)))) + 2)
    Log.report(Log.Info, "log2_hi_precision: %d" % log2_hi_precision)
    invlog2_cst = Constant(invlog2, precision=self.precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = self.precision.round_sollya_object(log(2) - log2_hi,
                                                 sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag="unround_k", debug=debug_multi)
    k = NearestInteger(unround_k, precision=self.precision,
                       debug=debug_multi)
    ik = NearestInteger(unround_k,
                        precision=self.precision.get_integer_format(),
                        debug=debug_multi, tag="ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact=True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact=True, tag="exact_hi",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    exact_lo_part = -k * log2_lo
    exact_lo_part.set_attributes(tag="exact_lo", debug=debug_multi,
                                 prevent_optimization=True)
    r = exact_hi_part + exact_lo_part
    r.set_tag("r")
    r.set_attributes(debug=debug_multi)

    approx_interval = Interval(-log(2) / 2, log(2) / 2)

    approx_interval_half = approx_interval / 2
    approx_interval_split = [
        Interval(-log(2) / 2, inf(approx_interval_half)),
        approx_interval_half,
        Interval(sup(approx_interval_half), log(2) / 2)
    ]

    # TODO: should be computed automatically
    exact_hi_interval = approx_interval
    exact_lo_interval = -interval_k * log2_lo

    opt_r = self.optimise_scheme(r, copy={})

    tag_map = {}
    self.opt_engine.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx: Variable("x", precision=self.precision,
                     interval=interval_vx),
        tag_map["k"]: Variable("k", interval=interval_k,
                               precision=self.precision)
    }

    #try:
    if is_gappa_installed():
        eval_error = self.gappa_engine.get_eval_error_v2(
            self.opt_engine, opt_r, cg_eval_error_copy_map,
            gappa_filename="red_arg.g")
    else:
        eval_error = 0.0
        Log.report(Log.Warning,
                   "gappa is not installed in this environment")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    local_ulp = sup(ulp(sollya.exp(approx_interval), self.precision))
    # FIXME refactor error_goal from accuracy
    Log.report(Log.Info, "accuracy: %s" % self.accuracy)
    if isinstance(self.accuracy, ML_Faithful):
        error_goal = local_ulp
    elif isinstance(self.accuracy, ML_CorrectlyRounded):
        error_goal = S2**-1 * local_ulp
    elif isinstance(self.accuracy, ML_DegradedAccuracyAbsolute):
        error_goal = self.accuracy.goal
    elif isinstance(self.accuracy, ML_DegradedAccuracyRelative):
        error_goal = self.accuracy.goal
    else:
        Log.report(Log.Error, "unknown accuracy: %s" % self.accuracy)

    # error_goal = local_ulp  #S2**-(self.precision.get_field_size()+1)
    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info,
               "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = max(
        sup(
            guessdegree(
                expm1(sollya.x) / sollya.x, approx_interval,
                error_goal_approx)) - 1, 2)
    init_poly_degree = poly_degree

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    polynomial_scheme_builder = \
        PolynomialSchemeEvaluator.generate_estrin_scheme
    #polynomial_scheme_builder = \
    #    PolynomialSchemeEvaluator.generate_horner_scheme

    while 1:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        precision_list = [1] + [self.precision] * (poly_degree)
        poly_object, poly_approx_error = \
            Polynomial.build_from_approximation_with_error(
                expm1(sollya.x), poly_degree, precision_list,
                approx_interval, sollya.absolute,
                error_function=error_function)
        Log.report(Log.Info, "polynomial: %s " % poly_object)
        sub_poly = poly_object.sub_poly(start_index=2)
        Log.report(Log.Info, "polynomial: %s " % sub_poly)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(
            Log.Info,
            "\033[33;1m generating polynomial evaluation scheme \033[0m")
        pre_poly = polynomial_scheme_builder(
            poly_object, r, unified_precision=self.precision)
        pre_poly.set_attributes(tag="pre_poly", debug=debug_multi)

        pre_sub_poly = polynomial_scheme_builder(
            sub_poly, r, unified_precision=self.precision)
        pre_sub_poly.set_attributes(tag="pre_sub_poly", debug=debug_multi)

        poly = 1 + (exact_hi_part + (exact_lo_part + pre_sub_poly))
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation
        #opt_poly = self.opt_engine.optimization_process(
        #    poly, self.precision, fuse_fma=fuse_fma)
        #opt_sub_poly = self.opt_engine.optimization_process(
        #    pre_sub_poly, self.precision, fuse_fma=fuse_fma)
        opt_poly = self.optimise_scheme(poly)
        opt_sub_poly = self.optimise_scheme(pre_sub_poly)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r", precision=self.precision,
                               interval=approx_interval)
        exact_hi_gappa_var = Variable("exact_hi",
                                      precision=self.precision,
                                      interval=exact_hi_interval)
        exact_lo_gappa_var = Variable("exact_lo",
                                      precision=self.precision,
                                      interval=exact_lo_interval)
        vx_gappa_var = Variable("x", precision=self.precision,
                                interval=interval_vx)
        k_gappa_var = Variable("k", interval=interval_k,
                               precision=self.precision)

        #print "exact_hi interval: ", exact_hi_interval

        sub_poly_error_copy_map = {
            #r.get_handle().get_node(): r_gappa_var,
            #vx.get_handle().get_node(): vx_gappa_var,
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
            #k.get_handle().get_node(): k_gappa_var,
        }

        poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        if is_gappa_installed():
            sub_poly_eval_error = -1.0
            sub_poly_eval_error = self.gappa_engine.get_eval_error_v2(
                self.opt_engine, opt_sub_poly, sub_poly_error_copy_map,
                gappa_filename="%s_gappa_sub_poly.g" % self.function_name)

            dichotomy_map = [
                {
                    exact_hi_part.get_handle().get_node():
                        approx_interval_split[0],
                },
                {
                    exact_hi_part.get_handle().get_node():
                        approx_interval_split[1],
                },
                {
                    exact_hi_part.get_handle().get_node():
                        approx_interval_split[2],
                },
            ]
            poly_eval_error_dico = self.gappa_engine.get_eval_error_v3(
                self.opt_engine, opt_poly, poly_error_copy_map,
                gappa_filename="gappa_poly.g", dichotomy=dichotomy_map)

            poly_eval_error = max(
                [sup(abs(err)) for err in poly_eval_error_dico])
        else:
            poly_eval_error = 0.0
            sub_poly_eval_error = 0.0
            Log.report(Log.Warning,
                       "gappa is not installed in this environment")
            Log.report(Log.Info, "stopping autonomous degree research")
            # incrementing polynomial degree to counteract the initial
            # decrementation effect
            poly_degree += 1
            break

        Log.report(Log.Info,
                   "poly evaluation error: %s" % poly_eval_error)
        Log.report(Log.Info,
                   "sub poly evaluation error: %s" % sub_poly_eval_error)

        global_poly_error = None
        global_rel_poly_error = None

        for case_index in range(3):
            poly_error = poly_approx_error + \
                poly_eval_error_dico[case_index]
            rel_poly_error = sup(
                abs(poly_error /
                    sollya.exp(approx_interval_split[case_index])))
            if global_rel_poly_error is None or \
                    rel_poly_error > global_rel_poly_error:
                global_rel_poly_error = rel_poly_error
                global_poly_error = poly_error
        flag = error_goal > global_rel_poly_error
        if flag:
            break
        else:
            poly_degree += 1

    late_overflow_test = Comparison(ik, self.precision.get_emax(),
                                    specifier=Comparison.Greater,
                                    likely=False, debug=debug_multi,
                                    tag="late_overflow_test")
    overflow_exp_offset = (self.precision.get_emax() -
                           self.precision.get_field_size() / 2)
    diff_k = Subtraction(
        ik,
        Constant(overflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        debug=debug_multi, tag="diff_k",
    )
    late_overflow_result = (ExponentInsertion(
        diff_k, precision=self.precision) * poly) * ExponentInsertion(
            overflow_exp_offset, precision=self.precision)
    late_overflow_result.set_attributes(silent=False,
                                        tag="late_overflow_result",
                                        debug=debug_multi,
                                        precision=self.precision)
    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier=Test.IsInfty, likely=False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)),
        Return(late_overflow_result, precision=self.precision))

    late_underflow_test = Comparison(k, self.precision.get_emin_normal(),
                                     specifier=Comparison.LessOrEqual,
                                     likely=False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    corrected_exp = Addition(
        ik,
        Constant(underflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        tag="corrected_exp")
    late_underflow_result = (
        ExponentInsertion(corrected_exp, precision=self.precision) *
        poly) * ExponentInsertion(-underflow_exp_offset,
                                  precision=self.precision)
    late_underflow_result.set_attributes(debug=debug_multi,
                                         tag="late_underflow_result",
                                         silent=False)
    test_subnormal = Test(late_underflow_result,
                          specifier=Test.IsSubnormal)
    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value=late_underflow_result)),
        Return(late_underflow_result, precision=self.precision))

    twok = ExponentInsertion(ik, tag="exp_ik", debug=debug_multi,
                             precision=self.precision)
    #std_result = twok * ((1 + exact_hi_part * pre_poly) +
    #                     exact_lo_part * pre_poly)
    std_result = twok * poly
    std_result.set_attributes(tag="std_result", debug=debug_multi)
    result_scheme = ConditionBlock(
        late_overflow_test, late_overflow_return,
        ConditionBlock(late_underflow_test, late_underflow_return,
                       Return(std_result, precision=self.precision)))
    std_return = ConditionBlock(
        early_overflow_test, early_overflow_return,
        ConditionBlock(early_underflow_test, early_underflow_return,
                       result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(
        test_nan_or_inf,
        Statement(
            ClearException() if self.libm_compliant else Statement(),
            specific_return),
        std_return)

    return scheme
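# Numeric sanity check of the argument reduction used above, in plain
# Python (illustrative sketch): with k = round(x / log 2) and
# r = (x - k*log2_hi) - k*log2_lo, exp(x) reconstructs as 2**k * exp(r)
# and |r| <= log(2)/2. The log2_hi split below (44 hex-truncated bits)
# is one possible choice, not the value the scheme computes.
import math

def ref_exp_reduction(x):
    invlog2 = 1.0 / math.log(2.0)
    # split log(2) into a high part (so that k * log2_hi is exact for
    # small k) and a low correction part
    log2_hi = float.fromhex("0x1.62e42fefa38p-1")
    log2_lo = math.log(2.0) - log2_hi
    k = round(x * invlog2)
    r = (x - k * log2_hi) - k * log2_lo
    assert abs(r) <= math.log(2.0) / 2 + 1e-12
    return math.ldexp(math.exp(r), k)   # 2**k * exp(r) ~ exp(x)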
def generate_scheme(self):
    """ main scheme generation """
    int_size = 3
    frac_size = self.width - int_size

    input_precision = fixed_point(int_size, frac_size)
    output_precision = fixed_point(int_size, frac_size)

    expected_interval = {}

    # declaring main input variables
    var_x = self.implementation.add_input_signal("x", input_precision)
    x_interval = Interval(-10.3, 10.7)
    var_x.set_interval(x_interval)
    expected_interval[var_x] = x_interval

    var_y = self.implementation.add_input_signal("y", input_precision)
    y_interval = Interval(-17.9, 17.2)
    var_y.set_interval(y_interval)
    expected_interval[var_y] = y_interval

    var_z = self.implementation.add_input_signal("z", input_precision)
    z_interval = Interval(-7.3, 7.7)
    var_z.set_interval(z_interval)
    expected_interval[var_z] = z_interval

    cst = Constant(42.5, tag="cst")
    expected_interval[cst] = Interval(42.5)

    conv_ceil = Ceil(var_x, tag="ceil")
    expected_interval[conv_ceil] = sollya.ceil(x_interval)

    conv_floor = Floor(var_y, tag="floor")
    expected_interval[conv_floor] = sollya.floor(y_interval)

    mult = var_z * var_x
    mult.set_tag("mult")
    mult_interval = z_interval * x_interval
    expected_interval[mult] = mult_interval

    large_add = (var_x + var_y) - mult
    large_add.set_attributes(tag="large_add")
    large_add_interval = (x_interval + y_interval) - mult_interval
    expected_interval[large_add] = large_add_interval

    var_x_lzc = CountLeadingZeros(var_x, tag="var_x_lzc")
    expected_interval[var_x_lzc] = Interval(
        0, input_precision.get_bit_size())

    reduced_result = Max(0, Min(large_add, 13))
    reduced_result.set_tag("reduced_result")
    reduced_result_interval = interval_max(
        Interval(0), interval_min(large_add_interval, Interval(13)))
    expected_interval[reduced_result] = reduced_result_interval

    select_result = Select(var_x > var_y, reduced_result, var_z,
                           tag="select_result")
    select_interval = interval_union(reduced_result_interval, z_interval)
    expected_interval[select_result] = select_interval

    # floating-point operations on mantissa and exponent
    fp_x_range = Interval(-0.01, 100)

    unbound_fp_var = Variable("fp_x", precision=ML_Binary32,
                              interval=fp_x_range)
    mant_fp_x = MantissaExtraction(unbound_fp_var, tag="mant_fp_x",
                                   precision=ML_Binary32)
    exp_fp_x = ExponentExtraction(unbound_fp_var, tag="exp_fp_x",
                                  precision=ML_Int32)
    ins_exp_fp_x = ExponentInsertion(exp_fp_x, tag="ins_exp_fp_x",
                                     precision=ML_Binary32)

    expected_interval[unbound_fp_var] = fp_x_range
    expected_interval[exp_fp_x] = Interval(
        sollya.floor(sollya.log2(sollya.inf(abs(fp_x_range)))),
        sollya.floor(sollya.log2(sollya.sup(abs(fp_x_range)))))
    expected_interval[mant_fp_x] = Interval(1, 2)
    expected_interval[ins_exp_fp_x] = Interval(
        S2**sollya.inf(expected_interval[exp_fp_x]),
        S2**sollya.sup(expected_interval[exp_fp_x]))

    # checking interval evaluation
    for var in [var_x_lzc, exp_fp_x, unbound_fp_var, mant_fp_x,
                ins_exp_fp_x, cst, var_x, var_y, mult, large_add,
                reduced_result, select_result, conv_ceil, conv_floor]:
        interval = evaluate_range(var)
        expected = expected_interval[var]
        print("{}: {}".format(var.get_tag(), interval))
        print(" vs expected {}".format(expected))
        assert not interval is None
        assert interval == expected

    return [self.implementation]
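# A minimal model of the interval rules this unit test checks
# (illustrative sketch; intervals as (lo, hi) pairs, exact arithmetic):
def iv_add(a, b):
    return (a[0] + b[0], a[1] + b[1])

def iv_mul(a, b):
    prods = [a[0] * b[0], a[0] * b[1], a[1] * b[0], a[1] * b[1]]
    return (min(prods), max(prods))

# e.g. mult = z * x with z in [-7.3, 7.7] and x in [-10.3, 10.7]:
# iv_mul((-7.3, 7.7), (-10.3, 10.7)) -> approximately (-79.31, 82.39)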
        return Addition(
            Constant(cst0_rounded, precision=cst0_format),
            Multiplication(var_node, poly_node, precision=mul_format),
            precision=add_format), add_format.epsilon  # TODO: local error only
    else:
        Log.report(Log.Error,
                   "poly degree must be positive or null. {}, {}",
                   poly_object.degree, poly_object)


if __name__ == "__main__":
    implem_results = []
    for eps_target in [S2**-40, S2**-50, S2**-55, S2**-60, S2**-65]:
        approx_interval = Interval(-S2**-5, S2**-5)
        ctx = MLL_Context(ML_Binary64, approx_interval)
        vx = Variable("x", precision=ctx.variableFormat,
                      interval=approx_interval)
        # guessing the best degree
        poly_degree = int(
            sup(
                sollya.guessdegree(sollya.exp(sollya.x), approx_interval,
                                   eps_target)))
        # asking sollya to provide the approximation
        poly_object = Polynomial.build_from_approximation(
            sollya.exp(sollya.x), poly_degree,
            [sollya.doubledouble] * (poly_degree + 1), vx.interval)
        print("poly object is {}".format(poly_object))
        poly_graph, poly_epsilon = mll_implementpoly_horner(
            ctx, poly_object, eps_target, vx)
        print("poly_graph is {}".format(
            poly_graph.get_str(depth=None, display_precision=True)))
            Statement(
                expand_sub_ndrange(var_range_list, kernel),
                # loop iterator increment
                ReferenceAssign(
                    var_range.var_index,
                    var_range.var_index + var_range.index_step)),
        )
        return scheme

    return expand_sub_ndrange(ndrange.var_range_list, ndrange.kernel)


if __name__ == "__main__":
    size_format = ML_Int32

    # Matrix sizes
    n = Variable("n", precision=size_format)
    m = Variable("m", precision=size_format)
    p = Variable("p", precision=size_format)

    from metalibm_core.core.ml_formats import ML_Binary32
    precision = ML_Binary32

    # A is a (n x p) matrix in row-major order
    tA = Tensor(None, TensorDescriptor([p, n], [1, p], precision))
    # B is a (p x m) matrix in row-major order
    tB = Tensor(None, TensorDescriptor([m, p], [1, m], precision))
    # C is a (n x m) matrix in row-major order
    tC = Tensor(None, TensorDescriptor([m, n], [1, m], precision))

    index_format = ML_Int32
def generate_array_check_loop(self, input_tables, output_array,
                              table_size_offset_array, array_offset,
                              array_len, test_id):
    # internal array iterator index
    vj = Variable("j", precision=ML_UInt32, var_type=Variable.Local)

    printf_input_function = self.get_printf_input_function()

    printf_error_template = "printf(\"max %s error is %s \\n\", %s)" % (
        self.function_name,
        self.precision.get_display_format().format_string,
        self.precision.get_display_format().pre_process_fct("{0}"))
    printf_error_op = TemplateOperatorFormat(printf_error_template,
                                             arity=1, void_function=True,
                                             require_header=["stdio.h"])
    printf_error_function = FunctionObject("printf", [self.precision],
                                           ML_Void, printf_error_op)

    printf_max_op = FunctionOperator(
        "printf",
        arg_map={
            0: "\"max %s error is reached at input number %s \\n \"" %
               (self.function_name, "%d"),
            1: FO_Arg(0)
        },
        void_function=True,
        require_header=["stdio.h"])
    printf_max_function = FunctionObject("printf", [self.precision],
                                         ML_Void, printf_max_op)

    NUM_INPUT_ARRAY = len(input_tables)

    # generate the expected table for the whole multi-array
    expected_table = self.generate_expected_table(input_tables,
                                                  table_size_offset_array)

    # inputs for the (vj)-th entry of the sub-array
    local_inputs = tuple(
        TableLoad(input_tables[in_id], array_offset + vj)
        for in_id in range(NUM_INPUT_ARRAY))
    # expected values for the (vj)-th entry of the sub-array
    expected_values = [
        TableLoad(expected_table, array_offset + vj, i)
        for i in range(self.accuracy.get_num_output_value())
    ]
    # local result for the (vj)-th entry of the sub-array
    local_result = TableLoad(output_array, array_offset + vj)

    if self.break_error:
        return_statement_break = Statement(
            printf_input_function(*((vj,) + local_inputs +
                                    (local_result,))),
            self.accuracy.get_output_print_call(self.function_name,
                                                expected_values))
    else:
        return_statement_break = Statement(
            printf_input_function(*((vj,) + local_inputs +
                                    (local_result,))),
            self.accuracy.get_output_print_call(self.function_name,
                                                expected_values),
            Return(Constant(1, precision=ML_Int32)))

    # loop checking the validity of the results of the sub-array at
    # array_offset
    check_array_loop = Loop(
        ReferenceAssign(vj, 0), vj < array_len,
        Statement(
            ConditionBlock(
                self.accuracy.get_output_check_test(
                    local_result, expected_values),
                return_statement_break),
            ReferenceAssign(vj, vj + 1),
        ))
    return check_array_loop
def generate_scalar_scheme(self, vx, vy):
    # fixing inputs' node tags
    vx.set_attributes(tag="x")
    vy.set_attributes(tag="y")

    int_precision = self.precision.get_integer_format()

    # assuming x = m.2^e (m in [1, 2[)
    #
    # pow(x, y) = x^y
    #           = exp(y * log(x))
    #           = 2^(y * log2(x))
    #           = 2^(y * (log2(m) + e))
    #
    e = ExponentExtraction(vx, tag="e", precision=int_precision)
    m = MantissaExtraction(vx, tag="m", precision=self.precision)

    # approximation of log2(m)

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    log_f = sollya.log(sollya.x)  # /sollya.log(self.basis)

    ml_log_args = ML_GenericLog.get_default_args(precision=self.precision,
                                                 basis=2)
    ml_log = ML_GenericLog(ml_log_args)
    log_table, log_table_tho, table_index_range = \
        ml_log.generate_log_table(log_f, inv_approx_table)
    log_approx = ml_log.generate_reduced_log_split(
        Abs(m, precision=self.precision), log_f, inv_approx_table,
        log_table)

    log_approx = Select(Equal(vx, 0), FP_MinusInfty(self.precision),
                        log_approx)
    log_approx.set_attributes(tag="log_approx", debug=debug_multi)
    r = Multiplication(log_approx, vy, tag="r", debug=debug_multi)

    # 2^(y * (log2(m) + e)) = 2^(y * log2(m)) * 2^(y * e)
    #
    # log_approx = log2(Abs(m))
    # r = y * log_approx ~ y * log2(m)
    #
    # NOTES: manage cases where e is negative and (y * log2(m)) AND
    # (y * e) could cancel out; if e is positive, whatever the sign of y,
    # (y * log2(m)) and (y * e) CANNOT be of opposite signs.
    # log2(m) is in [0, 1[ so cancellation can occur only if e == -1.
    # we split 2^x as 2^x = 2^t0 * 2^t1:
    # if e < 0: t0 = y * (log2(m) + e), t1 = 0
    # else:     t0 = y * log2(m),       t1 = y * e
    t_cond = e < 0

    # e_y ~ e * y
    e_f = Conversion(e, precision=self.precision)
    #t0 = Select(t_cond, (e_f + log_approx) * vy,
    #            Multiplication(e_f, vy), tag="t0")
    #NearestInteger(t0, precision=self.precision, tag="t0_int")
    EY = NearestInteger(e_f * vy, tag="EY", precision=self.precision)
    LY = NearestInteger(log_approx * vy, tag="LY",
                        precision=self.precision)
    t0_int = Select(t_cond, EY + LY, EY, tag="t0_int")
    t0_frac = Select(t_cond,
                     FMA(e_f, vy, -EY) + FMA(log_approx, vy, -LY),
                     EY - t0_int,
                     tag="t0_frac")
    #t0_frac.set_attributes(tag="t0_frac")

    ml_exp2_args = ML_Exp2.get_default_args(precision=self.precision)
    ml_exp2 = ML_Exp2(ml_exp2_args)
    exp2_t0_frac = ml_exp2.generate_scalar_scheme(t0_frac,
                                                  inline_select=True)
    exp2_t0_frac.set_attributes(tag="exp2_t0_frac", debug=debug_multi)

    exp2_t0_int = ExponentInsertion(
        Conversion(t0_int, precision=int_precision),
        precision=self.precision, tag="exp2_t0_int")

    t1 = Select(t_cond, Constant(0, precision=self.precision), r)
    exp2_t1 = ml_exp2.generate_scalar_scheme(t1, inline_select=True)
    exp2_t1.set_attributes(tag="exp2_t1", debug=debug_multi)

    result_sign = Constant(1.0, precision=self.precision)
    # Select(n_is_odd, CopySign(vx, Constant(1.0,
    #        precision=self.precision)), 1)

    y_int = NearestInteger(vy, precision=self.precision)
    y_is_integer = Equal(y_int, vy)
    y_is_even = LogicalOr(
        # if y is a number (exc. inf) greater than 2**mantissa_size * 2,
        # then it is an integer multiple of 2 => even
        Abs(vy) >= 2**(self.precision.get_mantissa_size() + 1),
        LogicalAnd(
            # we want to limit the modulo computation to integer inputs
            LogicalAnd(
                y_is_integer,
                Abs(vy) < 2**(self.precision.get_mantissa_size() + 1)),
            Equal(Modulo(Conversion(y_int, precision=int_precision), 2),
                  0)))
    y_is_odd = LogicalAnd(
        LogicalAnd(
            Abs(vy) < 2**(self.precision.get_mantissa_size() + 1),
            y_is_integer),
        Equal(Modulo(Conversion(y_int, precision=int_precision), 2), 1))

    # special cases management
    special_case_results = Statement(
        # x is sNaN OR y is sNaN
        ConditionBlock(
            LogicalOr(Test(vx, specifier=Test.IsSignalingNaN),
                      Test(vy, specifier=Test.IsSignalingNaN)),
            Return(FP_QNaN(self.precision))),
        # pow(x, ±0) is 1 if x is not a signaling NaN
        ConditionBlock(
            Test(vy, specifier=Test.IsZero),
            Return(Constant(1.0, precision=self.precision))),
        # pow(±0, y) is ±∞ and signals the divideByZero exception for
        # y an odd integer < 0
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(y_is_odd, vy < 0)),
            Return(Select(Test(vx, specifier=Test.IsPositiveZero),
                          FP_PlusInfty(self.precision),
                          FP_MinusInfty(self.precision)))),
        # pow(±0, −∞) is +∞ with no exception
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       Test(vy, specifier=Test.IsNegativeInfty)),
            Return(FP_PlusInfty(self.precision))),
        # pow(±0, +∞) is +0 with no exception
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       Test(vy, specifier=Test.IsPositiveInfty)),
            Return(FP_PlusZero(self.precision))),
        # pow(±0, y) is ±0 for finite y > 0 an odd integer
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(y_is_odd, vy > 0)),
            Return(vx)),
        # pow(−1, ±∞) is 1 with no exception
        ConditionBlock(
            LogicalAnd(Equal(vx, -1),
                       Test(vy, specifier=Test.IsInfty)),
            Return(Constant(1.0, precision=self.precision))),
        # pow(+1, y) is 1 for any y (even a quiet NaN)
        ConditionBlock(
            vx == 1,
            Return(Constant(1.0, precision=self.precision))),
        # pow(x, +∞) is +0 for −1 < x < 1
        ConditionBlock(
            LogicalAnd(Abs(vx) < 1,
                       Test(vy, specifier=Test.IsPositiveInfty)),
            Return(FP_PlusZero(self.precision))),
        # pow(x, +∞) is +∞ for x < −1 or for 1 < x (including ±∞)
        ConditionBlock(
            LogicalAnd(Abs(vx) > 1,
                       Test(vy, specifier=Test.IsPositiveInfty)),
            Return(FP_PlusInfty(self.precision))),
        # pow(x, −∞) is +∞ for −1 < x < 1
        ConditionBlock(
            LogicalAnd(Abs(vx) < 1,
                       Test(vy, specifier=Test.IsNegativeInfty)),
            Return(FP_PlusInfty(self.precision))),
        # pow(x, −∞) is +0 for x < −1 or for 1 < x (including ±∞)
        ConditionBlock(
            LogicalAnd(Abs(vx) > 1,
                       Test(vy, specifier=Test.IsNegativeInfty)),
            Return(FP_PlusZero(self.precision))),
        # pow(+∞, y) is +0 for a number y < 0
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsPositiveInfty), vy < 0),
            Return(FP_PlusZero(self.precision))),
        # pow(+∞, y) is +∞ for a number y > 0
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsPositiveInfty), vy > 0),
            Return(FP_PlusInfty(self.precision))),
        # pow(−∞, y) is −0 for finite y < 0 an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(y_is_odd, vy < 0)),
            Return(FP_MinusZero(self.precision))),
        # pow(−∞, y) is −∞ for finite y > 0 an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(y_is_odd, vy > 0)),
            Return(FP_MinusInfty(self.precision))),
        # pow(−∞, y) is +0 for finite y < 0 and not an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(LogicalNot(y_is_odd), vy < 0)),
            Return(FP_PlusZero(self.precision))),
        # pow(−∞, y) is +∞ for finite y > 0 and not an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(LogicalNot(y_is_odd), vy > 0)),
            Return(FP_PlusInfty(self.precision))),
        # pow(±0, y) is +∞ and signals the divideByZero exception for
        # finite y < 0 and not an odd integer
        # TODO: signal divideByZero exception
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(LogicalNot(y_is_odd), vy < 0)),
            Return(FP_PlusInfty(self.precision))),
        # pow(±0, y) is +0 for finite y > 0 and not an odd integer
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(LogicalNot(y_is_odd), vy > 0)),
            Return(FP_PlusZero(self.precision))),
    )

    # manage y=1 separately to avoid catastrophic propagation of errors
    # between log2 and exp2 when eventually computing the identity
    # function (test-case #3)
    result = Statement(
        special_case_results,
        # fallback default cases
        Return(result_sign * exp2_t1 * exp2_t0_int * exp2_t0_frac))
    return result
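# Plain-Python model of the y parity classification used above
# (illustrative sketch): above 2**(mantissa_size + 1) every float is an
# even integer; below that bound, integrality and parity can be tested
# exactly. Assumes y is finite (NaN/∞ are handled by earlier tests).
def classify_y(y, mantissa_size=53):
    bound = 2.0**(mantissa_size + 1)
    y_int = round(y)
    y_is_integer = (y_int == y)
    y_is_even = abs(y) >= bound or \
        (y_is_integer and abs(y) < bound and y_int % 2 == 0)
    y_is_odd = abs(y) < bound and y_is_integer and y_int % 2 == 1
    return y_is_integer, y_is_even, y_is_odd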
def solve_eval_error(self, gappa_init_approx, gappa_current_approx,
                     div_approx, gappa_vx, gappa_vy, inv_iteration_list,
                     div_iteration_list, seed_accuracy, seed_interval):
    """ compute the evaluation error of the reciprocal approximation of
        (1 / gappa_vy)

        :param seed_accuracy: absolute error for the seed value
        :type seed_accuracy: SollyaObject
    """
    seed_var = Variable("seed", precision=self.precision,
                        interval=seed_interval)
    cg_eval_error_copy_map = {
        gappa_init_approx.get_handle().get_node(): seed_var,
        gappa_vy.get_handle().get_node(): Variable(
            "y", precision=self.precision, interval=Interval(1, 2)),
        gappa_vx.get_handle().get_node(): Variable(
            "x", precision=self.precision, interval=Interval(1, 2)),
    }

    yerr_last = div_iteration_list[-1].yerr

    # copying cg_eval_error_copy_map to allow mutation during
    # optimise_scheme while keeping a clean copy for later use
    optimisation_copy_map = cg_eval_error_copy_map.copy()
    gappa_current_approx = self.optimise_scheme(
        gappa_current_approx, copy=optimisation_copy_map)
    div_approx = self.optimise_scheme(div_approx,
                                      copy=optimisation_copy_map)
    yerr_last = self.optimise_scheme(yerr_last,
                                     copy=optimisation_copy_map)
    yerr_last.get_handle().set_node(yerr_last)
    G1 = Constant(1, precision=ML_Exact)
    exact_recp = G1 / gappa_vy
    exact_recp.set_precision(ML_Exact)
    exact_recp.set_tag("exact_recp")
    recp_approx_error_goal = gappa_current_approx - exact_recp
    recp_approx_error_goal.set_attributes(precision=ML_Exact,
                                          tag="recp_approx_error_goal")

    gappacg = GappaCodeGenerator(self.processor, declare_cst=False,
                                 disable_debug=True)
    gappa_code = GappaCodeObject()

    exact_div = gappa_vx * exact_recp
    exact_div.set_attributes(precision=ML_Exact, tag="exact_div")
    div_approx_error_goal = div_approx - exact_div
    div_approx_error_goal.set_attributes(precision=ML_Exact,
                                         tag="div_approx_error_goal")

    bound_list = [op for op in cg_eval_error_copy_map]

    gappacg.add_goal(gappa_code, yerr_last)

    gappa_code = gappacg.get_interval_code(
        [recp_approx_error_goal, div_approx_error_goal], bound_list,
        cg_eval_error_copy_map, gappa_code=gappa_code,
        register_bound_hypothesis=False)

    for node in bound_list:
        gappacg.add_hypothesis(
            gappa_code, cg_eval_error_copy_map[node],
            cg_eval_error_copy_map[node].get_interval())

    new_exact_recp_node = exact_recp.get_handle().get_node()
    new_exact_div_node = exact_div.get_handle().get_node()

    # adding specific hints for the Newton-Raphson reciprocal iterations
    for nr in inv_iteration_list:
        nr.get_hint_rules(gappacg, gappa_code, new_exact_recp_node)

    for div_iter in div_iteration_list:
        div_iter.get_hint_rules(gappacg, gappa_code, new_exact_recp_node,
                                new_exact_div_node)

    seed_wrt_exact = seed_var - new_exact_recp_node
    seed_wrt_exact.set_attributes(precision=ML_Exact,
                                  tag="seed_wrt_exact")
    gappacg.add_hypothesis(gappa_code, seed_wrt_exact,
                           Interval(-seed_accuracy, seed_accuracy))

    try:
        gappa_results = execute_gappa_script_extract(
            gappa_code.get(gappacg))
        recp_eval_error = gappa_results["recp_approx_error_goal"]
        div_eval_error = gappa_results["div_approx_error_goal"]
        print("eval error(s): recp={}, div={}".format(
            recp_eval_error, div_eval_error))
    except:
        print("error during gappa run")
        raise
    return recp_eval_error, div_eval_error
def generate_scalar_scheme(self, vx, n):
    # fixing inputs' node tags
    vx.set_attributes(tag="x")
    n.set_attributes(tag="n")

    int_precision = self.precision.get_integer_format()

    # assuming x = m.2^e (m in [1, 2[)
    # n, positive or null integer
    #
    # rootn(x, n) = x^(1/n)
    #             = exp(1/n * log(x))
    #             = 2^(1/n * log2(x))
    #             = 2^(1/n * (log2(m) + e))
    #

    # approximation of log2(m)

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    log_f = sollya.log(sollya.x)  # /sollya.log(self.basis)

    use_reciprocal = False

    # non-scaled vx used to compute vx^1
    unmodified_vx = vx

    is_subnormal = Test(vx, specifier=Test.IsSubnormal,
                        tag="is_subnormal")
    exp_correction_factor = self.precision.get_mantissa_size()
    mantissa_factor = Constant(2**exp_correction_factor,
                               tag="mantissa_factor")
    vx = Select(is_subnormal, vx * mantissa_factor, vx,
                tag="corrected_vx")

    m = MantissaExtraction(vx, tag="m", precision=self.precision)
    e = ExponentExtraction(vx, tag="e", precision=int_precision)
    e = Select(is_subnormal, e - exp_correction_factor, e,
               tag="corrected_e")

    ml_log_args = ML_GenericLog.get_default_args(precision=self.precision,
                                                 basis=2)
    ml_log = ML_GenericLog(ml_log_args)
    log_table, log_table_tho, table_index_range = \
        ml_log.generate_log_table(log_f, inv_approx_table)
    log_approx = ml_log.generate_reduced_log_split(
        Abs(m, precision=self.precision), log_f, inv_approx_table,
        log_table)
    # floating-point version of n
    n_f = Conversion(n, precision=self.precision, tag="n_f")
    inv_n = Division(Constant(1, precision=self.precision), n_f)

    log_approx = Select(Equal(vx, 0), FP_MinusInfty(self.precision),
                        log_approx)
    log_approx.set_attributes(tag="log_approx", debug=debug_multi)
    if use_reciprocal:
        r = Multiplication(log_approx, inv_n, tag="r", debug=debug_multi)
    else:
        r = Division(log_approx, n_f, tag="r", debug=debug_multi)

    # e_n ~ e / n
    e_f = Conversion(e, precision=self.precision, tag="e_f")
    if use_reciprocal:
        e_n = Multiplication(e_f, inv_n, tag="e_n")
    else:
        e_n = Division(e_f, n_f, tag="e_n")
    error_e_n = FMA(e_n, -n_f, e_f, tag="error_e_n")
    e_n_int = NearestInteger(e_n, precision=self.precision,
                             tag="e_n_int")
    pre_e_n_frac = e_n - e_n_int
    pre_e_n_frac.set_attributes(tag="pre_e_n_frac")
    e_n_frac = pre_e_n_frac + error_e_n * inv_n
    e_n_frac.set_attributes(tag="e_n_frac")

    ml_exp2_args = ML_Exp2.get_default_args(precision=self.precision)
    ml_exp2 = ML_Exp2(ml_exp2_args)
    exp2_r = ml_exp2.generate_scalar_scheme(r, inline_select=True)
    exp2_r.set_attributes(tag="exp2_r", debug=debug_multi)

    exp2_e_n_frac = ml_exp2.generate_scalar_scheme(e_n_frac,
                                                   inline_select=True)
    exp2_e_n_frac.set_attributes(tag="exp2_e_n_frac", debug=debug_multi)

    exp2_e_n_int = ExponentInsertion(
        Conversion(e_n_int, precision=int_precision),
        precision=self.precision, tag="exp2_e_n_int")

    n_is_even = Equal(Modulo(n, 2), 0, tag="n_is_even",
                      debug=debug_multi)
    n_is_odd = LogicalNot(n_is_even, tag="n_is_odd")
    result_sign = Select(
        n_is_odd, CopySign(vx, Constant(1.0, precision=self.precision)),
        1)

    # managing n == -1
    if self.expand_div:
        ml_division_args = ML_Division.get_default_args(
            precision=self.precision,
            input_formats=[self.precision] * 2)
        ml_division = ML_Division(ml_division_args)
        self.division_implementation = ml_division.implementation
        self.division_implementation.set_scheme(
            ml_division.generate_scheme())
        ml_division_fct = \
            self.division_implementation.get_function_object()
    else:
        ml_division_fct = Division

    # manage n=1 separately to avoid catastrophic propagation of errors
    # between log2 and exp2 when eventually computing the identity
    # function (test-case #3)
    result = ConditionBlock(
        LogicalOr(
            LogicalOr(Test(vx, specifier=Test.IsNaN), Equal(n, 0)),
            LogicalAnd(n_is_even, vx < 0)),
        Return(FP_QNaN(self.precision)),
        Statement(
            ConditionBlock(
                Equal(n, -1, tag="n_is_mone"),
                #Return(Division(Constant(1, precision=self.precision),
                #                unmodified_vx, tag="div_res",
                #                precision=self.precision)),
                Return(
                    ml_division_fct(Constant(1,
                                             precision=self.precision),
                                    unmodified_vx, tag="div_res",
                                    precision=self.precision)),
            ),
            # rootn(±∞, n) is ±0 for n < 0 and ±∞ for n > 0
            ConditionBlock(
                Test(vx, specifier=Test.IsInfty),
                Statement(
                    ConditionBlock(
                        n < 0,
                        #LogicalAnd(n_is_odd, n < 0),
                        Return(
                            Select(
                                Test(vx,
                                     specifier=Test.IsPositiveInfty),
                                Constant(FP_PlusZero(self.precision),
                                         precision=self.precision),
                                Constant(FP_MinusZero(self.precision),
                                         precision=self.precision),
                                precision=self.precision)),
                        Return(vx),
                    ),
                ),
            ),
            # rootn(±0, n) is ±∞ for odd n < 0
            ConditionBlock(
                LogicalAnd(LogicalAnd(n_is_odd, n < 0), Equal(vx, 0),
                           tag="n_is_odd_and_neg"),
                Return(
                    Select(Test(vx, specifier=Test.IsPositiveZero),
                           Constant(FP_PlusInfty(self.precision),
                                    precision=self.precision),
                           Constant(FP_MinusInfty(self.precision),
                                    precision=self.precision),
                           precision=self.precision)),
            ),
            # rootn(±0, n) is +∞ for even n < 0
            ConditionBlock(
                LogicalAnd(LogicalAnd(n_is_even, n < 0), Equal(vx, 0)),
                Return(FP_PlusInfty(self.precision))),
            # rootn(±0, n) is +0 for even n > 0
            ConditionBlock(
                LogicalAnd(n_is_even, Equal(vx, 0)),
                Return(vx)),
            ConditionBlock(
                Equal(n, 1),
                Return(unmodified_vx),
                Return(result_sign * exp2_r * exp2_e_n_int *
                       exp2_e_n_frac))))
    return result
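# Plain-Python check of the decomposition used above (illustrative
# sketch): with x = m * 2**e, rootn(x, n) = 2**((log2(m) + e)/n), which
# the scheme splits as 2**(log2(m)/n) * 2**(int part of e/n) *
# 2**(frac part of e/n). math.frexp yields m in [0.5, 1) rather than the
# [1, 2[ convention above; the identity is unaffected.
import math

def ref_rootn(x, n):
    assert x > 0 and n != 0
    m, e = math.frexp(x)            # x = m * 2**e
    r = math.log2(m) / n
    e_n = e / n
    e_n_int = round(e_n)
    e_n_frac = e_n - e_n_int
    return 2.0**r * math.ldexp(1.0, e_n_int) * 2.0**e_n_frac

# e.g. ref_rootn(8.0, 3) -> 2.0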
def generate_scheme(self):
    # declaring target and instantiating optimization engine
    precision_ptr = self.get_input_precision(0)
    index_format = self.get_input_precision(2)

    dst = self.implementation.add_input_variable("dst", precision_ptr)
    src = self.implementation.add_input_variable("src", precision_ptr)
    n = self.implementation.add_input_variable("len", index_format)

    i = Variable("i", precision=index_format, var_type=Variable.Local)
    CU1 = Constant(1, precision=index_format)
    CU0 = Constant(0, precision=index_format)
    inc = i + CU1

    elt_input = TableLoad(src, i, precision=self.precision)

    local_exp = Variable("local_exp", precision=self.precision,
                         var_type=Variable.Local)

    if self.use_libm_function:
        libm_exp_operator = FunctionOperator("expf", arity=1)
        libm_exp = FunctionObject("expf", [ML_Binary32], ML_Binary32,
                                  libm_exp_operator)
        elt_result = ReferenceAssign(local_exp, libm_exp(elt_input))
    else:
        exponential_args = ML_Exponential.get_default_args(
            precision=self.precision,
            libm_compliant=False,
            debug=False,
        )
        meta_exponential = ML_Exponential(exponential_args)
        exponential_scheme = meta_exponential.generate_scheme()
        elt_result = inline_function(
            exponential_scheme,
            local_exp,
            {meta_exponential.implementation.arg_list[0]: elt_input},
        )

    elt_acc = Variable("elt_acc", precision=self.precision,
                       var_type=Variable.Local)

    exp_loop = Loop(
        ReferenceAssign(i, CU0),
        i < n,
        Statement(
            ReferenceAssign(local_exp, 0),
            elt_result,
            TableStore(local_exp, dst, i, precision=ML_Void),
            ReferenceAssign(elt_acc, elt_acc + local_exp),
            ReferenceAssign(i, i + CU1)),
    )

    sum_rcp = Division(1, elt_acc, precision=self.precision,
                       tag="sum_rcp", debug=debug_multi)

    div_loop = Loop(
        ReferenceAssign(i, CU0),
        i < n,
        Statement(
            TableStore(
                Multiplication(
                    TableLoad(dst, i, precision=self.precision),
                    sum_rcp),
                dst, i, precision=ML_Void),
            ReferenceAssign(i, inc)),
    )

    main_scheme = Statement(ReferenceAssign(elt_acc, 0), exp_loop,
                            sum_rcp, div_loop)

    return main_scheme
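# Plain-Python model of the two-pass scheme above (illustrative sketch):
# the first pass stores exp(src[i]) into dst and accumulates the sum, the
# second pass scales dst by the reciprocal of the sum. Note that, like
# the scheme, this does not subtract the maximum before exponentiating,
# so it can overflow for large inputs.
import math

def ref_softmax(src):
    dst = [math.exp(x) for x in src]     # exp_loop
    sum_rcp = 1.0 / sum(dst)             # sum_rcp
    return [y * sum_rcp for y in dst]    # div_loop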
def generate_bench(self, processor, test_num=1000, unroll_factor=10):
    """ generate performance bench for self.op_class """
    initial_inputs = [
        Constant(random.uniform(inf(self.init_interval),
                                sup(self.init_interval)),
                 precision=precision)
        for i, precision in enumerate(self.input_precisions)
    ]
    var_inputs = [
        Variable("var_%d" % i,
                 precision=FormatAttributeWrapper(precision, ["volatile"]),
                 var_type=Variable.Local)
        for i, precision in enumerate(self.input_precisions)
    ]

    printf_timing_op = FunctionOperator(
        "printf",
        arg_map={
            0: "\"%s[%s] %%lld elts computed "\
               "in %%lld cycles =>\\n %%.3f CPE \\n\"" % (
                   self.bench_name,
                   self.output_precision.get_display_format()),
            1: FO_Arg(0),
            2: FO_Arg(1),
            3: FO_Arg(2),
            4: FO_Arg(3)
        },
        void_function=True)
    printf_timing_function = FunctionObject(
        "printf",
        [self.output_precision, ML_Int64, ML_Int64, ML_Binary64],
        ML_Void, printf_timing_op)

    timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local)

    void_function_op = FunctionOperator("(void)", arity=1,
                                        void_function=True)
    void_function = FunctionObject("(void)", [self.output_precision],
                                   ML_Void, void_function_op)

    # initialization of operation inputs
    init_assign = metaop.Statement()
    for var_input, init_value in zip(var_inputs, initial_inputs):
        init_assign.push(ReferenceAssign(var_input, init_value))

    # test loop
    loop_i = Variable("i", precision=ML_Int64, var_type=Variable.Local)
    # integer division so the constant remains an integer iteration count
    test_num_cst = Constant(test_num // unroll_factor, precision=ML_Int64,
                            tag="test_num")

    # goal: build a chain of dependent operations to measure the
    # elementary operation latency
    local_inputs = tuple(var_inputs)
    local_result = self.op_class(*local_inputs,
                                 precision=self.output_precision,
                                 unbreakable=True)
    for i in range(unroll_factor - 1):
        local_inputs = tuple([local_result] + var_inputs[1:])
        local_result = self.op_class(*local_inputs,
                                     precision=self.output_precision,
                                     unbreakable=True)
    # renormalisation
    local_result = self.renorm_function(local_result)

    # variable assignment to build the dependency chain
    var_assign = Statement()
    var_assign.push(ReferenceAssign(var_inputs[0], local_result))
    final_value = var_inputs[0]

    # loop increment value
    loop_increment = 1

    test_loop = Loop(
        ReferenceAssign(loop_i, Constant(0, precision=ML_Int32)),
        loop_i < test_num_cst,
        Statement(var_assign,
                  ReferenceAssign(loop_i, loop_i + loop_increment)),
    )

    # bench scheme
    test_scheme = Statement(
        ReferenceAssign(timer, processor.get_current_timestamp()),
        init_assign,
        test_loop,
        ReferenceAssign(
            timer,
            Subtraction(processor.get_current_timestamp(), timer,
                        precision=ML_Int64)),
        # prevent simplification of the intermediary variable
        void_function(final_value),
        printf_timing_function(
            final_value,
            Constant(test_num, precision=ML_Int64),
            timer,
            Division(Conversion(timer, precision=ML_Binary64),
                     Constant(test_num, precision=ML_Binary64),
                     precision=ML_Binary64))
        # ,Return(Constant(0, precision = ML_Int32))
    )

    return test_scheme
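# Measurement principle used above, as a plain-Python illustration
# (hypothetical helper, not metalibm API): chaining each result into the
# next input serializes the operations, so elapsed_time / op_count
# approximates per-operation latency rather than throughput.
import time

def chained_latency_ns(op, x0, iterations=1000000):
    acc = x0
    start = time.perf_counter_ns()
    for _ in range(iterations):
        acc = op(acc, x0)  # each operation depends on the previous result
    elapsed = time.perf_counter_ns() - start
    return elapsed / iterations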
def get_array_test_wrapper(self, test_num, tested_function,
                           table_size_offset_array, input_tables,
                           output_array, acc_num,
                           post_statement_generator, NUM_INPUT_ARRAY=1):
    """ generate a test loop for multi-array tests
        @param test_num number of elementary array tests to be executed
        @param tested_function FunctionObject to be tested
        @param table_size_offset_array ML_NewTable object containing
               (table-size, offset) pairs for multi-array testing
        @param input_tables list of ML_NewTable objects containing
               multi-array test inputs
        @param output_array ML_NewTable containing multi-array test outputs
        @param acc_num Variable accumulating the number of elements
               processed
        @param post_statement_generator generator used to generate a
               statement executed at the end of the test of one of the
               arrays of the multi-test. It expects 6 arguments:
               (input_tables, output_array, table_size_offset_array,
                array_offset, array_len, test_id)
        @param NUM_INPUT_ARRAY number of input arrays expected by
               tested_function """
    test_id = Variable("test_id", precision=ML_Int32,
                       var_type=Variable.Local)
    test_num_cst = Constant(test_num, precision=ML_Int32, tag="test_num")

    array_len = Variable("len", precision=ML_UInt32,
                         var_type=Variable.Local)

    array_offset = TableLoad(table_size_offset_array, test_id, 1)

    def pointer_add(table_addr, offset):
        pointer_format = table_addr.get_precision_as_pointer_format()
        return Addition(table_addr, offset, precision=pointer_format)

    array_inputs = tuple(
        pointer_add(input_tables[in_id], array_offset)
        for in_id in range(NUM_INPUT_ARRAY))
    function_call = tested_function(
        *((pointer_add(output_array, array_offset), ) + array_inputs
          + (array_len, )))

    post_statement = post_statement_generator(
        input_tables, output_array, table_size_offset_array, array_offset,
        array_len, test_id)

    loop_increment = 1

    test_loop = Loop(
        ReferenceAssign(test_id, Constant(0, precision=ML_Int32)),
        test_id < test_num_cst,
        Statement(
            ReferenceAssign(array_len,
                            TableLoad(table_size_offset_array, test_id, 0)),
            function_call,
            post_statement,
            ReferenceAssign(
                acc_num,
                acc_num + Conversion(array_len,
                                     precision=acc_num.precision)),
            ReferenceAssign(test_id, test_id + loop_increment),
        ),
    )

    test_statement = Statement()

    # adding functional test_loop to test statement
    test_statement.add(test_loop)

    return test_statement
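# Illustration of the (size, offset) packing consumed by the wrapper above:
# row t of the table stores the length of sub-array t and its starting
# offset inside the flat input/output buffers (plain-Python sketch,
# mirroring the OFFSET_VALUES computation used by the bench wrapper below).
def size_offset_rows(sizes):
    offsets = [sum(sizes[:t]) for t in range(len(sizes))]
    return list(zip(sizes, offsets))  # e.g. [4, 7, 3] -> [(4, 0), (7, 4), (3, 11)]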
Log.report(LOG_PASS_INFO, "Registering generate Basic-Blocks pass")
Pass.register(Pass_GenerateBasicBlock)

# registering ssa translation pass
Log.report(LOG_PASS_INFO, "Registering ssa translation pass")
Pass.register(Pass_SSATranslate)

# registering basic-block simplification pass
Log.report(LOG_PASS_INFO, "Registering basic-block simplification pass")
Pass.register(Pass_BBSimplification)


if __name__ == "__main__":
    bb_root = BasicBlock(tag="bb_root")
    bb_1 = BasicBlock(tag="bb_1")
    bb_2 = BasicBlock(tag="bb_2")
    bb_3 = BasicBlock(tag="bb_3")

    var_x = Variable("x", precision=None)
    var_y = Variable("y", precision=None)

    bb_root.add(ReferenceAssign(var_x, 1))
    bb_root.add(ReferenceAssign(var_y, 2))
    bb_root.add(ConditionalBranch(var_x > var_y, bb_1, bb_2))

    bb_1.add(ReferenceAssign(var_x, 2))
    bb_1.add(UnconditionalBranch(bb_3))

    bb_2.add(ReferenceAssign(var_y, 3))
    bb_2.add(UnconditionalBranch(bb_3))

    bb_3.add(ReferenceAssign(var_y, var_x))
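    # Hand-written illustration (not generated output) of what the SSA
    # translation pass is expected to produce for the diamond CFG above:
    # bb_3 must merge the two reaching definitions of x and y with phi
    # nodes, along the lines of
    #   bb_root: x0 = 1 ; y0 = 2 ; br (x0 > y0) bb_1, bb_2
    #   bb_1:    x1 = 2 ;          br bb_3
    #   bb_2:    y1 = 3 ;          br bb_3
    #   bb_3:    x2 = phi(x1, x0) ; y2 = phi(y0, y1) ; y3 = x2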
def generate_scheme(self):
    # declaring the I/O pointers and the length argument
    precision_ptr = self.get_input_precision(0)
    index_format = self.get_input_precision(2)
    multi_elt_num = self.multi_elt_num

    dst = self.implementation.add_input_variable("dst", precision_ptr)
    src = self.implementation.add_input_variable("src", precision_ptr)
    n = self.implementation.add_input_variable("len", index_format)

    i = Variable("i", precision=index_format, var_type=Variable.Local)
    CU0 = Constant(0, precision=index_format)

    element_format = self.precision

    self.function_list = []

    if multi_elt_num > 1:
        element_format = VECTOR_TYPE_MAP[self.precision][multi_elt_num]

    elt_input = TableLoad(src, i, precision=element_format)

    local_exp = Variable("local_exp", precision=element_format,
                         var_type=Variable.Local)

    if self.use_libm_function:
        libm_fct_operator = FunctionOperator(self.use_libm_function,
                                             arity=1)
        libm_fct = FunctionObject(self.use_libm_function, [ML_Binary32],
                                  ML_Binary32, libm_fct_operator)

        if multi_elt_num > 1:
            result_list = [
                libm_fct(
                    VectorElementSelection(
                        elt_input,
                        Constant(elt_id, precision=ML_Integer),
                        precision=self.precision))
                for elt_id in range(multi_elt_num)
            ]
            result = VectorAssembling(*result_list,
                                      precision=element_format)
        else:
            result = libm_fct(elt_input)
        elt_result = ReferenceAssign(local_exp, result)
    else:
        if multi_elt_num > 1:
            scalar_result = Variable("scalar_result",
                                     precision=self.precision,
                                     var_type=Variable.Local)
            fct_ctor_args = self.function_ctor.get_default_args(
                precision=self.precision,
                libm_compliant=False,
            )

            meta_function = self.function_ctor(fct_ctor_args)
            exponential_scheme = meta_function.generate_scheme()

            # instantiating the passes required for typing
            pass_inst_abstract_prec = PassInstantiateAbstractPrecision(
                self.processor)
            pass_inst_prec = PassInstantiatePrecision(
                self.processor, default_precision=None)

            # executing format instantiation passes on the optree
            exponential_scheme = pass_inst_abstract_prec.execute_on_optree(
                exponential_scheme)
            exponential_scheme = pass_inst_prec.execute_on_optree(
                exponential_scheme)

            vectorizer = StaticVectorizer()

            # extracting the scalar argument from the meta function
            scalar_input = meta_function.implementation.arg_list[0]

            # vectorizing the scalar scheme
            vector_result, vec_arg_list, vector_scheme, scalar_callback, scalar_callback_fct = vectorize_function_scheme(
                vectorizer, self.get_main_code_object(),
                exponential_scheme, element_format.get_scalar_format(),
                [scalar_input], multi_elt_num)

            elt_result = inline_function(vector_scheme, vector_result,
                                         {vec_arg_list[0]: elt_input})

            local_exp = vector_result

            self.function_list.append(scalar_callback_fct)
            libm_fct = scalar_callback
        else:
            scalar_input = elt_input
            scalar_result = local_exp

            elt_result = generate_inline_fct_scheme(
                self.function_ctor, scalar_result, [scalar_input],
                {"precision": self.precision, "libm_compliant": False})

    CU1 = Constant(1, precision=index_format)

    local_exp_init_value = Constant(0, precision=self.precision)

    if multi_elt_num > 1:
        local_exp_init_value = Constant([0] * multi_elt_num,
                                        precision=element_format)
        remain_n = Modulo(n, multi_elt_num, precision=index_format)
        iter_n = n - remain_n
        CU_ELTNUM = Constant(multi_elt_num, precision=index_format)
        inc = i + CU_ELTNUM
    else:
        remain_n = None
        iter_n = n
        inc = i + CU1

    # main loop processing multi_elt_num element(s) per iteration
    main_loop = Loop(
        ReferenceAssign(i, CU0),
        i < iter_n,
        Statement(
            ReferenceAssign(local_exp, local_exp_init_value),
            elt_result,
            TableStore(local_exp, dst, i, precision=ML_Void),
            ReferenceAssign(i, inc)),
    )

    # epilog to process the remaining items (when the length is not a
    # multiple of multi_elt_num)
    if remain_n is not None:
        # TODO/FIXME: try an alternative method for processing the epilog
        # by using the full vector length and a mask
        epilog_loop = Loop(
            Statement(),
            i < n,
            Statement(
                TableStore(
                    libm_fct(TableLoad(src, i, precision=self.precision)),
                    dst, i, precision=ML_Void),
                ReferenceAssign(i, i + CU1),
            ))
        main_loop = Statement(main_loop, epilog_loop)

    return main_loop
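# The structure generated above is the classic vector main-loop plus scalar
# epilog pattern; as a plain-Python illustration (sketch only, not part of
# the generated code):
def apply_elementwise(dst, src, n, fct, vec_len):
    i = 0
    iter_n = n - (n % vec_len)
    while i < iter_n:                 # main loop: vec_len elements at a time
        dst[i:i + vec_len] = [fct(x) for x in src[i:i + vec_len]]
        i += vec_len
    while i < n:                      # epilog: leftover elements one by one
        dst[i] = fct(src[i])
        i += 1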
def generate_bench_wrapper(self, test_num=1, loop_num=100000,
                           test_ranges=[Interval(-1.0, 1.0)],
                           debug=False):
    # interval from which the array length is chosen (randomly)
    index_range = self.test_index_range

    auto_test = CodeFunction("bench_wrapper", output_format=ML_Binary64)

    tested_function = self.implementation.get_function_object()
    function_name = self.implementation.get_name()

    failure_report_op = FunctionOperator("report_failure")
    failure_report_function = FunctionObject("report_failure", [], ML_Void,
                                             failure_report_op)

    printf_success_op = FunctionOperator(
        "printf",
        arg_map={0: "\"test successful %s\\n\"" % function_name},
        void_function=True)
    printf_success_function = FunctionObject("printf", [], ML_Void,
                                             printf_success_op)

    output_precision = FormatAttributeWrapper(self.precision, ["volatile"])

    test_total = test_num

    # number of arrays expected as inputs for tested_function
    NUM_INPUT_ARRAY = 1
    # position of the input array in tested_function operands (generally
    # equal to 1, as the 0-th input is often the destination array)
    INPUT_INDEX_OFFSET = 1

    # concatenating the standard test arrays before the randomly
    # generated ones
    TABLE_SIZE_VALUES = [
        len(std_table) for std_table in self.standard_test_cases
    ] + [
        random.randrange(index_range[0], index_range[1] + 1)
        for i in range(test_num)
    ]
    OFFSET_VALUES = [sum(TABLE_SIZE_VALUES[:i]) for i in range(test_total)]

    table_size_offset_array = generate_2d_table(
        test_total, 2, ML_UInt32,
        self.uniquify_name("table_size_array"),
        value_gen=(lambda row_id: (TABLE_SIZE_VALUES[row_id],
                                   OFFSET_VALUES[row_id])))

    INPUT_ARRAY_SIZE = sum(TABLE_SIZE_VALUES)

    # TODO/FIXME: implement proper input range depending on input index
    # assuming a single input array
    input_precisions = [self.get_input_precision(1).get_data_precision()]
    rng_map = [
        get_precision_rng(precision, inf(test_range), sup(test_range))
        for precision, test_range in zip(input_precisions, test_ranges)
    ]

    # generated table of inputs
    # NOTE: table_id is bound as a default argument so each lambda captures
    # the current loop value rather than the last one
    input_tables = [
        generate_1d_table(
            INPUT_ARRAY_SIZE,
            self.get_input_precision(
                INPUT_INDEX_OFFSET + table_id).get_data_precision(),
            self.uniquify_name("input_table_arg%d" % table_id),
            value_gen=(
                lambda _, table_id=table_id:
                    input_precisions[table_id].round_sollya_object(
                        rng_map[table_id].get_new_value(), sollya.RN)))
        for table_id in range(NUM_INPUT_ARRAY)
    ]

    # generate output_array
    output_array = generate_1d_table(
        INPUT_ARRAY_SIZE,
        output_precision,
        self.uniquify_name("output_array"),
        #value_gen=(lambda _: FP_QNaN(self.precision))
        value_gen=(lambda _: None),
        const=False,
        empty=True)

    # accumulated element count
    acc_num = Variable("acc_num", precision=ML_Int64,
                       var_type=Variable.Local)

    def empty_post_statement_gen(input_tables, output_array,
                                 table_size_offset_array, array_offset,
                                 array_len, test_id):
        return Statement()

    test_loop = self.get_array_test_wrapper(test_total, tested_function,
                                            table_size_offset_array,
                                            input_tables, output_array,
                                            acc_num,
                                            empty_post_statement_gen)

    timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local)
    printf_timing_op = FunctionOperator(
        "printf",
        arg_map={
            0: "\"%s %%\"PRIi64\" elts computed in %%\"PRIi64\" "
               "nanoseconds => %%.3f CPE \\n\"" % function_name,
            1: FO_Arg(0),
            2: FO_Arg(1),
            3: FO_Arg(2)
        },
        void_function=True)
    printf_timing_function = FunctionObject(
        "printf", [ML_Int64, ML_Int64, ML_Binary64], ML_Void,
        printf_timing_op)

    vj = Variable("j", precision=ML_Int32, var_type=Variable.Local)
    loop_num_cst = Constant(loop_num, precision=ML_Int32, tag="loop_num")
    loop_increment = 1

    # bench measure of clocks per element
    cpe_measure = Division(
        Conversion(timer, precision=ML_Binary64),
        Conversion(acc_num, precision=ML_Binary64),
        precision=ML_Binary64,
        tag="cpe_measure",
    )

    # common test scheme between scalar and vector functions
    test_scheme = Statement(
        self.processor.get_init_timestamp(),
        ReferenceAssign(timer, self.processor.get_current_timestamp()),
        ReferenceAssign(acc_num, 0),
        Loop(
            ReferenceAssign(vj, Constant(0, precision=ML_Int32)),
            vj < loop_num_cst,
            Statement(test_loop,
                      ReferenceAssign(vj, vj + loop_increment))),
        ReferenceAssign(
            timer,
            Subtraction(self.processor.get_current_timestamp(), timer,
                        precision=ML_Int64)),
        printf_timing_function(
            Conversion(acc_num, precision=ML_Int64),
            timer,
            cpe_measure,
        ),
        Return(cpe_measure),
        # Return(Constant(0, precision = ML_Int32))
    )

    auto_test.set_scheme(test_scheme)
    return FunctionGroup([auto_test])
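# Cycles-per-element (CPE) as reported above, in scalar terms (illustrative
# helper, not metalibm API): the timestamp delta is divided by the total
# number of elements processed across every benchmark iteration.
def cycles_per_element(elapsed_ticks, elements_processed):
    return elapsed_ticks / elements_processed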
def generate_scheme(self):
    # declaring function input variable
    vx = self.implementation.add_input_variable("x", ML_Binary32)
    # declaring specific interval for input variable <x>
    vx.set_interval(Interval(-1, 1))

    # declaring free Variable y
    vy = Variable("y", precision=ML_Exact)

    # declaring expression with vx variable
    expr = vx * vx - vx * 2
    # declaring second expression with vx variable
    expr2 = vx * vx - vx

    # optimizing expressions (defining every unknown precision as the
    # default one + some optimizations such as FMA merging if enabled)
    opt_expr = self.optimise_scheme(expr)
    opt_expr2 = self.optimise_scheme(expr2)

    # setting specific tag names for the optimized expressions (to be
    # extracted from the gappa script)
    opt_expr.set_tag("goal")
    opt_expr2.set_tag("new_goal")

    # defining default goal for gappa execution
    gappa_goal = opt_expr

    # declaring EXACT expression to be used as hint in Gappa's script
    annotation = self.opt_engine.exactify(vy * (1 / vy))

    # the dict var_bound is used to limit the DAG part to be explored when
    # generating the gappa script; each (key, value) pair indicates a node
    # to stop at (<key>) and a node to replace it with during the
    # generation (<value>, which must be a Variable instance with a
    # defined interval).
    # vx.get_handle().get_node() is used to retrieve the node instantiating
    # the abstract node <vx> after the call to self.optimise_scheme
    var_bound = {
        vx.get_handle().get_node(): Variable("x",
                                             precision=ML_Binary32,
                                             interval=vx.get_interval())
    }
    # generating gappa code to determine the interval for <opt_expr>
    # NOTE: var_bound keys must be converted from an iterator to a list to
    # avoid implicit modification by get_interval_code
    gappa_code = self.gappa_engine.get_interval_code(
        [opt_expr], list(var_bound.keys()), var_bound)

    # add a manual hint to the gappa code
    # which states that vy * (1 / vy) -> 1 { vy <> 0 };
    self.gappa_engine.add_hint(
        gappa_code, annotation, Constant(1, precision=ML_Exact),
        Comparison(vy, Constant(0, precision=ML_Integer),
                   specifier=Comparison.NotEqual, precision=ML_Bool))

    # adding the expression <opt_expr2> as an extra goal in the gappa
    # script
    self.gappa_engine.add_goal(gappa_code, opt_expr2)

    # executing gappa on the script generated from <gappa_code>,
    # extracting the results and storing them into <gappa_result>,
    # a dict indexed by the goals' tags
    if is_gappa_installed():
        gappa_result = execute_gappa_script_extract(
            gappa_code.get(self.gappa_engine))
        Log.report(Log.Info, "eval error: ", gappa_result["new_goal"])
    else:
        Log.report(
            Log.Warning,
            "gappa is not installed: unable to run "
            "execute_gappa_script_extract")

    # dummy scheme to make code generation functional
    scheme = Statement(Return(vx))

    return scheme
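# For reference, the hint added above corresponds to a Gappa script line of
# the following shape (sketch; the actual identifiers are generated by the
# gappa_engine, and the goal names come from the tags set above):
#   vy * (1 / vy) -> 1 { vy <> 0 };   # rewriting hint, valid when vy is nonzero
# with "goal" and "new_goal" appearing as interval goals of the form
#   { x in [-1, 1] -> goal in ? }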