def round_to_odd(v, p):
    """ round the SollyaObject @p v to precision @p p
        using rounding-to-odd mode

        :param v: value to round
        :type v: SollyaObject (numerical value)
        :param p: target precision (sollya.binary32, sollya.binary64 or
                  a number of bits)
        :type p: SollyaObject (precision)
        :return: rounded value (using round-to-odd)
        :rtype: SollyaObject
    """
    r_up = sollya.round(v, p, sollya.RU)
    r_down = sollya.round(v, p, sollya.RD)
    # determining parity of r_up:
    # if r_up can be expressed exactly on (p-1) bits it means
    # r_up's LSB is 0 => even mantissa
    if p == sollya.binary64:
        pm1 = 52
    elif p == sollya.binary32:
        # this used to be the comparison `pm1 == 23`, which left pm1
        # unbound and made every binary32 call fail
        pm1 = 23
    else:
        pm1 = p - 1
    r_up_even = (r_up == sollya.round(v, pm1, sollya.RU))
    # when the upward rounding is even, the downward one is returned instead
    return r_down if r_up_even else r_up
def dirty_multi_node_expand(node, precision, mem_map=None, fma=True):
    """ Dirty expand node into Hi and Lo part, storing
        already processed temporary values in mem_map

        :param node: Constant, Addition or Multiplication node to expand
        :param precision: format used for each of the hi and lo parts
        :param mem_map: memoization dict (node -> result); it is updated
                        in place, a fresh dict is used when None
        :param fma: forwarded to the Mul2xx meta-macros
        :return: for a Constant, a pair (hi, lo) where lo is None when the
                 low part is zero; otherwise whatever the selected
                 Add2xx / Mul2xx meta-macro returns
    """
    # `mem_map or {}` would silently replace an *empty* dict supplied by the
    # caller, so that the caller's map would never be filled
    if mem_map is None:
        mem_map = {}

    if node in mem_map:
        return mem_map[node]

    if isinstance(node, Constant):
        value = node.get_value()
        value_hi = sollya.round(value, precision.sollya_object, sollya.RN)
        value_lo = sollya.round(value - value_hi, precision.sollya_object, sollya.RN)
        ch = Constant(value_hi, tag=node.get_tag() + "hi", precision=precision)
        if value_lo != 0:
            cl = Constant(value_lo, tag=node.get_tag() + "lo", precision=precision)
        else:
            # the constant fits exactly in a single word
            cl = None
            Log.report(Log.Info, "simplified constant")
        result = ch, cl
        mem_map[node] = result
        return result

    # Case of Addition or Multiplication nodes:
    # 1. retrieve inputs
    # 2. dirty convert inputs recursively
    # 3. forward to the right metamacro
    assert isinstance(node, (Addition, Multiplication))
    lhs = node.get_input(0)
    rhs = node.get_input(1)
    op1h, op1l = dirty_multi_node_expand(lhs, precision, mem_map, fma)
    op2h, op2l = dirty_multi_node_expand(rhs, precision, mem_map, fma)
    if isinstance(node, Addition):
        if op1l is not None and op2l is not None:
            result = Add222(op1h, op1l, op2h, op2l)
        elif op1l is None and op2l is not None:
            result = Add212(op1h, op2h, op2l)
        elif op2l is None and op1l is not None:
            # single-word operand always goes first
            result = Add212(op2h, op1h, op1l)
        else:
            result = Add211(op1h, op2h)
        mem_map[node] = result
        return result
    elif isinstance(node, Multiplication):
        if op1l is not None and op2l is not None:
            result = Mul222(op1h, op1l, op2h, op2l, fma=fma)
        elif op1l is None and op2l is not None:
            result = Mul212(op1h, op2h, op2l, fma=fma)
        elif op2l is None and op1l is not None:
            # single-word operand always goes first
            result = Mul212(op2h, op1h, op1l, fma=fma)
        else:
            result = Mul211(op1h, op2h, fma=fma)
        mem_map[node] = result
        return result
def computeConstantFormat(self, c):
    """ Determine a format able to hold the constant c

        c is decomposed into successive binary64 limbs (each limb is the
        round-to-nearest of what remains), at most self.LIMB_NUM_THRESHOLD
        of them, counting 53 bits of accuracy per limb.

        :param c: constant value
        :return: result of self.get_format_from_accuracy for the
                 accuracy / error target derived from the decomposition
    """
    if c == 0:
        # default to double precision
        return self.get_format_from_accuracy(53, eps_target=0, interval=Interval(c), exact=True)

    accuracy = 0
    cN = c  # part of c not yet covered by a limb
    limb_num = 0
    while cN != 0 and limb_num < self.LIMB_NUM_THRESHOLD:
        cR = sollya.round(cN, sollya.binary64, sollya.RN)
        cN = cN - cR
        accuracy += 53
        limb_num += 1
    if accuracy > 159 or limb_num > self.LIMB_NUM_THRESHOLD:
        # was `S2**--accuracy`: the doubled minus sign cancelled out and
        # produced 2**accuracy instead of the 2**-accuracy used below
        eps_target = S2**-accuracy
        accuracy = 159
    else:
        # exact representation when nothing remains after the last limb
        eps_target = 0 if cN == 0 else S2**-accuracy
    return self.get_format_from_accuracy(accuracy, eps_target=eps_target, interval=Interval(c), exact=True)
def numeric_emulate(self, io_map):
    """ Numeric emulation: x + y rounded to nearest in the base format
        of self.precision

        :param io_map: dict holding the input values under "x" and "y"
        :return: dict with the expected value under "vr_out"
    """
    lhs = io_map["x"]
    rhs = io_map["y"]
    target_format = self.precision.get_base_format().get_sollya_object()
    return {"vr_out": sollya.round(lhs + rhs, target_format, sollya.RN)}
def numeric_emulate(self, io_map):
    """ Numeric emulation: 1 / x rounded to nearest in self.precision

        :param io_map: dict holding the input value under "x"
        :return: dict with the expected value under "vr_out"
    """
    reciprocal = 1.0 / io_map["x"]
    return {
        "vr_out": sollya.round(reciprocal, self.precision.get_sollya_object(), sollya.RN)
    }
def get_integer_coding(self, value, language=C_Code):
    """ Return the integer whose bit pattern encodes value in this format

        The result packs, from least to most significant bits: the
        mantissa field, the biased exponent and the sign bit.

        :param value: numerical value or special value to encode
        :param language: forwarded to get_special_value_coding
        :return: integer coding of value
    """
    # special values and infinities are delegated to the dedicated coding
    if FP_SpecialValue.is_special_value(value):
        return self.get_special_value_coding(value, language)
    if value == ml_infty:
        return self.get_special_value_coding(FP_PlusInfty(self), language)
    if value == -ml_infty:
        return self.get_special_value_coding(FP_MinusInfty(self), language)

    rounded = sollya.round(value, self.get_sollya_object(), sollya.RN)
    # FIXME: managing negative zero
    sign = int(1 if rounded < 0 else 0)
    magnitude = abs(rounded)
    if magnitude == 0.0:
        Log.report(Log.Warning, "+0.0 forced during get_integer_coding conversion")
        exp_biased = 0
        mant = 0
    else:
        exp = int(sollya.floor(sollya.log2(magnitude)))
        exp_biased = int(exp - self.get_bias())
        if exp < self.get_emin_normal():
            # below the normal range: exponent field is zero and the
            # mantissa counts multiples of 2**emin_subnormal
            exp_biased = 0
            mant = int((magnitude / S2**self.get_emin_subnormal()))
        else:
            # normal number: drop the implicit leading one and scale the
            # fraction to the width of the mantissa field
            mant = int((magnitude / S2**exp - 1.0) * (S2**self.get_field_size()))

    exponent_shift = self.get_field_size()
    sign_shift = self.get_field_size() + self.get_exponent_size()
    return mant | (exp_biased << exponent_shift) | (sign << sign_shift)
def sollyaConstraint(self, margin):
    """
    Build the constraint describing this band for Sollya.

    Parameter:
    - margin: margin we add to the band (not in dB)
      margin is negative when the band is reduced

    Returns a dictionary for sollya checkModulusFilterInSpecification,
    with the keys "Omega", "omegaFactor", "betaInf" and "betaSup"
    """
    # deal with Sollya
    Gabarit.readyToRunWithSollya()

    def to_linear(gain):
        # gain -> linear factor 10^(gain/20), computed with Sollya
        return 10 ** (sollya.SollyaObject(gain) / 20)

    # band edges, normalized by Fs/2
    omega_low = 2*sollya.SollyaObject(self._F1)/self._Fs
    if self._F2:
        omega_high = 2*sollya.SollyaObject(self._F2)/self._Fs
    else:
        # F2==None -> F2=Fs/2, so the upper edge is 1
        omega_high = 1

    margin = sollya.SollyaObject(margin)
    if self.isPassBand:
        # pass band
        lower_bound = to_linear(self._passGains[0]) - margin
        upper_bound = to_linear(self._passGains[1]) + margin
    else:
        # stop band
        lower_bound = 0
        upper_bound = to_linear(self._stopGain) + margin
    assert(lower_bound < upper_bound)

    # the lower bound is rounded up and the upper bound rounded down
    return {
        "Omega": sollya.Interval(omega_low, omega_high),
        "omegaFactor": sollya.pi,
        "betaInf": sollya.round(lower_bound, 53, sollya.RU),
        "betaSup": sollya.round(upper_bound, 53, sollya.RD),
    }
def numeric_emulate(self, io_map):
    """ Numeric emulation: x + y rounded to nearest in self.precision

        Also prints the operands and their classes as a debug trace.

        :param io_map: dict holding the input values under "x" and "y"
        :return: dict with the expected value under "vr_out"
    """
    vx = io_map["x"]
    vy = io_map["y"]
    # The trace used Python 2 print statements (a SyntaxError on Python 3).
    # Single-argument print(...) calls emit the same text on both versions.
    print("vx, vy")
    print("%s %s" % (vx, vx.__class__))
    print("%s %s" % (vy, vy.__class__))
    result = {}
    result["vr_out"] = sollya.round(vx + vy, self.precision.get_sollya_object(), sollya.RN)
    return result
def numeric_emulate(self, io_map):
    """ Numeric emulation: x * y + z with a single final rounding
        (to nearest) in self.precision

        :param io_map: dict holding the input values under "x", "y", "z"
        :return: dict with the expected value under "vr_out"
    """
    vx, vy, vz = (io_map[name] for name in ("x", "y", "z"))
    exact = vx * vy + vz
    return {
        "vr_out": sollya.round(exact, self.precision.get_sollya_object(), sollya.RN)
    }
def get_cst(self, cst_value, language=C_Code):
    """ Build the C initializer of a multi-field constant

        cst_value is split field by field: each field receives the
        round-to-nearest (in the field's format) of what the previous
        fields have not represented yet.

        :param cst_value: numerical value of the constant
        :param language: unused by this implementation
        :return: initializer string such as "{.f0 = ..., .f1 = ...}",
                 one entry per (c_field_list, field_format_list) pair
        :rtype: str
    """
    remainder = cst_value
    field_str_list = []
    for field_name, field_format in zip(self.c_field_list, self.field_format_list):
        # FIXME, round is only valid for double_double or triple_double stype format
        field_value = sollya.round(remainder, field_format.sollya_object, RN)
        # Accumulate the residual. Recomputing it as `cst_value - field_value`
        # is only right for the second field and corrupts any later field.
        remainder = remainder - field_value
        field_str_list.append(".%s = %s" % (field_name, field_format.get_c_cst(field_value)))
    return "{%s}" % (", ".join(field_str_list))
def numeric_emulate(self, x):
    """ numeric emulation: 1 / sqrt(x) rounded to nearest with
        sollya.round's precision argument set to 9 """
    # An earlier, disabled variant emulated a table look-up instead: it
    # extracted the mantissa of |x|, derived an index from
    # floor((mantissa - 1.0) * 2**8) and rounded
    # 1 / sqrt(1.0 + index * 2**-8) the same way.
    inverse_sqrt = 1 / sollya.sqrt(x)
    return sollya.round(inverse_sqrt, 9, sollya.RN)
def random_fp_value(emin=-126, emax=127, precision=sollya.binary32):
    """ Dummy generation of a random floating-point value
        (binary32/fp32 with the default arguments)

        :param emin: smallest exponent that can be drawn
        :param emax: largest exponent that can be drawn
        :param precision: sollya format the value is rounded to
        :return: random number rounded to nearest in @p precision
        :rtype: SollyaObject
    """
    # FIXME: almost real, python's random.random() returns a double in
    #        [0, 1), which is not really a good way to generate a
    #        proper mantissa
    # the three draws are made in this order: sign, exponent, significand
    sign = random.randrange(-1, 2, 2)  # either -1 or +1
    scale = S2**random.randrange(emin, emax + 1)
    significand = 1.0 + random.random()
    real_value = sign * scale * significand
    return sollya.round(real_value, precision, sollya.RN)
def generate_log_table(self, log_f, inv_approx_table):
    """ generate 2 tables:
        log_table[i] = 2-word unevaluated sum approximation of
                       log_f(inv_approx_table[i])
        log_table_tho[i] = 2-word unevaluated sum approximation of
                           log_f(2*inv_approx_table[i])

        Row 0 of both tables is left at (0.0, 0.0).

        :param log_f: function evaluated on each table entry
        :param inv_approx_table: table of values, exposing index_size
        :return: (log_table, log_table_tho, range of the filled indexes)
    """
    sollya_precision = self.get_input_precision().get_sollya_object()

    # table creation
    table_index_size = inv_approx_table.index_size
    table_index_range = range(1, 2**table_index_size)

    def new_table():
        return ML_NewTable(dimensions=[2**table_index_size, 2],
                           storage_precision=self.precision,
                           const=True)

    log_table = new_table()
    log_table_tho = new_table()
    for table in (log_table, log_table_tho):
        table[0][0] = 0.0
        table[0][1] = 0.0

    # number of bits kept in the high word
    hi_size = self.precision.get_field_size() - (
        self.precision.get_exponent_size() + 1)

    def split_hi_lo(argument):
        # high word on hi_size bits, low word = rounded remainder
        high = round(log_f(argument), hi_size, sollya.RN)
        low = round(log_f(argument) - high, sollya_precision, sollya.RN)
        return high, low

    for i in table_index_range:
        log_table[i][0], log_table[i][1] = split_hi_lo(inv_approx_table[i])
        log_table_tho[i][0], log_table_tho[i][1] = split_hi_lo(
            S2 * inv_approx_table[i])

    return log_table, log_table_tho, table_index_range
def findMinimumMargin(self, tf, initMargin=0):
    """ Enlarge the margin until self.check_dTF(tf, margin) passes

        At most 25 checks are performed; after each failed check the
        margin is increased by the largest reported issue (rounded up
        on 53 bits).

        :param tf: object handed to self.check_dTF
        :param initMargin: starting margin
        :return: last margin tried (the one that passed, unless the
                 iteration limit was reached)
    """
    Gabarit.readyToRunWithSollya()
    margin = sollya.round(initMargin, 53, sollya.RU)
    deltaMargin = -infty  # sentinel: no previous increment yet

    for _ in range(25):
        # check if margin
        gPass, res = self.check_dTF(tf, margin=margin)
        if gPass:
            break

        previousDelta = deltaMargin
        # find the maximum margin we should apply, according to the results
        deltaMargin = findMaxIssue(res)
        if deltaMargin == 0:
            # force a small non-zero step
            deltaMargin += 10**(1e-3 + self.maxGain()/20) - 10**(self.maxGain()/20)

        # the increment is expected to shrink from one iteration to the
        # next (except on the first iteration or with a zero margin)
        notDecreasing = (previousDelta <= deltaMargin
                         and previousDelta != -infty
                         and margin != 0)
        if notDecreasing:
            print(("deltaMargin does not decrease:\n old=%s\n new=%s") % (previousDelta, deltaMargin))

        # increase the margin
        margin = sollya.round(deltaMargin+margin, 53, sollya.RU)

    return margin
def numeric_emulate(self, io_map):
    """ Numeric emulation: self.function(x) rounded with the rounding
        mode selected by the "rnd_mode" input

        :param io_map: dict holding "x" and "rnd_mode"
                       (0: RN, 1: RU, 2: RD, 3: RZ)
        :return: dict with the expected value under "vr_out"
    """
    rounding_modes = {
        0: sollya.RN,
        1: sollya.RU,
        2: sollya.RD,
        3: sollya.RZ,
    }
    vx = io_map["x"]
    rnd_mode = rounding_modes[io_map["rnd_mode"]]
    exact = self.function(vx)
    return {"vr_out": sollya.round(exact, self.precision.get_frac_size(), rnd_mode)}
def simulate_rounded(self, u, prec=53):
    """ Simulate the system on the inputs u with rounded state and output

        For each column u_i of u:
            x(k+1) = A x(k) + B u_i   (each component rounded to prec)
            y(k)   = C x(k) + D u_i   (rounded to prec)

        :param u: inputs, one column per simulation step
                  (suppose u is a mpmath matrix)
        :param prec: precision handed to sollya.round
        :return: rounded outputs, one entry per step; only element [0][0]
                 of each converted output column is stored
    """
    # NOTE(review): the rounded state goes through float(), so it looks
    # like prec > 53 cannot be honoured for the state -- confirm
    xk = mpmath.mp.zeros(self._n, 1)
    nSimulations = u.shape[1]
    yk = mpmath.mp.zeros(self._p, nSimulations)
    yk_sollya = mpf_matrix_to_sollya(yk)[0]
    for i in range(nSimulations):
        xkp1 = mpf_matrix_fmul(self._A, xk)
        xkp1 = mpf_matrix_fadd(xkp1, mpf_matrix_fmul(self._B, u[:, i]))
        yk[:, i] = mpf_matrix_fmul(self._C, xk)
        yk[:, i] = mpf_matrix_fadd(yk[:, i], mpf_matrix_fmul(self._D, u[:, i]))
        # convert the new state once, not once per component
        xkp1_sollya = mpf_matrix_to_sollya(xkp1)[0]
        xk = mpmath.matrix([
            float(sollya.round(xkp1_sollya[j], prec, sollya.RN))
            for j in range(self._n)
        ])
        yk_sollya[i] = sollya.round(mpf_matrix_to_sollya(yk[:, i])[0][0], prec, sollya.RN)
    return yk_sollya
def roundConstant(self, c, epsTarget):
    """ Round the coefficient c so that the relative rounding error
        does not exceed epsTarget.

        c is approximated by a sum of binary64 limbs, a new limb being
        added while the relative error is too large and fewer than
        self.LIMB_NUM_THRESHOLD limbs are used.

        epsTarget is a positive OR ZERO number. If epsTarget is zero the
        rounding only succeeds when the limbs represent c exactly.

        The function does not return the retained format.

        :return: the rounded constant (c itself when c is zero), or None
                 when epsTarget is negative or the target cannot be met
    """
    if not (epsTarget >= 0):
        return None
    if c == 0:
        # 0 value can always fit exactly in a format
        return c

    rounded = sollya.round(c, sollya.binary64, sollya.RN)
    limb_num = 1
    while abs(rounded / c - 1) > epsTarget and limb_num < self.LIMB_NUM_THRESHOLD:
        # next limb: rounding of what is still missing
        rounded += sollya.round(c - rounded, sollya.binary64, sollya.RN)
        limb_num += 1

    too_many_limbs = limb_num > self.LIMB_NUM_THRESHOLD
    if too_many_limbs or abs(rounded / c - 1) > epsTarget:
        return None
    return rounded
def dp4_cr(a_v, b_v):
    """ Correctly rounded 4D dot product

        The element-wise products are sorted in increasing order and
        accumulated with round-to-odd on 53 bits, the sum is finally
        rounded to nearest in binary32.

        :param a_v: left-hand-side vector
        :type a_v: list(SollyaObject)
        :param b_v: right-hand-side vector
        :type b_v: list(SollyaObject)
        :return: correctly rounded 4D dot-product
        :rtype: SollyaObject
    """
    sorted_products = sorted(lhs * rhs for lhs, rhs in zip(a_v, b_v))
    acc = 0
    for term in sorted_products:
        acc = round_to_odd(term + acc, 53)
    return sollya.round(acc, sollya.binary32, sollya.RN)
def numeric_emulate(self, x):
    """ Numeric emulation of
            table_value * (1.1 * ((x + x) * x) - (x + x))
        where every operation, and the constant 1.1, is rounded to
        nearest in self.precision and table_value is
        (nearestint(x) mod 16) >> 1
    """
    def rn(value):
        # round to nearest in the emulated precision
        return sollya.round(value, self.precision.get_sollya_object(), sollya.RN)

    index = int(sollya.nearestint(x)) % 16
    table_value = index >> 1
    add_xx = rn(x + x)
    mult = rn(add_xx * x)
    cst = rn(1.1)
    scaled = rn(cst * mult)
    difference = rn(scaled - add_xx)
    return rn(table_value * difference)
def generate_scheme(self):
    """ Build the evaluation scheme of a table + polynomial 2^x

        The input is split on a grid of step 2^-index_size: the integer
        part of x * 2^index_size provides an exponent (high bits) and an
        index into a table of 2^(j * 2^-index_size) values stored as
        (low, high) pairs (low bits); the remaining fraction feeds a
        polynomial approximating 2^x on [0, 2^-index_size].

        :return: scheme returning +infinity when the exponent part
                 exceeds emax and the reconstructed value otherwise
    """
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    index_size = 3

    approx_interval = Interval(0.0, 2**-index_size)
    error_goal_approx = 2**-(self.precision.get_precision())
    int_precision = {
        ML_Binary32: ML_Int32,
        ML_Binary64: ML_Int64
    }[self.precision]

    # vx_int = floor(x * 2^index_size), vx_frac = x - vx_int * 2^-index_size
    vx_int = Floor(vx * 2**index_size, precision=self.precision,
                   tag="vx_int", debug=debug_multi)
    vx_frac = vx - (vx_int * 2**-index_size)
    vx_frac.set_attributes(tag="vx_frac", debug=debug_multi, unbreakable=True)
    poly_degree = sup(
        guessdegree(2**(sollya.x), approx_interval, error_goal_approx)) + 1
    precision_list = [1] + [self.precision] * (poly_degree)

    vx_integer = Conversion(vx_int, precision=int_precision,
                            tag="vx_integer", debug=debug_multi)
    # high bits select the exponent, low bits select the table entry
    vx_int_hi = BitLogicRightShift(vx_integer, Constant(index_size),
                                   tag="vx_int_hi", debug=debug_multi)
    vx_int_lo = Modulo(vx_integer, 2**index_size,
                       tag="vx_int_lo", debug=debug_multi)
    pow_exp = ExponentInsertion(Conversion(vx_int_hi, precision=int_precision),
                                precision=self.precision,
                                tag="pow_exp", debug=debug_multi)

    exp2_table = ML_Table(dimensions=[2 * 2**index_size, 2],
                          storage_precision=self.precision,
                          tag=self.uniquify_name("exp2_table"))
    for i in range(2 * 2**index_size):
        input_value = i - 2**index_size if i >= 2**index_size else i
        exp2_value = SollyaObject(2)**((input_value) * 2**-index_size)
        hi_value = round(exp2_value, self.precision.get_sollya_object(), RN)
        lo_value = round(exp2_value - hi_value,
                         self.precision.get_sollya_object(), RN)
        # column 0 holds the low word, column 1 the high word
        exp2_table[i][0] = lo_value
        exp2_table[i][1] = hi_value

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
        2**(sollya.x), poly_degree, precision_list, approx_interval,
        sollya.absolute, error_function=error_function)

    # Was a Python 2 print statement (a SyntaxError on Python 3); the
    # single-argument call prints the same text on both versions.
    print("poly_approx_error:  %s %s" % (
        poly_approx_error, float(log2(poly_approx_error))))

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    # the constant coefficient is skipped: it is added back through the
    # table value during the reconstruction below
    poly = polynomial_scheme_builder(poly_object.sub_poly(start_index=1),
                                     vx_frac,
                                     unified_precision=self.precision)
    poly.set_attributes(tag="poly", debug=debug_multi)

    table_index = Addition(vx_int_lo,
                           Constant(2**index_size, precision=int_precision),
                           precision=int_precision,
                           tag="table_index", debug=debug_multi)

    lo_value_load = TableLoad(exp2_table, table_index, 0,
                              tag="lo_value_load", debug=debug_multi)
    hi_value_load = TableLoad(exp2_table, table_index, 1,
                              tag="hi_value_load", debug=debug_multi)

    # (hi + lo) * (1 + poly) * 2^vx_int_hi, smallest terms summed first
    result = (hi_value_load +
              (hi_value_load * poly +
               (lo_value_load + lo_value_load * poly))) * pow_exp
    ov_flag = Comparison(vx_int_hi,
                         Constant(self.precision.get_emax(),
                                  precision=self.precision),
                         specifier=Comparison.Greater)

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = Statement(
        Return(Select(ov_flag, FP_PlusInfty(self.precision), result)))

    return scheme
def generate_scheme(self):
    """ Build the evaluation scheme of log2(x)

        The input is decomposed into mantissa and exponent; the mantissa
        is multiplied by an inverse seed so that the product is close
        to 1, the log2 of the seed comes from a (high, low) table and the
        log2 of the product from a polynomial.

        Scheme returned, in test order:
          - x < 0: invalid exception, quiet NaN
          - x infinite: +infinity
          - x NaN: quiet NaN (invalid raised for a signaling NaN)
          - x tested as subnormal: zero gives divide-by-zero and
            -infinity, otherwise x is scaled by 2^100 before evaluation
          - x == 1: +0
          - exponent == -1: dedicated reconstruction
          - otherwise: generic result
    """
    vx = self.implementation.add_input_variable("x", self.get_input_precision())

    sollya_precision = self.get_input_precision().get_sollya_object()

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # testing special value inputs
    # (these four tests and return_snan are not referenced by the final
    # scheme below, which builds its own predicates)
    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=True, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual,
                               debug=True, tag="inf_sign")
    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True,
                              tag="is_signaling_nan")
    # if input is a signaling NaN, raise an invalid exception and returns
    # a quiet NaN
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision)))

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    int_precision = self.precision.get_integer_format()

    # log2(vx)
    # r = vx_mant
    # e = vx_exp
    # vx reduced to r in [1, 2[
    # log2(vx) = log2(r * 2^e)
    #          = log2(r) + e
    #
    # log2(r) is approximated by
    # log2(r) = log2(inv_seed(r) * r / inv_seed(r))
    #         = log2(inv_seed(r) * r) - log2(inv_seed(r))
    # inv_seed(r) in ]1/2, 1] => log2(inv_seed(r)) in ]-1, 0]
    #
    # inv_seed(r) * r ~ 1
    # we can easily tabulate -log2(inv_seed(r))
    #

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)
    # table creation
    table_index_size = 7
    log_table = ML_NewTable(dimensions=[2**table_index_size, 2],
                            storage_precision=self.precision,
                            tag=self.uniquify_name("inv_table"))
    # value for index 0 is set to 0.0
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        #inv_value = (1.0 + (inv_approx_table[i][0] / S2**9) ) * S2**-1
        #print inv_approx_table[i][0], inv_value
        inv_value = inv_approx_table[i][0]
        # column 0: high word on a reduced number of bits,
        # column 1: low word (rounded remainder)
        value_high_bitsize = self.precision.get_field_size() - (
            self.precision.get_exponent_size() + 1)
        value_high = round(log2(inv_value), value_high_bitsize, sollya.RN)
        value_low = round(
            log2(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    def compute_log(_vx, exp_corr_factor=None):
        """ build the generic log2 evaluation of _vx; exp_corr_factor,
            when given, is added to the extracted exponent """
        _vx_mant = MantissaExtraction(_vx, tag="_vx_mant",
                                      precision=self.precision,
                                      debug=debug_lftolx)
        _vx_exp = ExponentExtraction(_vx, tag="_vx_exp", debug=debugd)

        # The main table is indexed by the 7 most significant bits
        # of the mantissa
        table_index = inv_approx_table.index_function(_vx_mant)
        table_index.set_attributes(tag="table_index", debug=debuglld)

        # argument reduction
        # Using AND -2 to exclude LSB set to 1 for Newton-Raphson convergence
        # TODO: detect if single operand inverse seed is supported by the targeted architecture
        pre_arg_red_index = TypeCast(
            BitLogicAnd(
                TypeCast(
                    DivisionSeed(_vx_mant, precision=self.precision,
                                 tag="seed", debug=debug_lftolx,
                                 silent=True),
                    precision=ML_UInt64),
                Constant(-2, precision=ML_UInt64),
                precision=ML_UInt64),
            precision=self.precision,
            tag="pre_arg_red_index", debug=debug_lftolx)
        arg_red_index = Select(Equal(table_index, 0), 1.0,
                               pre_arg_red_index,
                               tag="arg_red_index", debug=debug_lftolx)
        _red_vx = FMA(arg_red_index, _vx_mant, -1.0)
        _red_vx.set_attributes(tag="_red_vx", debug=debug_lftolx)
        inv_err = S2**-inv_approx_table.index_size
        red_interval = Interval(1 - inv_err, 1 + inv_err)

        # return in case of standard (non-special) input
        _log_inv_lo = TableLoad(log_table, table_index, 1,
                                tag="log_inv_lo", debug=debug_lftolx)
        _log_inv_hi = TableLoad(log_table, table_index, 0,
                                tag="log_inv_hi", debug=debug_lftolx)

        Log.report(Log.Verbose, "building mathematical polynomial")
        approx_interval = Interval(-inv_err, inv_err)
        poly_degree = sup(
            guessdegree(
                log2(1 + sollya.x) / sollya.x, approx_interval,
                S2**-(self.precision.get_field_size() * 1.1))) + 1
        sollya.settings.display = sollya.hexadecimal
        global_poly_object, approx_error = Polynomial.build_from_approximation_with_error(
            log2(1 + sollya.x) / sollya.x,
            poly_degree,
            [self.precision] * (poly_degree + 1),
            approx_interval,
            sollya.absolute,
            error_function=lambda p, f, ai, mod, t: sollya.dirtyinfnorm(
                p - f, ai))
        Log.report(
            Log.Info, "poly_degree={}, approx_error={}".format(
                poly_degree, approx_error))
        poly_object = global_poly_object.sub_poly(start_index=1, offset=1)
        #poly_object = global_poly_object.sub_poly(start_index=0,offset=0)

        Attributes.set_default_silent(True)
        Attributes.set_default_rounding_mode(ML_RoundToNearest)

        Log.report(Log.Verbose, "generating polynomial evaluation scheme")
        pre_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object, _red_vx, unified_precision=self.precision)
        # last Horner step done by hand: the constant coefficient is
        # added through a FMA
        _poly = FMA(pre_poly, _red_vx,
                    global_poly_object.get_cst_coeff(0, self.precision))
        _poly.set_attributes(tag="poly", debug=debug_lftolx)
        Log.report(
            Log.Verbose, "sollya global_poly_object: {}".format(
                global_poly_object.get_sollya_object()))
        Log.report(
            Log.Verbose, "sollya poly_object: {}".format(
                poly_object.get_sollya_object()))

        # identity test: operation nodes overload comparison operators,
        # `is None` does not depend on any __eq__ implementation
        corr_exp = _vx_exp if exp_corr_factor is None else _vx_exp + exp_corr_factor

        Attributes.unset_default_rounding_mode()
        Attributes.unset_default_silent()

        pre_result = -_log_inv_hi + (_red_vx * _poly + (-_log_inv_lo))
        pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
        exact_log2_hi_exp = Conversion(corr_exp, precision=self.precision)
        exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_hex",
                                         debug=debug_lftolx)
        _result = exact_log2_hi_exp + pre_result
        return _result, _poly, _log_inv_lo, _log_inv_hi, _red_vx

    result, poly, log_inv_lo, log_inv_hi, red_vx = compute_log(vx)
    result.set_attributes(tag="result", debug=debug_lftolx)

    # specific input value predicate
    neg_input = Comparison(vx, 0, likely=False, specifier=Comparison.Less,
                           debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debugd, tag="vx_snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debugd, tag="vx_inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debugd, tag="vx_subnormal")
    vx_zero = Test(vx, specifier=Test.IsZero, likely=False,
                   debug=debugd, tag="vx_zero")

    exp_mone = Equal(vx_exp, -1, tag="exp_minus_one", debug=debugd,
                     likely=False)
    vx_one = Equal(vx, 1.0, tag="vx_one", likely=False, debug=debugd)

    # Specific path for the case exp == -1
    # log2(x) = log2(m) - 1
    #
    # as m in [1, 2[, log2(m) in [0, 1[
    # if r is close to 2, a catastrophic cancellation can occur
    #
    # r = seed(m)
    # log2(x) = log2(seed(m) * m / seed(m)) - 1
    #         = log2(seed(m) * m) - log2(seed(m)) - 1
    #
    # for m really close to 2 => seed(m) = 0.5
    #     => log2(x) = log2(0.5 * m)
    #                = (expression below)
    result_exp_m1 = (-log_inv_hi - 1.0) + FMA(poly, red_vx, -log_inv_lo)
    result_exp_m1.set_attributes(tag="result_exp_m1", debug=debug_lftolx)

    # subnormal inputs are scaled by 2^100, the exponent being corrected
    # by -100
    m100 = -100
    S2100 = Constant(S2**100, precision=self.precision)
    result_subnormal, _, _, _, _ = compute_log(vx * S2100,
                                               exp_corr_factor=m100)
    result_subnormal.set_attributes(tag="result_subnormal",
                                    debug=debug_lftolx)

    # approximation around 1 (result_one and cond_one are not referenced
    # by the final scheme below)
    one_err = S2**-7
    approx_interval_one = Interval(-one_err, one_err)
    red_vx_one = vx - 1.0
    # NOTE(review): bare `x` here whereas the rest of the function uses
    # `sollya.x` -- presumably a module-level import, confirm
    poly_degree_one = sup(
        guessdegree(
            log(1 + x) / x, approx_interval_one,
            S2**-(self.precision.get_field_size() + 1))) + 1
    poly_object_one = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree_one,
        [self.precision] * (poly_degree_one + 1), approx_interval_one,
        absolute).sub_poly(start_index=1)
    poly_one = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object_one, red_vx_one, unified_precision=self.precision)
    poly_one.set_attributes(tag="poly_one", debug=debug_lftolx)
    result_one = red_vx_one + red_vx_one * poly_one
    cond_one = (vx < (1 + one_err)) & (vx > (1 - one_err))
    cond_one.set_attributes(tag="cond_one", debug=debugd, likely=False)

    # main scheme
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                ConditionBlock(
                    vx_zero,
                    Statement(
                        ClearException(),
                        Raise(ML_FPE_DivideByZero),
                        Return(FP_MinusInfty(self.precision)),
                    ),
                    Statement(ClearException(), result_subnormal,
                              Return(result_subnormal))),
                ConditionBlock(
                    vx_one,
                    Statement(
                        ClearException(),
                        Return(FP_PlusZero(self.precision)),
                    ),
                    ConditionBlock(exp_mone, Return(result_exp_m1),
                                   Return(result))))))
    scheme = Statement(result, pre_scheme)
    return scheme
def generate_scheme(self):
    """ Build the evaluation scheme of expm1(x) = exp(x) - 1

        x is reduced to r = x - k * log(2) (log(2) split into a high and
        a low word), expm1(r) is approximated by a polynomial and the
        result is rebuilt as 2^k * (poly + (1 - 2^-k)).

        Scheme returned, in test order:
          - NaN: quiet NaN; infinity: +infinity if x > 0, -1 otherwise
          - x above the overflow bound: +infinity
          - x below the underflow bound: -1
          - k > emax or k <= emin_normal: reconstructions using an
            intermediate exponent offset
          - otherwise: standard reconstruction
    """
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    C_m1 = Constant(-1, precision=self.precision)

    test_NaN_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=debug_multi, tag="NaN_or_inf",
                           precision=ML_Bool)
    test_NaN = Test(vx, specifier=Test.IsNaN, likely=False,
                    debug=debug_multi, tag="is_NaN", precision=ML_Bool)
    test_inf = Comparison(vx, 0, specifier=Comparison.Greater,
                          debug=debug_multi, tag="sign", precision=ML_Bool,
                          likely=False)

    # Infinity input
    infty_return = Statement(
        ConditionBlock(test_inf, Return(FP_PlusInfty(self.precision)),
                       Return(C_m1)))
    # non-std input (inf/nan)
    specific_return = ConditionBlock(test_NaN,
                                     Return(FP_QNaN(self.precision)),
                                     infty_return)

    # Over/Underflow Tests

    precision_emax = self.precision.get_emax()
    precision_max_value = S2**(precision_emax + 1)
    expm1_overflow_bound = ceil(log(precision_max_value + 1))
    overflow_test = Comparison(vx, expm1_overflow_bound, likely=False,
                               specifier=Comparison.Greater,
                               precision=ML_Bool)
    overflow_return = Statement(Return(FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2**precision_emin
    expm1_underflow_bound = floor(log(precision_min_value) + 1)
    underflow_test = Comparison(vx, expm1_underflow_bound, likely=False,
                                specifier=Comparison.Less,
                                precision=ML_Bool)
    underflow_return = Statement(Return(C_m1))

    sollya_precision = {ML_Binary32: sollya.binary32,
                        ML_Binary64: sollya.binary64}[self.precision]
    int_precision = {ML_Binary32: ML_Int32,
                     ML_Binary64: ML_Int64}[self.precision]

    # Constants

    log_2 = round(log(2), sollya_precision, sollya.RN)
    invlog2 = round(1/log(2), sollya_precision, sollya.RN)
    log_2_cst = Constant(log_2, precision=self.precision)

    interval_vx = Interval(expm1_underflow_bound, expm1_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)), ceil(sup(interval_fk)))

    # log(2) = log2_hi + log2_lo, log2_hi being 6 bits shorter than the
    # mantissa field
    log2_hi_precision = self.precision.get_field_size() - 6
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = round(log(2) - log2_hi, sollya_precision, sollya.RN)

    # Reduction
    unround_k = vx * invlog2
    ik = NearestInteger(unround_k, precision=int_precision,
                        debug=debug_multi, tag="ik")
    k = Conversion(ik, precision=self.precision, tag="k")

    red_coeff1 = Multiplication(k, log2_hi, precision=self.precision)
    red_coeff2 = Multiplication(Negation(k, precision=self.precision),
                                log2_lo, precision=self.precision)

    pre_sub_mul = Subtraction(vx, red_coeff1, precision=self.precision)

    # s + t: sum of pre_sub_mul and red_coeff2 with the rounding error of
    # the first addition (t) added back
    s = Addition(pre_sub_mul, red_coeff2, precision=self.precision)
    z = Subtraction(s, pre_sub_mul, precision=self.precision)
    t = Subtraction(red_coeff2, z, precision=self.precision)
    r = Addition(s, t, precision=self.precision)

    r.set_attributes(tag="r", debug=debug_multi)

    r_interval = Interval(-log_2/S2, log_2/S2)

    local_ulp = sup(ulp(exp(r_interval), self.precision))

    print("ulp: ", local_ulp)
    error_goal = S2**-1*local_ulp
    print("error goal: ", error_goal)

    # Polynomial Approx
    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)
    Log.report(Log.Info, "\033[33;1m Building polynomial \033[0m\n")

    poly_degree = sup(guessdegree(expm1(sollya.x), r_interval, error_goal) + 1)

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    poly_degree_list = range(0, poly_degree)

    precision_list = [self.precision] * (len(poly_degree_list) + 1)
    poly_object, poly_error = Polynomial.build_from_approximation_with_error(
        expm1(sollya.x), poly_degree, precision_list, r_interval,
        sollya.absolute, error_function=error_function)
    # the degree-1 term is added separately as r (see poly below)
    sub_poly = poly_object.sub_poly(start_index=2)
    Log.report(Log.Info, "Poly : %s" % sub_poly)
    Log.report(Log.Info, "poly error : {} / {:d}".format(
        poly_error, int(sollya.log2(poly_error))))
    pre_sub_poly = polynomial_scheme_builder(sub_poly, r,
                                             unified_precision=self.precision)
    poly = r + pre_sub_poly
    poly.set_attributes(tag="poly", debug=debug_multi)

    exp_k = ExponentInsertion(ik, tag="exp_k", debug=debug_multi,
                              precision=self.precision)
    exp_mk = ExponentInsertion(-ik, tag="exp_mk", debug=debug_multi,
                               precision=self.precision)

    diff = 1 - exp_mk
    diff.set_attributes(tag="diff", debug=debug_multi)

    # Late Tests
    late_overflow_test = Comparison(ik, self.precision.get_emax(),
                                    specifier=Comparison.Greater,
                                    likely=False, debug=debug_multi,
                                    tag="late_overflow_test")

    # Floor division: with `/`, Python 3 yields a fractional offset
    # (e.g. 127 - 11.5) which is not a valid exponent
    overflow_exp_offset = (self.precision.get_emax() -
                           self.precision.get_field_size() // 2)
    diff_k = ik - overflow_exp_offset

    exp_diff_k = ExponentInsertion(diff_k, precision=self.precision,
                                   tag="exp_diff_k", debug=debug_multi)
    exp_oflow_offset = ExponentInsertion(overflow_exp_offset,
                                         precision=self.precision,
                                         tag="exp_offset", debug=debug_multi)

    # 2^k is applied in two steps: 2^(k - offset) then 2^offset
    late_overflow_result = (exp_diff_k * (1 + poly)) * exp_oflow_offset - 1.0

    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier=Test.IsInfty, likely=False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)),
        Return(late_overflow_result)
    )

    late_underflow_test = Comparison(k, self.precision.get_emin_normal(),
                                     specifier=Comparison.LessOrEqual,
                                     likely=False)

    underflow_exp_offset = 2 * self.precision.get_field_size()
    corrected_coeff = ik + underflow_exp_offset

    exp_corrected = ExponentInsertion(corrected_coeff,
                                      precision=self.precision)
    exp_uflow_offset = ExponentInsertion(-underflow_exp_offset,
                                         precision=self.precision)

    # 2^k is applied in two steps: 2^(k + offset) then 2^-offset
    late_underflow_result = (exp_corrected * (1 + poly)) * exp_uflow_offset - 1.0

    test_subnormal = Test(late_underflow_result,
                          specifier=Test.IsSubnormal, likely=False)

    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value=late_underflow_result)),
        Return(late_underflow_result)
    )

    # Reconstruction
    # 2^k * (expm1(r) + 1 - 2^-k) = 2^k * exp(r) - 1
    std_result = exp_k * (poly + diff)
    std_result.set_attributes(tag="result", debug=debug_multi)

    result_scheme = ConditionBlock(
        late_overflow_test,
        late_overflow_return,
        ConditionBlock(
            late_underflow_test,
            late_underflow_return,
            Return(std_result)
        )
    )

    std_return = ConditionBlock(
        overflow_test,
        overflow_return,
        ConditionBlock(
            underflow_test,
            underflow_return,
            result_scheme)
    )

    scheme = ConditionBlock(
        test_NaN_or_inf,
        Statement(specific_return),
        std_return
    )

    return scheme
def generate_scheme(self):
    """Build and return the operation graph of the cosine implementation.

    The function input ``x`` is registered on ``self.implementation`` and
    immediately replaced by its absolute value.  Two evaluation paths are
    built and selected on ``Abs(vx) >= 2**20``:

    * standard path: ``k`` is the nearest integer to ``vx * round(2**3 / pi)``,
      the reduced argument is ``vx - k * (inv_frac_pi + inv_frac_pi_lo)``
      (computed both in ``self.precision`` and in binary64), and a switch on
      ``k mod 16`` selects one polynomial per sub-interval;
    * large-argument path: the external function ``payne_hanek_fp32_asm``
      is called, the distance of its result to the nearest integer is used
      as reduced argument and that integer modulo 16 selects the polynomial.

    The NaN / infinity handling nodes are constructed but are not referenced
    by the returned scheme.

    :return: a ``Statement`` node holding the complete scheme
    :raises KeyError: if ``self.precision`` is neither ``ML_Binary32`` nor
        ``ML_Binary64`` (lookup of the debug display object)
    """
    # declaring CodeFunction and retrieving input variable
    vx = Abs(self.implementation.add_input_variable("x", self.precision),
             tag="vx")

    Log.report(Log.Info, "generating implementation scheme")
    if self.debug_flag:
        Log.report(Log.Info, "debug has been enabled")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    debug_precision = {
        ML_Binary32: debug_ftox,
        ML_Binary64: debug_lftolx
    }[self.precision]

    # NOTE(review): the special-value nodes below (test_nan_or_inf,
    # specific_return, ...) are never attached to the returned scheme.
    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=True, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual,
                               debug=True, tag="inf_sign")

    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True,
                              tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision)))

    # return in case of infinity input
    infty_return = Statement(
        ConditionBlock(test_positive,
                       Return(FP_PlusInfty(self.precision)),
                       Return(FP_PlusZero(self.precision))))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(test_signaling_nan, return_snan,
                       Return(FP_QNaN(self.precision))),
        infty_return)
    # return in case of standard (non-special) input

    sollya_precision = self.precision.get_sollya_object()
    hi_precision = self.precision.get_field_size() - 3

    # argument reduction
    frac_pi_index = 3
    frac_pi = round(S2**frac_pi_index / pi, sollya_precision, sollya.RN)
    # pi / 2^frac_pi_index split into a shortened high part and a low part
    inv_frac_pi = round(pi / S2**frac_pi_index, hi_precision, sollya.RN)
    inv_frac_pi_lo = round(pi / S2**frac_pi_index - inv_frac_pi,
                           sollya_precision, sollya.RN)
    # computing k = E(x * frac_pi)
    vx_pi = Multiplication(vx, frac_pi, precision=self.precision)
    k = NearestInteger(vx_pi, precision=ML_Int32, tag="k", debug=True)
    fk = Conversion(k, precision=self.precision, tag="fk")

    inv_frac_pi_cst = Constant(inv_frac_pi, tag="inv_frac_pi",
                               precision=self.precision)
    inv_frac_pi_lo_cst = Constant(inv_frac_pi_lo, tag="inv_frac_pi_lo",
                                  precision=self.precision)

    red_vx_hi = (vx - inv_frac_pi_cst * fk)
    red_vx_hi.set_attributes(tag="red_vx_hi", debug=debug_precision,
                             precision=self.precision)
    red_vx_lo_sub = inv_frac_pi_lo_cst * fk
    red_vx_lo_sub.set_attributes(tag="red_vx_lo_sub", debug=debug_precision,
                                 unbreakable=True, precision=self.precision)
    vx_d = Conversion(vx, precision=ML_Binary64, tag="vx_d")
    pre_red_vx = red_vx_hi - inv_frac_pi_lo_cst * fk
    # same reduction, evaluated in binary64
    pre_red_vx_d_hi = (vx_d - inv_frac_pi_cst * fk)
    pre_red_vx_d_hi.set_attributes(tag="pre_red_vx_d_hi",
                                   precision=ML_Binary64, debug=debug_lftolx)
    pre_red_vx_d = pre_red_vx_d_hi - inv_frac_pi_lo_cst * fk
    pre_red_vx_d.set_attributes(tag="pre_red_vx_d", debug=debug_lftolx,
                                precision=ML_Binary64)

    modk = Modulo(k, 2**(frac_pi_index + 1), precision=ML_Int32,
                  tag="switch_value", debug=True)

    # the reduced argument is negated when bit (frac_pi_index - 1) of modk
    # is set
    sel_c = Equal(BitLogicAnd(modk, 2**(frac_pi_index - 1)),
                  2**(frac_pi_index - 1))
    red_vx = Select(sel_c, -pre_red_vx, pre_red_vx)
    red_vx.set_attributes(tag="red_vx", debug=debug_precision,
                          precision=self.precision)

    red_vx_d = Select(sel_c, -pre_red_vx_d, pre_red_vx_d)
    red_vx_d.set_attributes(tag="red_vx_d", debug=debug_lftolx,
                            precision=ML_Binary64)

    approx_interval = Interval(-pi / (S2**(frac_pi_index + 1)),
                               pi / S2**(frac_pi_index + 1))

    Log.report(Log.Info, "approx interval: %s\n" % approx_interval)

    error_goal_approx = S2**-self.precision.get_precision()

    Log.report(Log.Info, "building mathematical polynomial")
    poly_degree_vector = [None] * 2**(frac_pi_index + 1)

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme

    index_relative = []

    # one polynomial approximation of cos(x + i * pi / 2^frac_pi_index)
    # per value of i
    poly_object_vector = [None] * 2**(frac_pi_index + 1)
    for i in range(2**(frac_pi_index + 1)):
        sub_func = cos(sollya.x + i * pi / S2**frac_pi_index)
        degree = int(
            sup(guessdegree(sub_func, approx_interval, error_goal_approx))) + 1

        degree_list = range(degree + 1)
        a_interval = approx_interval
        if i == 0:
            # ad-hoc, TODO: to be cleaned
            degree = 6
            degree_list = range(0, degree + 1, 2)
        elif i % 2**(frac_pi_index) == 2**(frac_pi_index - 1):
            # for pi/2 and 3pi/2, an approx to sin=cos(pi/2+x)
            # must be generated
            degree_list = range(1, degree + 1, 2)

        if i in (3, 5, 7, 9):
            # constant coefficient kept in binary64 for these indexes
            precision_list = [sollya.binary64] + [sollya.binary32] * (degree)
        else:
            precision_list = [sollya.binary32] * (degree + 1)

        poly_degree_vector[i] = degree

        constraint = sollya.absolute
        delta = (2**(frac_pi_index - 3))
        centered_i = (i % 2**(frac_pi_index)) - 2**(frac_pi_index - 1)
        if centered_i < delta and centered_i > -delta and centered_i != 0:
            constraint = sollya.relative
            index_relative.append(i)
        Log.report(
            Log.Info,
            "generating approximation for %d/%d" % (i, 2**(frac_pi_index + 1)))
        poly_object_vector[i], _ = \
            Polynomial.build_from_approximation_with_error(
                sub_func, degree_list, precision_list, a_interval,
                constraint, error_function=error_function)

    # unified power map for red_sx^n
    upm = {}
    rel_error_list = []

    poly_scheme_vector = [None] * (2**(frac_pi_index + 1))

    for i in range(2**(frac_pi_index + 1)):
        poly_object = poly_object_vector[i]
        poly_precision = self.precision
        if i in (3, 5, 7, 9):
            # degree 0 and 1 terms are accumulated in binary64, the tail
            # is evaluated in self.precision and scaled by red_vx_d^2
            poly_precision = ML_Binary64
            c0 = Constant(coeff(poly_object.get_sollya_object(), 0),
                          precision=ML_Binary64)
            c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                          precision=self.precision)
            poly_hi = (c0 + c1 * red_vx)
            poly_hi.set_precision(ML_Binary64)
            red_vx_d_2 = red_vx_d * red_vx_d
            poly_scheme = poly_hi + red_vx_d_2 * polynomial_scheme_builder(
                poly_object.sub_poly(start_index=2, offset=2), red_vx,
                unified_precision=self.precision, power_map_=upm)
            poly_scheme.set_attributes(unbreakable=True)
        elif i == 4:
            c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                          precision=ML_Binary64)
            poly_scheme = c1 * red_vx_d + polynomial_scheme_builder(
                poly_object.sub_poly(start_index=2), red_vx,
                unified_precision=self.precision, power_map_=upm)
            poly_scheme.set_precision(ML_Binary64)
        else:
            poly_scheme = polynomial_scheme_builder(
                poly_object, red_vx,
                unified_precision=poly_precision, power_map_=upm)
        poly_scheme.set_attributes(
            tag="poly_cos%dpi%d" % (i, 2**(frac_pi_index)),
            debug=debug_precision)
        poly_scheme_vector[i] = poly_scheme

        # evaluation error is only estimated for index 3, and only when
        # gappa is available
        if is_gappa_installed() and i == 3:
            opt_scheme = self.opt_engine.optimization_process(
                poly_scheme, self.precision, copy=True,
                fuse_fma=self.fuse_fma)

            tag_map = {}
            self.opt_engine.register_nodes_by_tag(opt_scheme, tag_map)

            gappa_vx = Variable("red_vx", precision=self.precision,
                                interval=approx_interval)

            cg_eval_error_copy_map = {
                tag_map["red_vx"]: gappa_vx,
                tag_map["red_vx_d"]: gappa_vx,
            }

            # single-argument call form: same output with the Python 2
            # print statement and the Python 3 print function
            print("opt_scheme")
            print(opt_scheme.get_str(depth=None, display_precision=True,
                                     memoization_map={}))

            eval_error = self.gappa_engine.get_eval_error_v2(
                self.opt_engine, opt_scheme, cg_eval_error_copy_map,
                gappa_filename="red_arg_%d.g" % i)
            poly_range = cos(approx_interval + i * pi / S2**frac_pi_index)
            rel_error_list.append(eval_error / poly_range)

    half = 2**frac_pi_index
    sub_half = 2**(frac_pi_index - 1)

    # determine if the reduced input is within the second and third quarter
    # (not first nor fourth) to negate the cosine output
    factor_cond = BitLogicAnd(
        BitLogicXor(
            BitLogicRightShift(modk, frac_pi_index),
            BitLogicRightShift(modk, frac_pi_index - 1)),
        1, tag="factor_cond", debug=True)

    CM1 = Constant(-1, precision=self.precision)
    C1 = Constant(1, precision=self.precision)
    factor = Select(factor_cond, CM1, C1, tag="factor",
                    debug=debug_precision)
    factor2 = Select(Equal(modk, Constant(sub_half)), CM1, C1,
                     tag="factor2", debug=debug_precision)

    # Several switch values share one polynomial: each key of switch_map is
    # the tuple of modk values served by the same Return node.
    switch_map = {}
    for i in range(2**(frac_pi_index - 1)):
        switch_case = (i, half - i)
        if i != 0:
            switch_case = switch_case + (half + i, 2 * half - i)
        if poly_scheme_vector[i].get_precision() != self.precision:
            poly_result = Conversion(poly_scheme_vector[i],
                                     precision=self.precision)
        else:
            poly_result = poly_scheme_vector[i]
        switch_map[switch_case] = Return(factor * poly_result)
    switch_map[(sub_half, half + sub_half)] = Return(
        factor2 * poly_scheme_vector[sub_half])

    result = SwitchBlock(modk, switch_map)

    #######################################################################
    #                    LARGE ARGUMENT MANAGEMENT                        #
    #                 (lar: Large Argument Reduction)                     #
    #######################################################################

    # payne and hanek argument reduction for large arguments
    red_func_name = "payne_hanek_fp32_asm"
    payne_hanek_func_op = FunctionOperator(
        red_func_name, arg_map={0: FO_Arg(0)},
        require_header=["support_lib/ml_red_arg.h"])
    payne_hanek_func = FunctionObject(red_func_name, [ML_Binary32],
                                      ML_Binary64, payne_hanek_func_op)
    payne_hanek_func_op.declare_prototype = payne_hanek_func
    large_arg_red = payne_hanek_func(vx)
    red_bound = S2**20

    cond = Abs(vx) >= red_bound
    cond.set_attributes(tag="cond", likely=False)

    lar_neark = NearestInteger(large_arg_red, precision=ML_Int64)
    lar_modk = Modulo(lar_neark, Constant(16, precision=ML_Int64),
                      tag="lar_modk", debug=True)
    # Modulo is supposed to be already performed (by payne_hanek_cosfp32)
    pre_lar_red_vx = large_arg_red - Conversion(lar_neark,
                                                precision=ML_Binary64)
    pre_lar_red_vx.set_attributes(precision=ML_Binary64, debug=debug_lftolx,
                                  tag="pre_lar_red_vx")
    lar_red_vx = Conversion(pre_lar_red_vx, precision=self.precision,
                            debug=debug_precision, tag="lar_red_vx")
    # part of the binary64 reduced argument lost by the conversion above
    lar_red_vx_lo = Conversion(
        pre_lar_red_vx - Conversion(lar_red_vx, precision=ML_Binary64),
        precision=self.precision)
    lar_red_vx_lo.set_attributes(tag="lar_red_vx_lo",
                                 precision=self.precision)

    lar_k = 3
    # large arg reduction Universal Power Map
    lar_upm = {}

    lar_switch_map = {}
    approx_interval = Interval(-0.5, 0.5)
    for i in range(2**(lar_k + 1)):
        frac_pi = pi / S2**lar_k
        func = cos(frac_pi * i + frac_pi * sollya.x)

        degree = 6
        error_mode = sollya.absolute
        if i % 2**(lar_k) == 2**(lar_k - 1):
            # close to sin(x) cases: func / x is approximated with even
            # degrees, then the polynomial is shifted back by one degree
            func = (-sin(frac_pi * sollya.x) if i == 2**(lar_k - 1)
                    else sin(frac_pi * sollya.x))
            degree_list = range(0, degree + 1, 2)
            precision_list = [sollya.binary32] * len(degree_list)
            poly_object, _ = Polynomial.build_from_approximation_with_error(
                func / sollya.x, degree_list, precision_list,
                approx_interval, error_mode)
            poly_object = poly_object.sub_poly(offset=-1)
        else:
            degree_list = range(degree + 1)
            precision_list = [sollya.binary32] * len(degree_list)
            poly_object, _ = Polynomial.build_from_approximation_with_error(
                func, degree_list, precision_list, approx_interval,
                error_mode)

        if i in (3, 5, 7, 9, 11, 13):
            c0 = Constant(coeff(poly_object.get_sollya_object(), 0),
                          precision=ML_Binary64)
            c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                          precision=self.precision)
            poly_hi = (c0 + c1 * lar_red_vx)
            poly_hi.set_precision(ML_Binary64)
            pre_poly_scheme = poly_hi + polynomial_scheme_builder(
                poly_object.sub_poly(start_index=2), lar_red_vx,
                unified_precision=self.precision, power_map_=lar_upm)
            pre_poly_scheme.set_attributes(precision=ML_Binary64)
            poly_scheme = Conversion(pre_poly_scheme,
                                     precision=self.precision)
        elif i == 4 or i == 12:
            c1 = Constant(coeff(poly_object.get_sollya_object(), 1),
                          precision=self.precision)
            # NOTE(review): c3 and c5 are built but not used afterwards;
            # kept in case node construction has side effects.
            c3 = Constant(coeff(poly_object.get_sollya_object(), 3),
                          precision=self.precision)
            c5 = Constant(coeff(poly_object.get_sollya_object(), 5),
                          precision=self.precision)
            poly_hi = polynomial_scheme_builder(
                poly_object.sub_poly(start_index=3), lar_red_vx,
                unified_precision=self.precision, power_map_=lar_upm)
            poly_hi.set_attributes(tag="poly_lar_%d_hi" % i,
                                   precision=ML_Binary64)
            poly_scheme = Conversion(
                FusedMultiplyAdd(c1, lar_red_vx, poly_hi,
                                 precision=ML_Binary64)
                + c1 * lar_red_vx_lo,
                precision=self.precision)
        else:
            poly_scheme = polynomial_scheme_builder(
                poly_object, lar_red_vx,
                unified_precision=self.precision, power_map_=lar_upm)
        poly_scheme.set_attributes(tag="lar_poly_%d" % i,
                                   debug=debug_precision)
        lar_switch_map[(i, )] = Return(poly_scheme)

    lar_result = SwitchBlock(lar_modk, lar_switch_map)

    # main scheme
    Log.report(Log.Info, "Construction of the initial MDL scheme")
    scheme = Statement(pre_red_vx_d, red_vx_lo_sub,
                       ConditionBlock(cond, lar_result, result))

    return scheme
def generate_scalar_scheme(self, vx):
    """Build and return the operation graph computing sinh of ``vx``.

    Outline of the construction:

    * the sign of ``vx`` is extracted and ``vx`` is replaced by ``Abs(vx)``;
    * ``k = Trunc(vx * round(2^index_size / log(2)))`` and the reduced
      argument is ``r = (vx - k * log2_hi) + (-k * log2_lo)``;
    * a table holds hi/lo splits of ``2^(-j / 2^index_size)`` and
      ``2^(j / 2^index_size)``; it is indexed with
      ``(k mod 2^index_size) + 2^index_size``;
    * ``exp(r) - 1`` and ``exp(-r) - 1`` are evaluated with the same Horner
      polynomial (applied to ``r`` and ``-r``);
    * the result is ``(pos_exp - neg_exp) + hi_terms`` multiplied by the
      sign, or ``sign * +infinity`` when ``Abs(vx)`` is greater than
      ``asinh(max_value)`` rounded down.

    Side effect: ``Log.set_dump_stdout(True)`` is called.

    :param vx: input operation node
    :return: a ``Statement`` wrapping the final ``Return`` node
    """
    Log.set_dump_stdout(True)

    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # number of bits of k used to index the table (table step is
    # log(2) / 2^index_size)
    index_size = 5

    comp_lo = (vx < 0)
    comp_lo.set_attributes(tag = "comp_lo", precision = ML_Bool)
    sign = Select(comp_lo, -1, 1, precision = self.precision)

    # as sinh is an odd function, we can simplify the input to its absolute
    # value once the sign has been extracted
    vx = Abs(vx)
    int_precision = self.precision.get_integer_format()

    # argument reduction
    arg_reg_value = log(2)/2**index_size
    inv_log2_value = round(1/arg_reg_value, self.precision.get_sollya_object(), sollya.RN)
    inv_log2_cst = Constant(inv_log2_value, precision = self.precision, tag = "inv_log2")

    # for r_hi to be accurate we ensure k * log2_hi_value_cst is exact
    # by limiting the number of non-zero bits in log2_hi_value_cst
    # NOTE(review): the bound below is phrased in terms of cosh and uses a
    # hard-coded exponent of 1024 -- confirm it is the intended bound for
    # sinh and for every supported precision.
    # cosh(x) ~ exp(abs(x))/2 for a big enough x
    # cosh(x) > 2^1023 <=> exp(x) > 2^1024 <=> x > log(2^1024)
    # k = inv_log2_value * x
    # -1 for guard
    max_k_approx = inv_log2_value * log(sollya.SollyaObject(2)**1024)
    max_k_bitsize = int(ceil(log2(max_k_approx)))
    Log.report(Log.Info, "max_k_bitsize: %d" % max_k_bitsize)
    log2_hi_value_precision = self.precision.get_precision() - max_k_bitsize - 1

    # hi/lo split of log(2) / 2^index_size
    log2_hi_value = round(arg_reg_value, log2_hi_value_precision, sollya.RN)
    log2_lo_value = round(arg_reg_value - log2_hi_value, self.precision.get_sollya_object(), sollya.RN)
    log2_hi_value_cst = Constant(log2_hi_value, tag = "log2_hi_value", precision = self.precision)
    log2_lo_value_cst = Constant(log2_lo_value, tag = "log2_lo_value", precision = self.precision)

    k = Trunc(Multiplication(inv_log2_cst, vx), precision = self.precision)
    k_log2 = Multiplication(k, log2_hi_value_cst, precision = self.precision, exact = True, tag = "k_log2", unbreakable = True)
    r_hi = vx - k_log2
    r_hi.set_attributes(tag = "r_hi", debug = debug_multi, unbreakable = True)
    r_lo = -k * log2_lo_value_cst
    # reduced argument
    r = r_hi + r_lo
    r.set_attributes(tag = "r", debug = debug_multi)

    # evaluation error of r_hi is only estimated when gappa is available
    if is_gappa_installed():
        r_eval_error = self.get_eval_error(r_hi,
            variable_copy_map = {
                vx: Variable("vx", interval = Interval(0, 715), precision = self.precision),
                k: Variable("k", interval = Interval(0, 1024), precision = self.precision)
            })
        # NOTE(review): r_eval_error is passed as an extra positional
        # argument -- confirm Log.report actually displays it.
        Log.report(Log.Verbose, "r_eval_error: ", r_eval_error)

    approx_interval = Interval(-arg_reg_value, arg_reg_value)
    error_goal_approx = 2**-(self.precision.get_precision())

    poly_degree = sup(guessdegree(exp(sollya.x), approx_interval, error_goal_approx)) + 3
    # first coefficient precision is given as 1, the others use self.precision
    precision_list = [1] + [self.precision] * (poly_degree)

    # k is split into k_hi (k >> index_size) and k_lo (k mod 2^index_size)
    k_integer = Conversion(k, precision = int_precision, tag = "k_integer", debug = debug_multi)
    k_hi = BitLogicRightShift(k_integer, Constant(index_size, precision=int_precision), tag = "k_int_hi", precision = int_precision, debug = debug_multi)
    k_lo = Modulo(k_integer, 2**index_size, tag = "k_int_lo", precision = int_precision, debug = debug_multi)
    # NOTE(review): pow_exp is not referenced in the rest of this function.
    pow_exp = ExponentInsertion(Conversion(k_hi, precision = int_precision), precision = self.precision, tag = "pow_exp", debug = debug_multi)

    # table row i: [neg_hi, neg_lo, pos_hi, pos_lo]
    exp_table = ML_NewTable(dimensions = [2 * 2**index_size, 4], storage_precision = self.precision, tag = self.uniquify_name("exp2_table"))

    for i in range(2 * 2**index_size):
        # rows in the upper half of the table map back to i - 2^index_size
        input_value = i - 2**index_size if i >= 2**index_size else i

        reduced_hi_prec = int(self.precision.get_mantissa_size() - 8)
        # using SollyaObject wrapper to force evaluation by sollya
        # with higher precision
        exp_value = sollya.SollyaObject(2)**((input_value)* 2**-index_size)
        mexp_value = sollya.SollyaObject(2)**((-input_value)* 2**-index_size)
        pos_value_hi = round(exp_value, reduced_hi_prec, sollya.RN)
        pos_value_lo = round(exp_value - pos_value_hi, self.precision.get_sollya_object(), sollya.RN)
        neg_value_hi = round(mexp_value, reduced_hi_prec, sollya.RN)
        neg_value_lo = round(mexp_value - neg_value_hi, self.precision.get_sollya_object(), sollya.RN)
        exp_table[i][0] = neg_value_hi
        exp_table[i][1] = neg_value_lo
        exp_table[i][2] = pos_value_hi
        exp_table[i][3] = pos_value_lo

    # log2_value = log(2) / 2^index_size
    # sinh(x) = 1/2 * (exp(x) - exp(-x))
    # exp(x) = exp(x - k * log2_value + k * log2_value)
    #
    # r = x - k * log2_value
    # exp(x) = exp(r) * 2 ^ (k / 2^index_size)
    #
    # k / 2^index_size = h + l * 2^-index_size, with k, h, l integers
    # exp(x) = exp(r) * 2^h * 2^(l *2^-index_size)
    #
    # sinh(x) = exp(r) * 2^(h-1) * 2^(l *2^-index_size) - exp(-r) * 2^(-h-1) * 2^(-l *2^-index_size)
    # S=2^(h-1), T = 2^(-h-1)
    # exp(r)  = 1 + poly_pos(r)
    # exp(-r) = 1 + poly_neg(r)
    # 2^(l / 2^index_size)  = pos_value_hi + pos_value_lo
    # 2^(-l / 2^index_size) = neg_value_hi + neg_value_lo
    #
    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(exp(sollya.x), poly_degree, precision_list, approx_interval, sollya.absolute, error_function = error_function)

    Log.report(Log.Verbose, "poly_approx_error: {}, {}".format(poly_approx_error, float(log2(poly_approx_error))))

    # the constant term is dropped (start_index = 1): poly_pos and poly_neg
    # are built from the same polynomial, applied to r and -r
    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    poly_pos = polynomial_scheme_builder(poly_object.sub_poly(start_index = 1), r, unified_precision = self.precision)
    poly_pos.set_attributes(tag = "poly_pos", debug = debug_multi)

    poly_neg = polynomial_scheme_builder(poly_object.sub_poly(start_index = 1), -r, unified_precision = self.precision)
    poly_neg.set_attributes(tag = "poly_neg", debug = debug_multi)

    # only rows at index >= 2^index_size are addressed
    table_index = Addition(k_lo, Constant(2**index_size, precision = int_precision), precision = int_precision, tag = "table_index", debug = debug_multi)

    neg_value_load_hi = TableLoad(exp_table, table_index, 0, tag = "neg_value_load_hi", debug = debug_multi)
    neg_value_load_lo = TableLoad(exp_table, table_index, 1, tag = "neg_value_load_lo", debug = debug_multi)
    pos_value_load_hi = TableLoad(exp_table, table_index, 2, tag = "pos_value_load_hi", debug = debug_multi)
    pos_value_load_lo = TableLoad(exp_table, table_index, 3, tag = "pos_value_load_lo", debug = debug_multi)

    # exponents h - 1 and -h - 1, each bounded below by get_emin_normal()
    k_plus = Max(
        Subtraction(k_hi, Constant(1, precision = int_precision), precision=int_precision, tag="k_plus", debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision = int_precision))
    k_neg = Max(
        Subtraction(-k_hi, Constant(1, precision=int_precision), precision=int_precision, tag="k_neg", debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision = int_precision))

    # 2^(h-1)
    pow_exp_pos = ExponentInsertion(k_plus, precision = self.precision, tag="pow_exp_pos", debug=debug_multi)
    # 2^(-h-1)
    pow_exp_neg = ExponentInsertion(k_neg, precision = self.precision, tag="pow_exp_neg", debug=debug_multi)

    # leading terms: table high parts scaled by the powers of two
    hi_terms = (pos_value_load_hi * pow_exp_pos - neg_value_load_hi * pow_exp_neg)
    hi_terms.set_attributes(tag = "hi_terms", debug=debug_multi)

    # remaining terms: polynomial contributions and table low parts
    pos_exp = (pos_value_load_hi * poly_pos + (pos_value_load_lo + pos_value_load_lo * poly_pos)) * pow_exp_pos
    pos_exp.set_attributes(tag = "pos_exp", debug = debug_multi)

    neg_exp = (neg_value_load_hi * poly_neg + (neg_value_load_lo + neg_value_load_lo * poly_neg)) * pow_exp_neg
    neg_exp.set_attributes(tag = "neg_exp", debug = debug_multi)

    result = Addition(
        Subtraction(
            pos_exp,
            neg_exp,
            precision=self.precision,
        ),
        hi_terms,
        precision=self.precision,
        tag="result",
        debug=debug_multi
    )

    # ov_value: asinh of the format's max value, rounded down; inputs with
    # a larger magnitude return a signed infinity
    ov_value = round(asinh(self.precision.get_max_value()), self.precision.get_sollya_object(), sollya.RD)
    ov_flag = Comparison(Abs(vx), Constant(ov_value, precision = self.precision), specifier = Comparison.Greater)

    # main scheme
    scheme = Statement(
        Return(
            Select(
                ov_flag,
                sign*FP_PlusInfty(self.precision),
                sign*result
            )))

    return scheme
def generate_scheme(self):
    """Build and return the operation graph of the sine / cosine function.

    ``self.sin_output`` selects sine (an offset of ``3 * 2^(m-1)`` is added
    to the reduction index) or cosine (no offset), with
    ``m = frac_pi_index`` (10 for ``ML_Binary32``, 14 for ``ML_Binary64``).

    Structure of the returned scheme:

    * NaN or infinite input: exceptions are cleared and a quiet NaN is
      returned;
    * ``vx >= ph_bound`` (``2**10`` for ``ML_Binary32``, ``2**33``
      otherwise): the reduced argument and index come from
      ``generate_payne_hanek``;
    * otherwise: ``k = NearestInteger(vx * 2^m / pi)`` and a two-constant
      reduction ``vx - k * (inv_frac_pi + inv_frac_pi_lo)``;
    * in both non-special cases the local variables ``modk`` and ``red_vx``
      are assigned, then the result is assembled from a table of
      ``cos(i * pi / 2^m)`` and two polynomials evaluated at ``red_vx``.

    :return: the top-level ``ConditionBlock`` node
    :raises KeyError: if ``self.precision`` is neither ``ML_Binary32`` nor
        ``ML_Binary64`` (dictionary lookups below)
    """
    # declaring CodeFunction and retrieving input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.report(Log.Info, "generating implementation scheme")
    if self.debug_flag:
        Log.report(Log.Info, "debug has been enabled")

    # local overloading of RaiseReturn operation
    # NOTE(review): this helper is not called in the rest of the function.
    def SincosRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    sollya_precision = self.precision.get_sollya_object()
    hi_precision = self.precision.get_field_size() - 8
    # precision (in bits) of the high parts of the reduction constants
    cw_hi_precision = self.precision.get_field_size() - 4

    ext_precision = {
        ML_Binary32: ML_Binary64,
        ML_Binary64: ML_Binary64
    }[self.precision]

    int_precision = {
        ML_Binary32: ML_Int32,
        ML_Binary64: ML_Int64
    }[self.precision]

    # threshold from which the Payne-Hanek reduction is used
    if self.precision is ML_Binary32:
        ph_bound = S2**10
    else:
        ph_bound = S2**33

    # NOTE(review): the comparison is on vx itself, not on its absolute
    # value -- confirm that negative inputs of large magnitude are meant to
    # take the standard reduction path.
    test_ph_bound = Comparison(vx, ph_bound, specifier=Comparison.GreaterOrEqual, precision=ML_Bool, likely=False)

    # argument reduction
    # m
    frac_pi_index = {ML_Binary32: 10, ML_Binary64: 14}[self.precision]

    C0 = Constant(0, precision=int_precision)
    C1 = Constant(1, precision=int_precision)
    C_offset = Constant(3 * S2**(frac_pi_index - 1), precision=int_precision)

    # 2^m / pi
    frac_pi = round(S2**frac_pi_index / pi, cw_hi_precision, sollya.RN)
    frac_pi_lo = round(S2**frac_pi_index / pi - frac_pi, sollya_precision, sollya.RN)
    # pi / 2^m, high part
    inv_frac_pi = round(pi / S2**frac_pi_index, cw_hi_precision, sollya.RN)
    # pi / 2^m, low part
    inv_frac_pi_lo = round(pi / S2**frac_pi_index - inv_frac_pi, sollya_precision, sollya.RN)

    # computing k
    vx.set_attributes(tag="vx", debug=debug_multi)

    # vx * (2^m / pi), using the high and low parts of the constant
    vx_pi = Addition(
        Multiplication(vx, Constant(frac_pi, precision=self.precision), precision=self.precision),
        Multiplication(vx, Constant(frac_pi_lo, precision=self.precision), precision=self.precision),
        precision=self.precision, tag="vx_pi", debug=debug_multi)

    k = NearestInteger(vx_pi, precision=int_precision, tag="k", debug=debug_multi)
    # k in floating-point precision
    fk = Conversion(k, precision=self.precision, tag="fk", debug=debug_multi)

    inv_frac_pi_cst = Constant(inv_frac_pi, tag="inv_frac_pi", precision=self.precision, debug=debug_multi)
    inv_frac_pi_lo_cst = Constant(inv_frac_pi_lo, tag="inv_frac_pi_lo", precision=self.precision, debug=debug_multi)

    # Cody-Waite reduction
    red_coeff1 = Multiplication(fk, inv_frac_pi_cst, precision=self.precision, exact=True)
    red_coeff2 = Multiplication(Negation(fk, precision=self.precision), inv_frac_pi_lo_cst, precision=self.precision, exact=True)

    # Should be exact / Sterbenz' Lemma
    pre_sub_mul = Subtraction(vx, red_coeff1, precision=self.precision, exact=True)

    # Fast2Sum
    s = Addition(pre_sub_mul, red_coeff2, precision=self.precision, unbreakable=True, tag="s", debug=debug_multi)
    z = Subtraction(s, pre_sub_mul, precision=self.precision, unbreakable=True, tag="z", debug=debug_multi)
    t = Subtraction(red_coeff2, z, precision=self.precision, unbreakable=True, tag="t", debug=debug_multi)
    red_vx_std = Addition(s, t, precision=self.precision)
    red_vx_std.set_attributes(tag="red_vx_std", debug=debug_multi)

    # To compute sine we offset x by 3pi/2
    # which means add 3 * S2^(frac_pi_index-1) to k
    if self.sin_output:
        Log.report(Log.Info, "Computing Sin")
        offset_k = Addition(k, C_offset, precision=int_precision, tag="offset_k")
    else:
        Log.report(Log.Info, "Computing Cos")
        offset_k = k

    # local variables assigned by whichever reduction path is taken
    modk = Variable("modk", precision=int_precision, var_type=Variable.Local)
    red_vx = Variable("red_vx", precision=self.precision, var_type=Variable.Local)

    # Faster modulo using bitwise logic
    modk_std = BitLogicAnd(offset_k, 2**(frac_pi_index + 1) - 1, precision=int_precision, tag="modk", debug=debug_multi)

    approx_interval = Interval(-pi / (S2**(frac_pi_index + 1)), pi / S2**(frac_pi_index + 1))

    red_vx.set_interval(approx_interval)

    Log.report(Log.Info, "approx interval: %s\n" % approx_interval)

    Log.report(Log.Info, "building tabulated approximation for sin and cos")

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    # polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme
    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme

    # single-column table: row i holds cos(i * pi / 2^m) rounded to
    # self.precision
    table_index_size = frac_pi_index + 1
    cos_table = ML_NewTable(dimensions=[2**table_index_size, 1], storage_precision=self.precision, tag=self.uniquify_name("cos_table"))

    for i in range(2**(frac_pi_index + 1)):
        local_x = i * pi / S2**frac_pi_index
        cos_local = round(cos(local_x), self.precision.get_sollya_object(), sollya.RN)
        cos_table[i][0] = cos_local

    # the sine value is read from the cosine table: index shifted by
    # 2^(m-1), loaded value negated
    sin_index = Modulo(modk + 2**(frac_pi_index - 1), 2**(frac_pi_index + 1), precision=int_precision, tag="sin_index")  #, debug = debug_multi)
    tabulated_cos = TableLoad(cos_table, modk, C0, precision=self.precision, tag="tab_cos", debug=debug_multi)
    tabulated_sin = -TableLoad(cos_table, sin_index, C0, precision=self.precision, tag="tab_sin", debug=debug_multi)

    poly_degree_cos = sup(
        guessdegree(cos(sollya.x), approx_interval, S2** -self.precision.get_precision()) + 2)
    poly_degree_sin = sup(
        guessdegree(
            sin(sollya.x) / sollya.x, approx_interval,
            S2** -self.precision.get_precision()) + 2)

    poly_degree_cos_list = range(0, int(poly_degree_cos) + 3)
    poly_degree_sin_list = range(0, int(poly_degree_sin) + 3)

    # NOTE(review): both coefficient lists below use self.precision for
    # every coefficient; no coefficient is restricted to 1 bit here.
    # cosine polynomial coefficient formats
    poly_cos_prec_list = [self.precision] * len(poly_degree_cos_list)
    # sine polynomial coefficient formats
    poly_sin_prec_list = [self.precision] * len(poly_degree_sin_list)

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)
    Log.report(Log.Info, "building mathematical polynomials for sin and cos")
    # Polynomial approximations
    Log.report(Log.Info, "cos")
    poly_object_cos, poly_error_cos = Polynomial.build_from_approximation_with_error(
        cos(sollya.x), poly_degree_cos_list, poly_cos_prec_list,
        approx_interval, sollya.absolute, error_function=error_function)
    Log.report(Log.Info, "sin")
    poly_object_sin, poly_error_sin = Polynomial.build_from_approximation_with_error(
        sin(sollya.x), poly_degree_sin_list, poly_sin_prec_list,
        approx_interval, sollya.absolute, error_function=error_function)

    Log.report(
        Log.Info, "poly error cos: {} / {:d}".format(
            poly_error_cos, int(sollya.log2(poly_error_cos))))
    Log.report(
        Log.Info, "poly error sin: {0} / {1:d}".format(
            poly_error_sin, int(sollya.log2(poly_error_sin))))
    Log.report(Log.Info, "poly cos : %s" % poly_object_cos)
    Log.report(Log.Info, "poly sin : %s" % poly_object_sin)

    # Polynomial evaluation scheme
    # the cosine polynomial drops its constant term, the sine polynomial
    # drops its constant and linear terms; the corresponding leading
    # contributions are added back in cos_eval_d_1 below
    poly_cos = polynomial_scheme_builder(
        poly_object_cos.sub_poly(start_index=1), red_vx,
        unified_precision=self.precision)
    poly_sin = polynomial_scheme_builder(
        poly_object_sin.sub_poly(start_index=2), red_vx,
        unified_precision=self.precision)
    poly_cos.set_attributes(tag="poly_cos", debug=debug_multi)
    poly_sin.set_attributes(tag="poly_sin", debug=debug_multi, unbreakable=True)

    # TwoProductFMA
    mul_cos_x = tabulated_cos * poly_cos
    mul_cos_y = FusedMultiplyAdd(tabulated_cos, poly_cos, -mul_cos_x, precision=self.precision)

    mul_sin_x = tabulated_sin * poly_sin
    mul_sin_y = FusedMultiplyAdd(tabulated_sin, poly_sin, -mul_sin_x, precision=self.precision)

    mul_coeff_sin_hi = tabulated_sin * red_vx
    mul_coeff_sin_lo = FusedMultiplyAdd(tabulated_sin, red_vx, -mul_coeff_sin_hi)

    mul_cos = Addition(mul_cos_x, mul_cos_y, precision=self.precision, tag="mul_cos")  #, debug = debug_multi)
    mul_sin = Negation(Addition(mul_sin_x, mul_sin_y, precision=self.precision), precision=self.precision, tag="mul_sin")  #, debug = debug_multi)
    mul_coeff_sin = Negation(Addition(mul_coeff_sin_hi, mul_coeff_sin_lo, precision=self.precision), precision=self.precision, tag="mul_coeff_sin")  #, debug = debug_multi)

    mul_cos_x.set_attributes(
        tag="mul_cos_x", precision=self.precision)  #, debug = debug_multi)
    mul_cos_y.set_attributes(
        tag="mul_cos_y", precision=self.precision)  #, debug = debug_multi)
    mul_sin_x.set_attributes(
        tag="mul_sin_x", precision=self.precision)  #, debug = debug_multi)
    mul_sin_y.set_attributes(
        tag="mul_sin_y", precision=self.precision)  #, debug = debug_multi)

    # final sum: the tabulated cosine is added last
    cos_eval_d_1 = (((mul_cos + mul_sin) + mul_coeff_sin) + tabulated_cos)
    cos_eval_d_1.set_attributes(tag="cos_eval_d_1", precision=self.precision, debug=debug_multi)

    result_1 = Statement(Return(cos_eval_d_1))

    #######################################################################
    #                    LARGE ARGUMENT MANAGEMENT                        #
    #                 (lar: Large Argument Reduction)                     #
    #######################################################################
    # payne and hanek argument reduction for large arguments
    ph_k = frac_pi_index
    ph_frac_pi = round(S2**ph_k / pi, 1500, sollya.RN)
    # NOTE(review): ph_inv_frac_pi is not referenced afterwards.
    ph_inv_frac_pi = pi / S2**ph_k

    ph_statement, ph_acc, ph_acc_int = generate_payne_hanek(vx, ph_frac_pi, self.precision, n=100, k=ph_k)

    # assigning Large Argument Reduction reduced variable
    lar_vx = Variable("lar_vx", precision=self.precision, var_type=Variable.Local)

    # lar_vx * (pi / 2^m), using the high and low parts of the constant
    lar_red_vx = Addition(
        Multiplication(lar_vx, inv_frac_pi, precision=self.precision),
        Multiplication(lar_vx, inv_frac_pi_lo, precision=self.precision),
        precision=self.precision, tag="lar_red_vx", debug=debug_multi)

    # a negative integer part is shifted by 2^(m+1) before masking
    C32 = Constant(2**(ph_k + 1), precision=int_precision, tag="C32")
    ph_acc_int_red = Select(ph_acc_int < C0, C32 + ph_acc_int, ph_acc_int, precision=int_precision, tag="ph_acc_int_red")
    if self.sin_output:
        lar_offset_k = Addition(ph_acc_int_red, C_offset, precision=int_precision, tag="lar_offset_k")
    else:
        lar_offset_k = ph_acc_int_red

    ph_acc_int_red.set_attributes(tag="ph_acc_int_red", debug=debug_multi)
    lar_modk = BitLogicAnd(lar_offset_k, 2**(frac_pi_index + 1) - 1, precision=int_precision, tag="lar_modk", debug=debug_multi)

    # large-argument path: run the Payne-Hanek statement, then assign the
    # shared local variables red_vx and modk
    lar_statement = Statement(
        ph_statement,
        ReferenceAssign(lar_vx, ph_acc, debug=debug_multi),
        ReferenceAssign(red_vx, lar_red_vx, debug=debug_multi),
        ReferenceAssign(modk, lar_modk),
        prevent_optimization=True)

    test_NaN_or_Inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, tag="NaN_or_Inf", debug=debug_multi)
    return_NaN_or_Inf = Statement(Return(FP_QNaN(self.precision)))

    scheme = ConditionBlock(
        test_NaN_or_Inf,
        Statement(ClearException(), return_NaN_or_Inf),
        Statement(
            modk,
            red_vx,
            ConditionBlock(
                test_ph_bound,
                lar_statement,
                Statement(
                    ReferenceAssign(modk, modk_std),
                    ReferenceAssign(red_vx, red_vx_std),
                )),
            result_1))

    return scheme
def div_numeric_emulate(vx):
    """Return ``1.0 / vx`` rounded to the current target format.

    The rounding uses ``self.precision``'s sollya object and ``rnd_mode``;
    neither is a parameter, so both must come from the enclosing scope
    (presumably this is a closure defined inside a method -- verify against
    the caller).

    :param vx: value whose reciprocal is emulated
    :return: result of ``sollya.round`` on the reciprocal
    """
    target_format = self.precision.get_sollya_object()
    reciprocal = 1.0 / vx
    return sollya.round(reciprocal, target_format, rnd_mode)
def __init__(self,
             precision=ML_Binary32,
             abs_accuracy=S2**-24,
             libm_compliant=True,
             debug_flag=False,
             fuse_fma=True,
             fast_path_extract=True,
             target=GenericProcessor(),
             output_file="log1pf.c",
             function_name="log1pf"):
    """Build the log1p evaluation scheme and write the generated C code.

    The constructor builds the operation graph, runs the optimization
    passes, stores the generated definition in ``self.result`` and writes
    it to ``"<name>.c"`` where ``<name>`` is ``func_implementation.get_name()``.

    :param precision: floating-point format of the input and the result
    :param abs_accuracy: not read by this body
    :param libm_compliant: forwarded to ``CCodeGenerator``
    :param debug_flag: when false, debug output is disabled in the generator
    :param fuse_fma: not read by this body (FMA fusing is always run)
    :param fast_path_extract: not read by this body
    :param target: processor used for table lookup, optimization and codegen
    :param output_file: not read by this body (the file name is derived
        from the function name instead)
    :param function_name: name of the generated function
    """
    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    self.precision = precision
    self.processor = target
    func_implementation = CodeFunction(self.function_name,
                                       output_format=self.precision)
    vx = func_implementation.add_input_variable("x", self.precision)

    sollya_precision = self.precision.sollya_object

    # debug utilities
    debugf = ML_Debug(display_format="%f")
    debuglf = ML_Debug(display_format="%lf")
    debugx = ML_Debug(display_format="%x")
    debuglx = ML_Debug(display_format="%\"PRIx64\"", )
    debugd = ML_Debug(display_format="%d",
                      pre_process=lambda v: "(int) %s" % v)
    debugld = ML_Debug(display_format="%ld")
    #debug_lftolx = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v)
    debug_lftolx = ML_Debug(
        display_format="%\"PRIx64\" ev=%x",
        pre_process=lambda v: "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v)
    debug_ddtolx = ML_Debug(
        display_format="%\"PRIx64\" %\"PRIx64\"",
        pre_process=lambda v: "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" % (v, v))
    debug_dd = ML_Debug(display_format="{.hi=%lf, .lo=%lf}",
                        pre_process=lambda v: "%s.hi, %s.lo" % (v, v))

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # log(2) split into a short high part and a low correction term
    log2_hi_value = round(
        log(2),
        self.precision.get_field_size() -
        (self.precision.get_exponent_size() + 1), sollya.RN)
    log2_lo_value = round(
        log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    int_precision = ML_Int64 if self.precision is ML_Binary64 else ML_Int32

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed,
        language=None,
        table_getter=lambda self: self.approx_table_map)

    # table creation: entry i holds (hi, lo) parts of log(inv_value(i))
    table_index_size = 7
    log_table = ML_Table(dimensions=[2**table_index_size, 2],
                         storage_precision=self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    # range() instead of xrange(): same iteration, valid on Python 2 and 3
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = (1.0 + (inv_approx_table[i][0] / S2**9)) * S2**-1
        value_high = round(
            log(inv_value),
            self.precision.get_field_size() -
            (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(
            log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    # re-created here; this is the node actually used by ctz_cond below
    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    # case close to 0: ctz
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

    ctz_poly_degree = sup(
        guessdegree(
            log1p(sollya.x) / sollya.x, ctz_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(
        log1p(sollya.x) / sollya.x, ctz_poly_degree,
        [self.precision] * (ctz_poly_degree + 1), ctz_interval,
        sollya.absolute)

    # print(...) with a single argument prints the same text on Python 2 and 3
    print("generating polynomial evaluation scheme")
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        ctz_poly_object, vx, unified_precision=self.precision)
    ctz_poly.set_attributes(tag="ctz_poly", debug=debug_lftolx)

    ctz_result = vx * ctz_poly

    neg_input = Comparison(vx, -1, likely=False,
                           specifier=Comparison.Less,
                           debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debugd, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debugd, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debugd, tag="vx_subnormal")

    log_function_code = CodeFunction(
        "new_log", [Variable("x", precision=ML_Binary64)],
        output_format=ML_Binary64)
    log_call_generator = FunctionOperator(
        log_function_code.get_name(),
        arity=1,
        output_precision=ML_Binary64,
        declare_prototype=log_function_code)
    newlog_function = FunctionObject(log_function_code.get_name(),
                                     (ML_Binary64, ), ML_Binary64,
                                     log_call_generator)

    # case away from 0.0
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag="pre_vxp1", debug=debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag="pre_vxp1_exp",
                                      debug=debugd)
    cm500 = Constant(-500, precision=ML_Int32)
    c0 = Constant(0, precision=ML_Int32)
    # when the exponent of (x + 1) is large, scale it down by 2**-500
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size() - 2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(scaling_factor_exp,
                                       precision=self.precision,
                                       tag="scaling_factor")

    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag="vxp1", debug=debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag="vxp1_exp", debug=debugd)

    vxp1_inv = DivisionSeed(vxp1, precision=self.precision,
                            tag="vxp1_inv", debug=debug_lftolx,
                            silent=True)

    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp,
                                       precision=self.precision,
                                       tag="vxp1_dirty_inv",
                                       debug=debug_lftolx)

    table_index = BitLogicAnd(BitLogicRightShift(
        TypeCast(vxp1, precision=int_precision, debug=debuglx),
        self.precision.get_field_size() - 7,
        debug=debuglx),
                              0x7f,
                              tag="table_index",
                              debug=debuglx)

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    # NOTE(review): the cast below is hard-wired to ML_UInt64 even though the
    # default precision is ML_Binary32 -- confirm this is intended.
    pre_arg_red_index = TypeCast(BitLogicAnd(
        TypeCast(vxp1_inv, precision=ML_UInt64),
        Constant(-2, precision=ML_UInt64),
        precision=ML_UInt64),
                                 precision=self.precision,
                                 tag="pre_arg_red_index",
                                 debug=debug_lftolx)
    arg_red_index = Select(Equal(table_index, 0),
                           vxp1_dirty_inv,
                           pre_arg_red_index,
                           tag="arg_red_index",
                           debug=debug_lftolx)

    red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0,
                      (arg_red_index * vx - 1.0) + arg_red_index)
    #red_vxp1 = arg_red_index * vxp1 - 1.0
    red_vxp1.set_attributes(tag="red_vxp1", debug=debug_lftolx)

    log_inv_lo = TableLoad(log_table, table_index, 1,
                           tag="log_inv_lo", debug=debug_lftolx)
    log_inv_hi = TableLoad(log_table, table_index, 0,
                           tag="log_inv_hi", debug=debug_lftolx)

    inv_err = S2**-6  # TODO: link to target DivisionSeed precision

    print("building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(
        guessdegree(
            log(1 + sollya.x) / sollya.x, approx_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    global_poly_object = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree,
        [self.precision] * (poly_degree + 1), approx_interval,
        sollya.absolute)
    poly_object = global_poly_object.sub_poly(start_index=1)

    print("generating polynomial evaluation scheme")
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, red_vxp1, unified_precision=self.precision)
    _poly.set_attributes(tag="poly", debug=debug_lftolx)
    print(global_poly_object.get_sollya_object())

    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag="vxp1_inv_exp",
                                      debug=debugd)
    corr_exp = -vxp1_exp + scaling_factor_exp  # vxp1_inv_exp

    #poly = (red_vxp1) * (1 + _poly)
    #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly +
                                (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
    exact_log2_hi_exp = -corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp",
                                     debug=debug_lftolx,
                                     prevent_optimization=True)
    #std_result = exact_log2_hi_exp + pre_result

    exact_log2_lo_exp = -corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(
        tag="exact_log2_lo_exp",
        debug=debug_lftolx)  #, prevent_optimization = True)

    # the result is accumulated step by step, smallest terms first
    init = exact_log2_lo_exp - log_inv_lo
    init.set_attributes(tag="init", debug=debug_lftolx,
                        prevent_optimization=True)
    fma0 = (red_vxp1 * _poly + init)  # - log_inv_lo)
    fma0.set_attributes(tag="fma0", debug=debug_lftolx)
    step0 = fma0
    step0.set_attributes(
        tag="step0", debug=debug_lftolx)  #, prevent_optimization = True)
    step1 = step0 + red_vxp1
    step1.set_attributes(tag="step1", debug=debug_lftolx,
                         prevent_optimization=True)
    step2 = -log_inv_hi + step1
    step2.set_attributes(tag="step2", debug=debug_lftolx,
                         prevent_optimization=True)
    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag="std_result", debug=debug_lftolx,
                              prevent_optimization=True)

    # main scheme
    print("MDL scheme")
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal, Return(vx),
                ConditionBlock(ctz_cond,
                               Statement(Return(ctz_result), ),
                               Statement(Return(std_result))))))
    scheme = pre_scheme

    #print scheme.get_str(depth = None, display_precision = True)

    opt_eng = OptimizationEngine(self.processor)

    # fusing FMA
    print("MDL fusing FMA")
    scheme = opt_eng.fuse_multiply_add(scheme, silence=True)

    print("MDL abstract scheme")
    opt_eng.instantiate_abstract_precision(scheme, None)

    #print scheme.get_str(depth = None, display_precision = True)

    print("MDL instantiated scheme")
    # NOTE(review): default precision is hard-wired to ML_Binary32 rather
    # than self.precision -- confirm this is intended.
    opt_eng.instantiate_precision(scheme, default_precision=ML_Binary32)

    print("subexpression sharing")
    opt_eng.subexpression_sharing(scheme)

    print("silencing operation")
    opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    func_implementation.set_scheme(scheme)

    # check processor support
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    opt_eng.factorize_fast_path(scheme)
    #print scheme.get_str(depth = None, display_precision = True)

    cg = CCodeGenerator(self.processor,
                        declare_cst=False,
                        disable_debug=not debug_flag,
                        libm_compliant=libm_compliant)
    self.result = func_implementation.get_definition(cg, C_Code,
                                                     static_cst=True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")
    #print self.result.get(cg)
    # context manager: the file is closed even if code generation raises
    with open("%s.c" % func_implementation.get_name(), "w") as output_stream:
        output_stream.write(self.result.get(cg))
def generate_scheme(self):
    """Build and return the operation scheme of the logarithm implementation.

    The returned ``ConditionBlock`` dispatches, in this order:

    * ``x < 0``: raise invalid, return a quiet NaN;
    * infinity: return +infinity; NaN: return a quiet NaN (invalid is
      raised for a signaling NaN);
    * subnormal: zero raises divide-by-zero and returns -infinity, other
      subnormals are evaluated on ``x * 2**100`` with an exponent
      correction of ``-100``;
    * ``x == 1.0``: return +0;
    * exponent equal to ``-1``: return ``result2``;
    * otherwise: return the table-plus-polynomial ``result``.

    :return: the root node of the scheme
    """
    vx = self.implementation.add_input_variable("x", self.precision)

    sollya_precision = self.precision.sollya_object

    # constant computation
    invlog2 = round(1 / log(2), sollya_precision, sollya.RN)
    invlog2_cst = Constant(invlog2, precision=self.precision)

    #v_log2_hi = round(log(2), 16, sollya.RN)
    #v_log2_lo = round(log(2) - v_log2_hi, sollya_precision, sollya.RN)

    #log2_hi = Constant(v_log2_hi, precision = self.precision, tag = "log2_hi")
    #log2_lo = Constant(v_log2_lo, precision = self.precision, tag = "log2_lo")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=True, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=True,
                    tag="is_nan_test")
    test_positive = Comparison(vx, 0,
                               specifier=Comparison.GreaterOrEqual,
                               debug=True, tag="inf_sign")

    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN,
                              debug=True, tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid,
                       return_value=FP_QNaN(self.precision)))

    # log(2) split into a short high part and a low correction term
    v_log2_hi = round(
        log(2),
        self.precision.get_field_size() -
        (self.precision.get_exponent_size() + 1), sollya.RN)
    v_log2_lo = round(
        log(2) - v_log2_hi, self.precision.sollya_object, sollya.RN)
    log2_hi = Constant(v_log2_hi, precision=self.precision, tag="log2_hi")
    log2_lo = Constant(v_log2_lo, precision=self.precision, tag="log2_lo")

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debug_multi)

    int_precision = self.precision.get_integer_format()

    # table creation
    table_index_size = 7
    log_table = ML_NewTable(dimensions=[2**table_index_size, 2],
                            storage_precision=self.precision,
                            tag=self.uniquify_name("inv_table"))
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed,
        language=None,
        table_getter=lambda self: self.approx_table_map)

    integer_precision = {
        ML_Binary32: ML_UInt32,
        ML_Binary64: ML_UInt64
    }[self.precision]

    # entry i holds the (hi, lo) parts of log(inv_approx_table[i])
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = inv_approx_table[
            i]  # (1.0 + (inv_approx_table[i][0] / S2**9) ) * S2**-1
        value_high = round(
            log(inv_value),
            self.precision.get_field_size() -
            (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(
            log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    def compute_log(_vx, exp_corr_factor=None):
        """Build the log evaluation graph for ``_vx``.

        ``exp_corr_factor``, when given, is added to the extracted
        exponent before conversion. Returns the tuple
        ``(result, poly, log_inv_lo, log_inv_hi, red_vx, result_one)``.
        """
        _vx_mant = MantissaExtraction(_vx, tag="_vx_mant",
                                      debug=debug_multi,
                                      precision=self.precision)
        _vx_exp = ExponentExtraction(_vx, tag="_vx_exp",
                                     debug=debug_multi)

        table_index = BitLogicAnd(BitLogicRightShift(
            TypeCast(_vx_mant, precision=int_precision,
                     debug=debug_multi),
            self.precision.get_field_size() - 7,
            debug=debug_multi),
                                  0x7f,
                                  tag="table_index",
                                  debug=debug_multi)

        # argument reduction
        # TODO: detect if single operand inverse seed is supported by the targeted architecture
        pre_arg_red_index = TypeCast(BitLogicAnd(
            TypeCast(ReciprocalSeed(_vx_mant,
                                    precision=self.precision,
                                    tag="seed",
                                    debug=debug_multi,
                                    silent=True),
                     precision=integer_precision),
            Constant(-2, precision=integer_precision),
            precision=integer_precision),
                                     precision=self.precision,
                                     tag="pre_arg_red_index",
                                     debug=debug_multi)
        arg_red_index = Select(Equal(table_index, 0), 1.0,
                               pre_arg_red_index)
        #_red_vx = arg_red_index * _vx_mant - 1.0
        _red_vx = FusedMultiplyAdd(arg_red_index, _vx_mant, 1.0,
                                   specifier=FusedMultiplyAdd.Subtract)
        _red_vx.set_attributes(tag="_red_vx", debug=debug_multi)

        inv_err = S2**-7
        red_interval = Interval(1 - inv_err, 1 + inv_err)

        # return in case of standard (non-special) input
        _log_inv_lo = TableLoad(log_table, table_index, 1,
                                tag="log_inv_lo", debug=debug_multi)
        _log_inv_hi = TableLoad(log_table, table_index, 0,
                                tag="log_inv_hi", debug=debug_multi)

        Log.report(Log.Verbose, "building mathematical polynomial")
        approx_interval = Interval(-inv_err, inv_err)
        poly_degree = sup(
            guessdegree(
                log(1 + sollya.x) / sollya.x, approx_interval,
                S2**-(self.precision.get_field_size() + 1))) + 1
        global_poly_object = Polynomial.build_from_approximation(
            log(1 + sollya.x) / sollya.x, poly_degree,
            [1] + [self.precision] * (poly_degree), approx_interval,
            sollya.absolute)
        poly_object = global_poly_object.sub_poly(start_index=1)

        Log.report(Log.Verbose, "generating polynomial evaluation scheme")
        #_poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, _red_vx, unified_precision = self.precision)
        _poly = PolynomialSchemeEvaluator.generate_estrin_scheme(
            poly_object, _red_vx, unified_precision=self.precision)

        _poly.set_attributes(tag="poly", debug=debug_multi)

        # identity test: None is the "no correction" sentinel
        corr_exp = Conversion(
            _vx_exp if exp_corr_factor is None else _vx_exp +
            exp_corr_factor,
            precision=self.precision)
        split_red_vx = Split(_red_vx, precision=ML_DoubleDouble,
                             tag="split_red_vx", debug=debug_multi)
        red_vx_hi = split_red_vx.hi
        red_vx_lo = split_red_vx.lo

        # result = _red_vx * poly - log_inv_hi - log_inv_lo + _vx_exp * log2_hi + _vx_exp * log2_lo
        pre_result = -_log_inv_hi + (_red_vx +
                                     (_red_vx * _poly +
                                      (corr_exp * log2_lo - _log_inv_lo)))
        pre_result.set_attributes(tag="pre_result", debug=debug_multi)
        exact_log2_hi_exp = corr_exp * log2_hi
        exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp",
                                         debug=debug_multi)

        cancel_part = (corr_exp * log2_hi - _log_inv_hi)
        cancel_part.set_attributes(tag="cancel_part", debug=debug_multi)
        sub_part = red_vx_hi + cancel_part
        sub_part.set_attributes(tag="sub_part", debug=debug_multi)
        #result_one_low_part = (red_vx_hi * _poly + (red_vx_lo + (red_vx_lo * _poly + (corr_exp * log2_lo - _log_inv_lo))))
        result_one_low_part = ((red_vx_lo +
                                (red_vx_lo * _poly +
                                 (corr_exp * log2_lo - _log_inv_lo))))
        result_one_low_part.set_attributes(tag="result_one_low_part",
                                           debug=debug_multi)
        _result_one = (
            (sub_part) + red_vx_hi * _poly) + result_one_low_part
        return exact_log2_hi_exp + pre_result, _poly, _log_inv_lo, _log_inv_hi, _red_vx, _result_one

    result, poly, log_inv_lo, log_inv_hi, red_vx, new_result_one = compute_log(
        vx)
    result.set_attributes(tag="result", debug=debug_multi)
    new_result_one.set_attributes(tag="new_result_one", debug=debug_multi)

    neg_input = Comparison(vx, 0, likely=False,
                           specifier=Comparison.Less,
                           debug=debug_multi, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debug_multi, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debug_multi, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debug_multi, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debug_multi, tag="vx_subnormal")
    vx_zero = Test(vx, specifier=Test.IsZero, likely=False,
                   debug=debug_multi, tag="vx_zero")

    exp_mone = Equal(vx_exp, -1, tag="exp_minus_one",
                     debug=debug_multi, likely=False)
    vx_one = Equal(vx, 1.0, tag="vx_one", likely=False,
                   debug=debug_multi)

    # exp=-1 case
    Log.report(Log.Verbose, "managing exp=-1 case")

    result2 = (-log_inv_hi - log2_hi) + (
        (red_vx + poly * red_vx) - log2_lo - log_inv_lo)
    result2.set_attributes(tag="result2", debug=debug_multi)

    # subnormal inputs are scaled by 2**100 and the exponent corrected by -100
    m100 = -100
    S2100 = Constant(S2**100, precision=self.precision)
    result_subnormal, _, _, _, _, _ = compute_log(vx * S2100,
                                                  exp_corr_factor=m100)

    Log.report(Log.Verbose, "managing close to 1.0 cases")
    one_err = S2**-7
    approx_interval_one = Interval(-one_err, one_err)
    red_vx_one = vx - 1.0
    poly_degree_one = sup(
        guessdegree(
            log(1 + sollya.x) / sollya.x, approx_interval_one,
            S2**-(self.precision.get_field_size() + 1))) + 1
    poly_object_one = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree_one,
        [self.precision] * (poly_degree_one + 1), approx_interval_one,
        sollya.absolute).sub_poly(start_index=1)
    poly_one = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object_one, red_vx_one, unified_precision=self.precision)
    poly_one.set_attributes(tag="poly_one", debug=debug_multi)
    result_one = red_vx_one + red_vx_one * poly_one
    cond_one = (vx < (1 + one_err)) & (vx > (1 - one_err))
    cond_one.set_attributes(tag="cond_one", debug=debug_multi,
                            likely=False)

    # main scheme
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                ConditionBlock(
                    vx_zero,
                    Statement(
                        ClearException(),
                        Raise(ML_FPE_DivideByZero),
                        Return(FP_MinusInfty(self.precision)),
                    ), Return(result_subnormal)),
                ConditionBlock(
                    vx_one,
                    Statement(
                        ClearException(),
                        Return(FP_PlusZero(self.precision)),
                    ),
                    ConditionBlock(exp_mone, Return(result2),
                                   Return(result))
                    #ConditionBlock(cond_one,
                    #Return(new_result_one),
                    #ConditionBlock(exp_mone,
                    #Return(result2),
                    #Return(result)
                    #)
                    #)
                ))))
    scheme = pre_scheme
    return scheme
def generate_scheme(self):
    """Build and return the operation scheme of the exponential implementation.

    The scheme handles NaN/infinity inputs first, then early overflow and
    underflow (input outside the bounds derived from the format's emax /
    subnormal emin), and otherwise evaluates ``2**k * poly(r)`` with
    ``k = nearest_integer(x / log(2))`` and ``r = x - k * log(2)``
    (``log(2)`` being split into a high and a low part). Results whose
    ``k`` falls beyond emax or at/below the normal emin go through
    dedicated late overflow / underflow paths.

    The polynomial degree is searched iteratively: it is incremented until
    the combined approximation and evaluation error (the latter computed
    with gappa) is below the error goal derived from ``self.accuracy``.
    When gappa is not installed the search stops after the first attempt.

    :return: the root node of the scheme
    """
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info,
               "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        if self.libm_compliant:
            return RaiseReturn(*args, precision=self.precision, **kwords)
        else:
            # without libm compliance no exception is raised: plain return
            return Return(kwords["return_value"], precision=self.precision)

    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=debug_multi, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=debug_multi,
                    tag="is_nan_test")
    test_positive = Comparison(vx, 0,
                               specifier=Comparison.GreaterOrEqual,
                               debug=debug_multi, tag="inf_sign")

    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN,
                              debug=debug_multi, tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid,
                       return_value=FP_QNaN(self.precision)))

    # return in case of infinity input
    infty_return = Statement(
        ConditionBlock(
            test_positive,
            Return(FP_PlusInfty(self.precision), precision=self.precision),
            Return(FP_PlusZero(self.precision), precision=self.precision)))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(
            test_signaling_nan, return_snan,
            Return(FP_QNaN(self.precision), precision=self.precision)),
        infty_return)
    # return in case of standard (non-special) input

    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = sollya.ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx, exp_overflow_bound,
                                     likely=False,
                                     specifier=Comparison.Greater)
    early_overflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2**precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx, exp_underflow_bound,
                                      likely=False,
                                      specifier=Comparison.Less)
    early_underflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Underflow,
                       return_value=FP_PlusZero(self.precision)))

    # constant computation
    invlog2 = self.precision.round_sollya_object(1 / log(2), sollya.RN)

    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)),
                          sollya.ceil(sup(interval_fk)))

    # log2_hi is shortened by the bit width of k (plus 2 bits)
    log2_hi_precision = self.precision.get_field_size() - (
        sollya.ceil(log2(sup(abs(interval_k)))) + 2)
    Log.report(Log.Info, "log2_hi_precision: %d" % log2_hi_precision)
    invlog2_cst = Constant(invlog2, precision=self.precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = self.precision.round_sollya_object(
        log(2) - log2_hi, sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag="unround_k", debug=debug_multi)
    k = NearestInteger(unround_k, precision=self.precision,
                       debug=debug_multi)
    ik = NearestInteger(unround_k,
                        precision=self.precision.get_integer_format(),
                        debug=debug_multi, tag="ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact=True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact=True, tag="exact_hi",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    exact_lo_part = -k * log2_lo
    exact_lo_part.set_attributes(tag="exact_lo", debug=debug_multi,
                                 prevent_optimization=True)
    r = exact_hi_part + exact_lo_part
    r.set_tag("r")
    r.set_attributes(debug=debug_multi)

    approx_interval = Interval(-log(2) / 2, log(2) / 2)

    # three sub-intervals used for the dichotomy of the error evaluation
    approx_interval_half = approx_interval / 2
    approx_interval_split = [
        Interval(-log(2) / 2, inf(approx_interval_half)),
        approx_interval_half,
        Interval(sup(approx_interval_half), log(2) / 2)
    ]

    # TODO: should be computed automatically
    exact_hi_interval = approx_interval
    exact_lo_interval = -interval_k * log2_lo

    opt_r = self.optimise_scheme(r, copy={})

    tag_map = {}
    self.opt_engine.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx:
        Variable("x", precision=self.precision, interval=interval_vx),
        tag_map["k"]:
        Variable("k", interval=interval_k, precision=self.precision)
    }

    #try:
    if is_gappa_installed():
        eval_error = self.gappa_engine.get_eval_error_v2(
            self.opt_engine, opt_r, cg_eval_error_copy_map,
            gappa_filename="red_arg.g")
    else:
        eval_error = 0.0
        Log.report(Log.Warning,
                   "gappa is not installed in this environnement")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    local_ulp = sup(ulp(sollya.exp(approx_interval), self.precision))
    # FIXME refactor error_goal from accuracy
    Log.report(Log.Info, "accuracy: %s" % self.accuracy)
    if isinstance(self.accuracy, ML_Faithful):
        error_goal = local_ulp
    elif isinstance(self.accuracy, ML_CorrectlyRounded):
        error_goal = S2**-1 * local_ulp
    elif isinstance(self.accuracy, ML_DegradedAccuracyAbsolute):
        error_goal = self.accuracy.goal
    elif isinstance(self.accuracy, ML_DegradedAccuracyRelative):
        error_goal = self.accuracy.goal
    else:
        Log.report(Log.Error, "unknown accuracy: %s" % self.accuracy)

    # error_goal = local_ulp #S2**-(self.precision.get_field_size()+1)
    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info,
               "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = max(
        sup(
            guessdegree(
                expm1(sollya.x) / sollya.x, approx_interval,
                error_goal_approx)) - 1, 2)
    init_poly_degree = poly_degree

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme
    #polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme

    # degree search: loop until the error goal is met (or gappa is missing)
    while 1:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        precision_list = [1] + [self.precision] * (poly_degree)
        poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
            expm1(sollya.x), poly_degree, precision_list,
            approx_interval, sollya.absolute,
            error_function=error_function)
        Log.report(Log.Info, "polynomial: %s " % poly_object)
        sub_poly = poly_object.sub_poly(start_index=2)
        Log.report(Log.Info, "polynomial: %s " % sub_poly)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(
            Log.Info,
            "\033[33;1m generating polynomial evaluation scheme \033[0m")
        pre_poly = polynomial_scheme_builder(
            poly_object, r, unified_precision=self.precision)
        pre_poly.set_attributes(tag="pre_poly", debug=debug_multi)

        pre_sub_poly = polynomial_scheme_builder(
            sub_poly, r, unified_precision=self.precision)
        pre_sub_poly.set_attributes(tag="pre_sub_poly", debug=debug_multi)

        poly = 1 + (exact_hi_part + (exact_lo_part + pre_sub_poly))
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation
        #opt_poly = self.opt_engine.optimization_process(poly, self.precision, fuse_fma = fuse_fma)
        #opt_sub_poly = self.opt_engine.optimization_process(pre_sub_poly, self.precision, fuse_fma = fuse_fma)
        opt_poly = self.optimise_scheme(poly)
        opt_sub_poly = self.optimise_scheme(pre_sub_poly)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r", precision=self.precision,
                               interval=approx_interval)
        exact_hi_gappa_var = Variable("exact_hi",
                                      precision=self.precision,
                                      interval=exact_hi_interval)
        exact_lo_gappa_var = Variable("exact_lo",
                                      precision=self.precision,
                                      interval=exact_lo_interval)
        vx_gappa_var = Variable("x", precision=self.precision,
                                interval=interval_vx)
        k_gappa_var = Variable("k", interval=interval_k,
                               precision=self.precision)

        #print "exact_hi interval: ", exact_hi_interval

        sub_poly_error_copy_map = {
            #r.get_handle().get_node(): r_gappa_var,
            #vx.get_handle().get_node():  vx_gappa_var,
            exact_hi_part.get_handle().get_node():
            exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node():
            exact_lo_gappa_var,
            #k.get_handle().get_node(): k_gappa_var,
        }

        poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        if is_gappa_installed():
            sub_poly_eval_error = -1.0
            sub_poly_eval_error = self.gappa_engine.get_eval_error_v2(
                self.opt_engine, opt_sub_poly, sub_poly_error_copy_map,
                gappa_filename="%s_gappa_sub_poly.g" % self.function_name)

            dichotomy_map = [
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[0],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[1],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[2],
                },
            ]
            poly_eval_error_dico = self.gappa_engine.get_eval_error_v3(
                self.opt_engine, opt_poly, poly_error_copy_map,
                gappa_filename="gappa_poly.g",
                dichotomy=dichotomy_map)

            poly_eval_error = max(
                [sup(abs(err)) for err in poly_eval_error_dico])
        else:
            poly_eval_error = 0.0
            sub_poly_eval_error = 0.0
            Log.report(Log.Warning,
                       "gappa is not installed in this environnement")
            Log.report(Log.Info, "stopping autonomous degree research")
            # incrementing polynomial degree to counteract initial decrementation effect
            poly_degree += 1
            break
        Log.report(Log.Info, "poly evaluation error: %s" % poly_eval_error)
        Log.report(Log.Info,
                   "sub poly evaluation error: %s" % sub_poly_eval_error)

        global_poly_error = None
        global_rel_poly_error = None

        # keep the worst relative error over the three sub-intervals
        for case_index in range(3):
            poly_error = poly_approx_error + poly_eval_error_dico[
                case_index]
            rel_poly_error = sup(
                abs(poly_error /
                    sollya.exp(approx_interval_split[case_index])))
            # identity test: avoids the overloaded == of the error object
            if global_rel_poly_error is None or rel_poly_error > global_rel_poly_error:
                global_rel_poly_error = rel_poly_error
                global_poly_error = poly_error
        flag = error_goal > global_rel_poly_error

        if flag:
            break
        else:
            poly_degree += 1

    late_overflow_test = Comparison(ik, self.precision.get_emax(),
                                    specifier=Comparison.Greater,
                                    likely=False, debug=debug_multi,
                                    tag="late_overflow_test")
    # floor division keeps the offset an integer on Python 3 as well
    # (same value as the original "/" under Python 2)
    overflow_exp_offset = (self.precision.get_emax() -
                           self.precision.get_field_size() // 2)
    diff_k = Subtraction(
        ik,
        Constant(overflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        debug=debug_multi,
        tag="diff_k",
    )
    # the scaling is applied in two steps: 2**(ik - offset) then 2**offset
    late_overflow_result = (ExponentInsertion(
        diff_k, precision=self.precision) * poly) * ExponentInsertion(
            overflow_exp_offset, precision=self.precision)
    late_overflow_result.set_attributes(silent=False,
                                        tag="late_overflow_result",
                                        debug=debug_multi,
                                        precision=self.precision)
    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier=Test.IsInfty, likely=False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)),
        Return(late_overflow_result, precision=self.precision))

    late_underflow_test = Comparison(k, self.precision.get_emin_normal(),
                                     specifier=Comparison.LessOrEqual,
                                     likely=False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    corrected_exp = Addition(
        ik,
        Constant(underflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        tag="corrected_exp")
    late_underflow_result = (
        ExponentInsertion(corrected_exp, precision=self.precision) *
        poly) * ExponentInsertion(-underflow_exp_offset,
                                  precision=self.precision)
    late_underflow_result.set_attributes(debug=debug_multi,
                                         tag="late_underflow_result",
                                         silent=False)
    test_subnormal = Test(late_underflow_result,
                          specifier=Test.IsSubnormal)
    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value=late_underflow_result)),
        Return(late_underflow_result, precision=self.precision))

    twok = ExponentInsertion(ik, tag="exp_ik", debug=debug_multi,
                             precision=self.precision)
    #std_result = twok * ((1 + exact_hi_part * pre_poly) + exact_lo_part * pre_poly)
    std_result = twok * poly
    std_result.set_attributes(tag="std_result", debug=debug_multi)
    result_scheme = ConditionBlock(
        late_overflow_test, late_overflow_return,
        ConditionBlock(late_underflow_test, late_underflow_return,
                       Return(std_result, precision=self.precision)))
    std_return = ConditionBlock(
        early_overflow_test, early_overflow_return,
        ConditionBlock(early_underflow_test, early_underflow_return,
                       result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(
        test_nan_or_inf,
        Statement(ClearException() if self.libm_compliant else Statement(),
                  specific_return), std_return)

    return scheme
def generate_scheme(self): # declaring CodeFunction and retrieving input variable vx = Abs(self.implementation.add_input_variable("x", self.precision), tag = "vx") Log.report(Log.Info, "generating implementation scheme") if self.debug_flag: Log.report(Log.Info, "debug has been enabled") # local overloading of RaiseReturn operation def ExpRaiseReturn(*args, **kwords): kwords["arg_value"] = vx kwords["function_name"] = self.function_name return RaiseReturn(*args, **kwords) debug_precision = {ML_Binary32: debug_ftox, ML_Binary64: debug_lftolx}[self.precision] test_nan_or_inf = Test(vx, specifier = Test.IsInfOrNaN, likely = False, debug = True, tag = "nan_or_inf") test_nan = Test(vx, specifier = Test.IsNaN, debug = True, tag = "is_nan_test") test_positive = Comparison(vx, 0, specifier = Comparison.GreaterOrEqual, debug = True, tag = "inf_sign") test_signaling_nan = Test(vx, specifier = Test.IsSignalingNaN, debug = True, tag = "is_signaling_nan") return_snan = Statement(ExpRaiseReturn(ML_FPE_Invalid, return_value = FP_QNaN(self.precision))) # return in case of infinity input infty_return = Statement(ConditionBlock(test_positive, Return(FP_PlusInfty(self.precision)), Return(FP_PlusZero(self.precision)))) # return in case of specific value input (NaN or inf) specific_return = ConditionBlock(test_nan, ConditionBlock(test_signaling_nan, return_snan, Return(FP_QNaN(self.precision))), infty_return) # return in case of standard (non-special) input sollya_precision = self.precision.get_sollya_object() hi_precision = self.precision.get_field_size() - 3 # argument reduction frac_pi_index = 3 frac_pi = round(S2**frac_pi_index / pi, sollya_precision, sollya.RN) inv_frac_pi = round(pi / S2**frac_pi_index, hi_precision, sollya.RN) inv_frac_pi_lo = round(pi / S2**frac_pi_index - inv_frac_pi, sollya_precision, sollya.RN) # computing k = E(x * frac_pi) vx_pi = Multiplication(vx, frac_pi, precision = self.precision) k = NearestInteger(vx_pi, precision = ML_Int32, tag = "k", debug = True) 
fk = Conversion(k, precision = self.precision, tag = "fk") inv_frac_pi_cst = Constant(inv_frac_pi, tag = "inv_frac_pi", precision = self.precision) inv_frac_pi_lo_cst = Constant(inv_frac_pi_lo, tag = "inv_frac_pi_lo", precision = self.precision) red_vx_hi = (vx - inv_frac_pi_cst * fk) red_vx_hi.set_attributes(tag = "red_vx_hi", debug = debug_precision, precision = self.precision) red_vx_lo_sub = inv_frac_pi_lo_cst * fk red_vx_lo_sub.set_attributes(tag = "red_vx_lo_sub", debug = debug_precision, unbreakable = True, precision = self.precision) vx_d = Conversion(vx, precision = ML_Binary64, tag = "vx_d") pre_red_vx = red_vx_hi - inv_frac_pi_lo_cst * fk pre_red_vx_d_hi = (vx_d - inv_frac_pi_cst * fk) pre_red_vx_d_hi.set_attributes(tag = "pre_red_vx_d_hi", precision = ML_Binary64, debug = debug_lftolx) pre_red_vx_d = pre_red_vx_d_hi - inv_frac_pi_lo_cst * fk pre_red_vx_d.set_attributes(tag = "pre_red_vx_d", debug = debug_lftolx, precision = ML_Binary64) modk = Modulo(k, 2**(frac_pi_index+1), precision = ML_Int32, tag = "switch_value", debug = True) sel_c = Equal(BitLogicAnd(modk, 2**(frac_pi_index-1)), 2**(frac_pi_index-1)) red_vx = Select(sel_c, -pre_red_vx, pre_red_vx) red_vx.set_attributes(tag = "red_vx", debug = debug_precision, precision = self.precision) red_vx_d = Select(sel_c, -pre_red_vx_d, pre_red_vx_d) red_vx_d.set_attributes(tag = "red_vx_d", debug = debug_lftolx, precision = ML_Binary64) approx_interval = Interval(-pi/(S2**(frac_pi_index+1)), pi / S2**(frac_pi_index+1)) Log.report(Log.Info, "approx interval: %s\n" % approx_interval) error_goal_approx = S2**-self.precision.get_precision() Log.report(Log.Info, "building mathematical polynomial") poly_degree_vector = [None] * 2**(frac_pi_index+1) error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai) #polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme index_relative = [] poly_object_vector = [None] 
* 2**(frac_pi_index+1) for i in range(2**(frac_pi_index+1)): sub_func = cos(sollya.x+i*pi/S2**frac_pi_index) degree = int(sup(guessdegree(sub_func, approx_interval, error_goal_approx))) + 1 degree_list = range(degree+1) a_interval = approx_interval if i == 0: # ad-hoc, TODO: to be cleaned degree = 6 degree_list = range(0, degree+1, 2) elif i % 2**(frac_pi_index) == 2**(frac_pi_index-1): # for pi/2 and 3pi/2, an approx to sin=cos(pi/2+x) # must be generated degree_list = range(1, degree+1, 2) if i == 3 or i == 5 or i == 7 or i == 9: precision_list = [sollya.binary64] + [sollya.binary32] *(degree) else: precision_list = [sollya.binary32] * (degree+1) poly_degree_vector[i] = degree constraint = sollya.absolute delta = (2**(frac_pi_index - 3)) centered_i = (i % 2**(frac_pi_index)) - 2**(frac_pi_index-1) if centered_i < delta and centered_i > -delta and centered_i != 0: constraint = sollya.relative index_relative.append(i) Log.report(Log.Info, "generating approximation for %d/%d" % (i, 2**(frac_pi_index+1))) poly_object_vector[i], _ = Polynomial.build_from_approximation_with_error(sub_func, degree_list, precision_list, a_interval, constraint, error_function = error_function) # unified power map for red_sx^n upm = {} rel_error_list = [] poly_scheme_vector = [None] * (2**(frac_pi_index+1)) for i in range(2**(frac_pi_index+1)): poly_object = poly_object_vector[i] poly_precision = self.precision if i == 3 or i == 5 or i == 7 or i == 9: poly_precision = ML_Binary64 c0 = Constant(coeff(poly_object.get_sollya_object(), 0), precision = ML_Binary64) c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision = self.precision) poly_hi = (c0 + c1 * red_vx) poly_hi.set_precision(ML_Binary64) red_vx_d_2 = red_vx_d * red_vx_d poly_scheme = poly_hi + red_vx_d_2 * polynomial_scheme_builder(poly_object.sub_poly(start_index = 2, offset = 2), red_vx, unified_precision = self.precision, power_map_ = upm) poly_scheme.set_attributes(unbreakable = True) elif i == 4: c1 = 
Constant(coeff(poly_object.get_sollya_object(), 1), precision = ML_Binary64) poly_scheme = c1 * red_vx_d + polynomial_scheme_builder(poly_object.sub_poly(start_index = 2), red_vx, unified_precision = self.precision, power_map_ = upm) poly_scheme.set_precision(ML_Binary64) else: poly_scheme = polynomial_scheme_builder(poly_object, red_vx, unified_precision = poly_precision, power_map_ = upm) #if i == 3: # c0 = Constant(coeff(poly_object.get_sollya_object(), 0), precision = self.precision) # c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision = self.precision) # poly_scheme = (c0 + c1 * red_vx) + polynomial_scheme_builder(poly_object.sub_poly(start_index = 2), red_vx, unified_precision = self.precision, power_map_ = upm) poly_scheme.set_attributes(tag = "poly_cos%dpi%d" % (i, 2**(frac_pi_index)), debug = debug_precision) poly_scheme_vector[i] = poly_scheme #try: if is_gappa_installed() and i == 3: opt_scheme = self.opt_engine.optimization_process(poly_scheme, self.precision, copy = True, fuse_fma = self.fuse_fma) tag_map = {} self.opt_engine.register_nodes_by_tag(opt_scheme, tag_map) gappa_vx = Variable("red_vx", precision = self.precision, interval = approx_interval) cg_eval_error_copy_map = { tag_map["red_vx"]: gappa_vx, tag_map["red_vx_d"]: gappa_vx, }