def split_domain(starting_domain, slivers):
    """Cut ``starting_domain`` into sorted sub-intervals.

    The splitting is done in three passes:

    1. a domain that strictly contains zero is cut at zero;
    2. every piece is cut until ``floor(x * n_invpi)`` takes a single value
       over the piece (the pair ``-1`` / ``0`` is also accepted);
    3. every remaining piece is halved ``slivers`` times, i.e. split into
       ``2**slivers`` sections.

    ``n_pi`` and ``n_invpi`` are module-level names defined outside this
    block.

    :param starting_domain: sollya interval to split
    :param slivers: number of successive halvings applied to each piece
    :return: list of non-degenerate sollya intervals, sorted by lower bound
    """
    domain_inf = sollya.inf(starting_domain)
    domain_sup = sollya.sup(starting_domain)

    # pass 1: separate the negative and the positive side
    if domain_inf < 0 and domain_sup > 0:
        pending = [
            sollya.Interval(domain_inf, 0),
            sollya.Interval(0, domain_sup),
        ]
    else:
        pending = [starting_domain]

    # pass 2: cut until each piece maps to a single multiple index
    accepted = []
    while pending:
        piece = pending.pop()
        scaled = piece * n_invpi
        k_low = sollya.floor(sollya.inf(scaled))
        k_high = sollya.floor(sollya.sup(scaled))
        if k_low == k_high or (k_low == -1 and k_high == 0):
            accepted.append(piece)
            continue

        # first multiple of n_pi above the lower end of the piece, and the
        # same value shrunk towards zero by a relative 2**-53
        boundary = (k_low + 1) * n_pi
        shrunk = boundary - boundary * 2**-53
        if sollya.sup(piece) <= 0:
            cut_low, cut_high = boundary, shrunk
        else:
            cut_low, cut_high = shrunk, boundary

        pending.append(sollya.Interval(sollya.inf(piece), cut_low))
        pending.append(sollya.Interval(cut_high, sollya.sup(piece)))

    # pass 3: halve every piece `slivers` times
    pieces = accepted
    for _ in range(slivers):
        halves = []
        for piece in pieces:
            middle = sollya.mid(piece)
            halves.extend((
                sollya.Interval(sollya.inf(piece), middle),
                sollya.Interval(middle, sollya.sup(piece)),
            ))
        pieces = halves

    # drop duplicates, order by lower bound, discard point intervals
    ordered = sorted(set(pieces), key=lambda itv: float(sollya.inf(itv)))
    return [itv for itv in ordered if sollya.inf(itv) != sollya.sup(itv)]
def split_domain(starting_domain, slivers):
    """Cut ``starting_domain`` so that no piece crosses a power of two.

    A piece is accepted once ``floor(log2(x))`` takes a single value over it.
    Otherwise the cut point is located by 100 bisection steps inside
    ``2**[e_low, e_low + 1]`` and the piece is split around it. Every
    accepted piece is then halved ``slivers`` times.

    :param starting_domain: sollya interval to split
    :param slivers: number of successive halvings applied to each piece
    :return: list of non-degenerate sollya intervals, sorted by lower bound
    """
    pending = [starting_domain]
    accepted = []
    while pending:
        piece = pending.pop()
        exponents = sollya.log2(piece)
        e_low = sollya.floor(sollya.inf(exponents))
        e_high = sollya.floor(sollya.sup(exponents))
        if e_low == e_high:
            accepted.append(piece)
            continue

        # bisect towards the point where floor(log2(.)) leaves e_low
        bracket = 2**sollya.Interval(e_low, e_low + 1)
        for _ in range(100):
            middle = sollya.mid(bracket)
            if sollya.floor(sollya.log2(middle)) == e_low:
                bracket = sollya.Interval(middle, sollya.sup(bracket))
            else:
                bracket = sollya.Interval(sollya.inf(bracket), middle)

        cut_low = sollya.inf(bracket)
        cut_high = sollya.sup(bracket)
        lower_piece = sollya.Interval(sollya.inf(piece), cut_low)
        upper_piece = sollya.Interval(cut_high, sollya.sup(piece))
        # the lower piece is pushed last so that it is processed first
        pending.append(upper_piece)
        pending.append(lower_piece)

    # halve every piece `slivers` times
    pieces = accepted
    for _ in range(slivers):
        halves = []
        for piece in pieces:
            middle = sollya.mid(piece)
            halves.extend((
                sollya.Interval(sollya.inf(piece), middle),
                sollya.Interval(middle, sollya.sup(piece)),
            ))
        pieces = halves

    # drop duplicates, order by lower bound, discard point intervals
    ordered = sorted(set(pieces), key=lambda itv: float(sollya.inf(itv)))
    return [itv for itv in ordered if sollya.inf(itv) != sollya.sup(itv)]
def solve_format_CLZ(optree):
    """ Legalize CountLeadingZeros precision

        For a fixed-point input the result format is an unsigned integer
        format wide enough to hold floor(log2(input bit size)) + 1 bits.
        For any other input format a warning is reported and the node's
        current precision is returned unchanged.

        Args:
            optree (CountLeadingZeros): input node

        Returns:
            ML_Format: legal format for CLZ
    """
    assert isinstance(optree, CountLeadingZeros)
    operand_format = optree.get_input(0).get_precision()

    if not is_fixed_point(operand_format):
        Log.report(Log.Warning , "unsupported format in solve_format_CLZ")
        return optree.get_precision()

    if operand_format.get_signed():
        Log.report(Log.Warning , "signed format in solve_format_CLZ")

    # +1 for carry overflow
    operand_width = operand_format.get_bit_size()
    int_size = int(sollya.floor(sollya.log2(operand_width))) + 1
    frac_size = 0
    return fixed_point(int_size, frac_size, signed=False)
def get_lzc_output_width(width):
    """ Compute the size of a standard leading zero count result
        for a width-bit input

        @param width [int] input width
        @return output width (in bits), i.e. floor(log2(width)) + 1 """
    top_bit_index = int(floor(log2(width)))
    return top_bit_index + 1
def get_integer_coding(self, value, language=C_Code):
    """Return the integer whose bits encode ``value`` in this format.

    Special values and +/- ``ml_infty`` are delegated to
    ``get_special_value_coding``. Any other value is rounded to nearest in
    this format, then packed as ``sign | biased exponent | mantissa field``.
    A zero result is always encoded as +0.0 (a warning is reported).

    :param value: numerical or special value to encode
    :param language: target language forwarded to the special-value encoder
    :return: integer encoding of ``value``
    """
    if FP_SpecialValue.is_special_value(value):
        return self.get_special_value_coding(value, language)
    if value == ml_infty:
        return self.get_special_value_coding(FP_PlusInfty(self), language)
    if value == -ml_infty:
        return self.get_special_value_coding(FP_MinusInfty(self), language)

    rounded = sollya.round(value, self.get_sollya_object(), sollya.RN)
    # FIXME: managing negative zero
    sign = 1 if rounded < 0 else 0
    magnitude = abs(rounded)

    if magnitude == 0.0:
        Log.report(Log.Warning,
                   "+0.0 forced during get_integer_coding conversion")
        exp_biased = 0
        mant = 0
    else:
        exp = int(sollya.floor(sollya.log2(magnitude)))
        if exp < self.get_emin_normal():
            # subnormal: zero exponent field, mantissa counted in units of
            # the smallest subnormal
            exp_biased = 0
            mant = int(magnitude / S2**self.get_emin_subnormal())
        else:
            # normal: drop the implicit leading one and scale the fraction
            exp_biased = int(exp - self.get_bias())
            fraction = magnitude / S2**exp - 1.0
            mant = int(fraction * (S2**self.get_field_size()))

    exponent_field = exp_biased << self.get_field_size()
    sign_field = sign << (self.get_field_size() + self.get_exponent_size())
    return mant | exponent_field | sign_field
def round_sollya_object(self, value, round_mode=sollya.RN):
    """Round ``value`` onto the grid of multiples of 2**-frac_size.

    :param value: value to round
    :param round_mode: one of sollya.RN, sollya.RD, sollya.RU, sollya.RZ
    :return: ``value`` rounded according to ``round_mode``
    :raises KeyError: if ``round_mode`` is not one of the four modes above
    """
    def toward_zero(x):
        # truncation: round down above zero, round up otherwise
        if x > 0:
            return sollya.floor(x)
        return sollya.ceil(x)

    integer_rounders = {
        sollya.RN: sollya.nearestint,
        sollya.RD: sollya.floor,
        sollya.RU: sollya.ceil,
        sollya.RZ: toward_zero,
    }
    to_integer = integer_rounders[round_mode]

    # scale so that one unit in the last place becomes 1, round, scale back
    scale_factor = S2**self.get_frac_size()
    scaled_value = scale_factor * value
    return to_integer(scaled_value) / scale_factor
def get_accuracy_from_epsilon(epsilon):
    """ convert a numerical relative error into
        a number of accuracy bits, computed as floor(-log2(|epsilon|))

        :param epsilon: error to convert
        :type epsilon: number
        :return: accuracy corresponding to the error
        :rtype: SollyaObject """
    magnitude = abs(epsilon)
    accuracy_bits = -sollya.log2(magnitude)
    return sollya.floor(accuracy_bits)
def get_fixed_type_from_interval(interval, precision):
    """ generate a fixed-point format which can encode
        @p interval without overflow, and which spans @p precision bits

        The format is signed when the lower bound of @p interval is
        negative; in that case one extra integer digit is added. """
    lower, upper = inf(interval), sup(interval)
    is_signed = bool(lower < 0)

    # index just above the most significant bit of the largest magnitude
    largest_magnitude = max(abs(lower), abs(upper))
    msb_index = int(floor(sollya.log2(largest_magnitude))) + 1

    integer_digits = msb_index + (1 if is_signed else 0)
    fraction_digits = -(msb_index - precision)
    return fixed_point(integer_digits, fraction_digits, signed=is_signed)
def evaluate_argument_reduction(self, in_interval, in_prec, inv_size, inv_prec):
    """Bound the output of a table-based argument reduction using gappa.

    The modelled reduction is: x = 1 + dx, x1 = x rounded toward minus
    infinity into ML_Custom_FixedPoint_Format(0, inv_size, False),
    inv_x = 1 / x1 rounded toward plus infinity into
    ML_Custom_FixedPoint_Format(1, inv_prec, False), dy = x * inv_x - 1.

    :param in_interval: interval of the input dx
    :param in_prec: second argument of the fixed-point format of dx
    :param inv_size: second argument of the fixed-point format of x1
    :param inv_prec: second argument of the fixed-point format of inv_x
    :return: dict with keys 'out_interval' (gappa result for the goal),
             'length_table' and 'sizeof_table'
    """
    exact = ML_Exact
    one = Constant(1, precision=exact, tag="one")
    dx = Variable("dx",
                  precision=ML_Custom_FixedPoint_Format(0, in_prec, False),
                  interval=in_interval)

    # do the argument reduction
    x = Addition(dx, one, tag="x", precision=exact)
    x1 = Conversion(x, tag="x1",
                    precision=ML_Custom_FixedPoint_Format(0, inv_size, False),
                    rounding_mode=ML_RoundTowardMinusInfty)
    # table index expression; only referenced by the disabled goal below
    s = Multiplication(dx, Constant(S2**inv_size, precision=exact),
                       precision=exact, tag="interval_index_table")
    inv_x1 = Division(one, x1, tag="ix1", precision=exact)
    inv_x = Conversion(inv_x1, tag="ix",
                       precision=ML_Custom_FixedPoint_Format(1, inv_prec, False),
                       rounding_mode=ML_RoundTowardPlusInfty)
    y = Multiplication(x, inv_x, tag="y", precision=exact)
    dy = Subtraction(y, one, tag="dy", precision=exact)

    # add the necessary goals and hints
    dx_gappa = Variable("dx_gappa", interval=dx.get_interval(),
                        precision=dx.get_precision())
    swap_map = {dx: dx_gappa}
    engine = self.gappa_engine

    # goal: dz (result of the argument reduction)
    goal_tree = dy.copy(swap_map)
    gappa_code = engine.get_interval_code_no_copy(
        goal_tree, bound_list=[swap_map[dx]])
    #self.gappa_engine.add_goal(gappa_code, s.copy(swap_map)) # range of index of table

    # hints. are the ones with isAppox=True really necessary ?
    # NOTE: statement order matters here, swap_map[inv_x1] below is looked up
    # after the preceding copy(swap_map) calls
    engine.add_hint(gappa_code, x.copy(swap_map), x1.copy(swap_map),
                    isApprox=True)
    engine.add_hint(gappa_code, inv_x1.copy(swap_map), inv_x.copy(swap_map),
                    isApprox=True)
    product_copy = Multiplication(x1, inv_x1, precision=exact).copy(swap_map)
    inv_not_zero = Comparison(swap_map[inv_x1],
                              Constant(0, precision=exact),
                              specifier=Comparison.NotEqual,
                              precision=ML_Bool)
    engine.add_hint(gappa_code, product_copy, one, inv_not_zero)

    # execute and parse the result
    gappa_script = gappa_code.get(engine)
    result = execute_gappa_script_extract(gappa_script)

    length_table = 1 + floor(sup(in_interval) * S2**inv_size).getConstantAsInt()
    entry_format = ML_Custom_FixedPoint_Format(1, inv_prec, False)
    # NOTE(review): the meaning of the constant 16 is not visible here
    entry_size = 16 + entry_format.get_c_bit_size()/8
    return {
        'out_interval': result['goal'],
        'length_table': length_table,
        'sizeof_table': length_table * entry_size,
    }
def generate_scheme(self):
    """Build the leading zero counter for a ``self.width``-bit input.

    The generated process initialises a temporary to ``self.width`` and
    walks the input bits in increasing index order; each set bit at index i
    overwrites the temporary with ``self.width - 1 - i``. The final value is
    driven on the output signal "vr_out".

    :return: single-element list containing ``self.implementation``
    """
    lzc_width = int(floor(log2(self.width))) + 1
    Log.report(Log.Info, "width of lzc out is {}".format(lzc_width))
    input_precision = ML_StdLogicVectorFormat(self.width)
    precision = ML_StdLogicVectorFormat(lzc_width)

    # declaring main input variable
    vx = self.implementation.add_input_signal("x", input_precision)

    vr_out = Signal("lzc", precision=precision, var_type=Variable.Local)
    tmp_lzc = Variable("tmp_lzc", precision=precision,
                       var_type=Variable.Local)
    iterator = Variable("i", precision=ML_Integer, var_type=Variable.Local)

    # condition: bit <iterator> of the input is set
    bit_is_set = Comparison(
        VectorElementSelection(vx, iterator, precision=ML_StdLogic),
        Constant(1, precision=ML_StdLogic),
        specifier=Comparison.Equal,
        precision=ML_Bool)
    # number of bits located above index <iterator>
    bits_above = Conversion(
        Subtraction(
            Constant(self.width - 1, precision=ML_Integer),
            iterator,
            precision=ML_Integer),
        precision=precision)
    update_count = ConditionBlock(
        bit_is_set,
        ReferenceAssign(tmp_lzc, bits_above),
    )
    lzc_loop = RangeLoop(
        iterator,
        Interval(0, self.width - 1),
        update_count,
        specifier=RangeLoop.Increasing,
    )

    process_body = Statement(
        ReferenceAssign(tmp_lzc, Constant(self.width, precision=precision)),
        lzc_loop,
        ReferenceAssign(vr_out, tmp_lzc))
    lzc_process = Process(process_body, sensibility_list=[vx])

    self.implementation.add_process(lzc_process)
    self.implementation.add_output_signal("vr_out", vr_out)
    return [self.implementation]
def generate_payne_hanek(vx, frac_pi, precision, n=100, k=4, chunk_num=None,
                         debug=False):
    """ generate payne and hanek argument reduction for frac_pi * variable

        The constant frac_pi is cut into chunks of 20 bits stored in a
        table (together with a per-chunk scaling factor). The generated
        loop multiplies the high and low halves of vx by the chunks whose
        weight is relevant for vx's exponent and accumulates separately the
        integer part (modulo 2**(k+1)) and the fractional part of the
        product.

        :param vx: input node to be reduced
        :param frac_pi: constant multiplicand
        :param precision: floating-point format of vx, ML_Binary32 or
                          ML_Binary64 (any other value raises KeyError)
        :param n: offset subtracted when computing the exponent of the
                  least significant chunk taken into account
        :param k: offset added when computing the exponent of the most
                  significant chunk; the integer accumulator is reduced
                  modulo 2**(k+1)
        :param chunk_num: unused, kept for interface compatibility
        :param debug: if set, attach debug attributes to intermediate nodes
        :return: tuple (statement, acc, acc_int) where statement contains
                 the reduction loop, acc is the fractional accumulator
                 variable and acc_int the integer accumulator variable
    """
    # determining integer format corresponding to
    # floating point precision argument
    int_precision = {ML_Binary64: ML_Int64, ML_Binary32: ML_Int32}[precision]

    cst_msb = floor(log2(abs(frac_pi)))
    cst_exp_range = cst_msb - precision.get_emin_subnormal() + 1

    # chunk size has to be so than multiplication by a splitted <v>
    # (vx_hi or vx_lo) is exact
    chunk_size = 20  # precision.get_field_size() / 2 - 2
    chunk_number = int(ceil((cst_exp_range + chunk_size - 1) / chunk_size))
    # floor division keeps the exponent an integer on Python 2 and Python 3
    scaling_factor = S2**-(chunk_size // 2)

    chunk_size_cst = Constant(chunk_size, precision=ML_Int32)
    cst_msb_node = Constant(cst_msb, precision=ML_Int32)

    p = precision.get_field_size()

    # adapting debug format to precision argument
    debug_precision = {
        ML_Binary32: debug_ftox,
        ML_Binary64: debug_lftolx
    }[precision] if debug else None
    debug_int = debugd if debug else None
    debug_long = debuglld if debug else None

    # saving sollya's global precision
    old_global_prec = get_prec()
    prec(cst_exp_range + 100)
    try:
        # table to store chunk of constant multiplicand
        cst_table = ML_Table(dimensions=[chunk_number, 1],
                             storage_precision=precision,
                             tag="PH_cst_table")
        # table to store sqrt(scaling_factor) corresponding to the
        # cst multiplicand chunks
        scale_table = ML_Table(dimensions=[chunk_number, 1],
                               storage_precision=precision,
                               tag="PH_scale_table")
        tmp_cst = frac_pi

        # this loop divide the digits of frac_pi into chunks
        # the chunk lsb weight is given by a shift from
        # cst_msb, multiple of the chunk index
        for i in range(chunk_number):
            value_div_factor = S2**(chunk_size * (i + 1) - cst_msb)
            local_cst = int(tmp_cst * value_div_factor) / value_div_factor
            local_scale = (scaling_factor**i)
            # storing scaled constant chunks
            cst_table[i][0] = local_cst / (local_scale**2)
            scale_table[i][0] = local_scale
            tmp_cst = tmp_cst - local_cst

        vx_exp = ExponentExtraction(vx)
        msb_exp = -vx_exp + p - 1 + k
        msb_exp.set_attributes(tag="msb_exp", debug=debug_int)

        msb_index = Select(cst_msb_node < msb_exp, 0,
                           (cst_msb_node - msb_exp) / chunk_size_cst)
        msb_index.set_attributes(tag="msb_index", debug=debug_int)

        lsb_exp = -vx_exp + p - 1 - n
        lsb_exp.set_attributes(tag="lsb_exp", debug=debug_int)

        lsb_index = (cst_msb_node - lsb_exp) / chunk_size_cst
        lsb_index.set_attributes(tag="lsb_index", debug=debug_int)

        # floor division: half_size is used as an integer shift count below
        half_size = precision.get_field_size() // 2 + 1

        # vx_hi is vx with its half_size least significant bits cleared;
        # the bit-level cast must use the integer format matching
        # <precision> (not a fixed 64-bit format) so that ML_Binary32
        # inputs are reinterpreted on the right width
        vx_hi = TypeCast(BitLogicAnd(
            TypeCast(vx, precision=int_precision),
            Constant(~(2**half_size - 1), precision=int_precision)),
                         precision=precision)
        vx_hi.set_attributes(tag="vx_hi", debug=debug_precision)

        vx_lo = vx - vx_hi
        vx_lo.set_attributes(tag="vx_lo", debug=debug_precision)

        vi = Variable("i", precision=ML_Int32, var_type=Variable.Local)

        i1 = Constant(1, precision=ML_Int32)

        acc = Variable("acc", precision=precision, var_type=Variable.Local)
        acc_int = Variable("acc_int", precision=int_precision,
                           var_type=Variable.Local)

        init_loop = Statement(
            vx_hi,
            vx_lo,
            ReferenceAssign(vi, msb_index),
            ReferenceAssign(acc, Constant(0, precision=precision)),
            # integer accumulator is initialised with an integer-format zero
            ReferenceAssign(acc_int, Constant(0, precision=int_precision)),
        )

        cst_load = TableLoad(cst_table, vi, 0, tag="cst_load",
                             debug=debug_precision)
        sca_load = TableLoad(scale_table, vi, 0, tag="sca_load",
                             debug=debug_precision)

        hi_mult = (vx_hi * sca_load) * (cst_load * sca_load)
        hi_mult.set_attributes(tag="hi_mult", debug=debug_precision)

        pre_hi_mult_int = NearestInteger(hi_mult, precision=int_precision,
                                         tag="hi_mult_int", debug=debug_long)
        hi_mult_int_f = Conversion(pre_hi_mult_int, precision=precision,
                                   tag="hi_mult_int_f", debug=debug_precision)
        pre_hi_mult_red = (hi_mult - hi_mult_int_f).modify_attributes(
            tag="hi_mult_red", debug=debug_precision)

        # for the first chunks (vx_hi * <constant chunk>) exceeds 2**k+1
        # and may be discard (whereas it may lead to overflow during
        # integer conversion
        pre_exclude_hi = (
            (cst_msb_node - (vi + i1) * chunk_size + i1) +
            (vx_exp + Constant(-half_size + 1, precision=ML_Int32))
        ).modify_attributes(tag="pre_exclude_hi", debug=debug_int)
        pre_exclude_hi.propagate_precision(ML_Int32,
                                           [cst_msb_node, vi, vx_exp, i1])
        Ck = Constant(k, precision=ML_Int32)
        exclude_hi = pre_exclude_hi <= Ck
        exclude_hi.set_attributes(tag="exclude_hi", debug=debug_int)

        hi_mult_red = Select(exclude_hi, pre_hi_mult_red,
                             Constant(0, precision=precision))
        hi_mult_int = Select(exclude_hi, pre_hi_mult_int,
                             Constant(0, precision=int_precision))

        lo_mult = (vx_lo * sca_load) * (cst_load * sca_load)
        lo_mult.set_attributes(tag="lo_mult", debug=debug_precision)

        lo_mult_int = NearestInteger(lo_mult, precision=int_precision,
                                     tag="lo_mult_int", debug=debug_long)
        lo_mult_int_f = Conversion(lo_mult_int, precision=precision,
                                   tag="lo_mult_int_f", debug=debug_precision)
        lo_mult_red = (lo_mult - lo_mult_int_f).modify_attributes(
            tag="lo_mult_red", debug=debug_precision)

        acc_expr = (acc + hi_mult_red) + lo_mult_red
        int_expr = ((acc_int + hi_mult_int) + lo_mult_int) % 2**(k + 1)

        # move the integer part of the fractional accumulator into the
        # integer accumulator at each iteration
        acc_expr_int = NearestInteger(acc_expr, precision=int_precision)
        normalization = Statement(
            ReferenceAssign(
                acc,
                acc_expr - Conversion(acc_expr_int, precision=precision)),
            ReferenceAssign(acc_int, int_expr + acc_expr_int),
        )

        acc_expr.set_attributes(tag="acc_expr", debug=debug_precision)
        int_expr.set_attributes(tag="int_expr", debug=debug_long)

        red_loop = Loop(
            init_loop,
            vi <= lsb_index,
            Statement(
                acc_expr,
                int_expr,
                normalization,
                ReferenceAssign(vi, vi + 1)))

        result = Statement(lsb_index, msb_index, red_loop)
    finally:
        # restoring sollya's global precision, even if generation failed
        prec(old_global_prec)

    return result, acc, acc_int
def generate_scheme(self):
    """Build the operation graph of a floating-point adder for x + y.

    Both inputs and the result use ``self.precision``, exposed on the
    entity interface as a std_logic_vector of the same bit size. The
    mantissa of y is shifted according to the exponent difference, added to
    (or subtracted from) the extended mantissa of x, normalized with a
    leading zero counter sub-entity, and rounded to nearest (ties to even).

    NOTE(review): no special-value handling (NaN, infinity, subnormal) is
    visible in this block -- confirm whether it is expected elsewhere.

    :return: list of entities: the ones generated for the leading zero
             counter followed by ``self.implementation``
    """
    def get_virtual_cst(prec, value, language):
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    ## convert @p value from an input floating-point precision
    #  @p in_precision to an output support format @p out_precision
    io_precision = VirtualFormat(base_format=self.precision,
                                 support_format=ML_StdLogicVectorFormat(
                                     self.precision.get_bit_size()),
                                 get_cst=get_virtual_cst)
    # declaring standard clock and reset input signal
    #clk = self.implementation.add_input_signal("clk", ML_StdLogic)
    # reset = self.implementation.add_input_signal("reset", ML_StdLogic)
    # declaring main input variable
    vx = self.implementation.add_input_signal("x", io_precision)
    vy = self.implementation.add_input_signal("y", io_precision)

    vx_precision = self.precision
    vy_precision = self.precision
    result_precision = self.precision

    # precision for first operand vx which is to be statically
    # positioned
    p = vx_precision.get_mantissa_size()
    # precision for second operand vy which is to be dynamically shifted
    q = vy_precision.get_mantissa_size()
    # precision of output
    o = result_precision.get_mantissa_size()

    # vx must be aligned with vy
    # the largest shift amount (in absolute value) is precision + 2
    # (1 guard bit and 1 rounding bit)
    exp_vx_precision = ML_StdLogicVectorFormat(
        vx_precision.get_exponent_size())
    exp_vy_precision = ML_StdLogicVectorFormat(
        vy_precision.get_exponent_size())

    mant_vx_precision = ML_StdLogicVectorFormat(p - 1)
    mant_vy_precision = ML_StdLogicVectorFormat(q - 1)

    mant_vx = MantissaExtraction(vx, precision=mant_vx_precision)
    mant_vy = MantissaExtraction(vy, precision=mant_vy_precision)

    exp_vx = RawExponentExtraction(vx, precision=exp_vx_precision)
    exp_vy = RawExponentExtraction(vy, precision=exp_vy_precision)

    # Maximum number of leading zero for normalized <vx>
    L_x = 0
    # Maximum number of leading zero for normalized <vy>
    L_y = 0

    sign_vx = CopySign(vx, precision=ML_StdLogic)
    sign_vy = CopySign(vy, precision=ML_StdLogic)

    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    effective_op = BitLogicXor(sign_vx,
                               sign_vy,
                               precision=ML_StdLogic,
                               tag="effective_op",
                               debug=ML_Debug(display_format="-radix 2"))

    exp_vx_bias = vx_precision.get_bias()
    exp_vy_bias = vy_precision.get_bias()

    exp_offset = max(o + L_y, q) + 2
    exp_bias = exp_offset + exp_vx_bias - exp_vy_bias
    # Determine a working precision to accommodate exponent difference
    # FIXME: check interval and exponent operations size
    exp_precision_ext_size = max(vx_precision.get_exponent_size(),
                                 vy_precision.get_exponent_size()) + 2
    exp_precision_ext = ML_StdLogicVectorFormat(exp_precision_ext_size)
    # Y is first aligned offset = max(o+L_y,q) + 2 bits to the left of x
    # and then shifted right by
    # exp_diff = exp_x - exp_y + offset
    # exp_vx in [emin, emax]
    # exp_vx - exp_vy + p +2 in [emin-emax + p + 2, emax - emin + p + 2]
    exp_diff = Subtraction(
        Addition(zext(
            exp_vx,
            exp_precision_ext_size - vx_precision.get_exponent_size()),
                 Constant(exp_bias, precision=exp_precision_ext),
                 precision=exp_precision_ext),
        zext(exp_vy,
             exp_precision_ext_size - vy_precision.get_exponent_size()),
        precision=exp_precision_ext,
        tag="exp_diff",
        debug=debug_std)
    # same bits as exp_diff, interpreted as a signed value for comparisons
    signed_exp_diff = SignCast(exp_diff,
                               specifier=SignCast.Signed,
                               precision=exp_precision_ext)
    datapath_full_width = exp_offset + max(o + L_x, p) + 2 + q
    max_exp_diff = datapath_full_width - q

    exp_diff_lt_0 = Comparison(signed_exp_diff,
                               Constant(0, precision=exp_precision_ext),
                               specifier=Comparison.Less,
                               precision=ML_Bool,
                               tag="exp_diff_lt_0",
                               debug=debug_std)
    exp_diff_gt_max_diff = Comparison(signed_exp_diff,
                                      Constant(
                                          max_exp_diff,
                                          precision=exp_precision_ext),
                                      specifier=Comparison.Greater,
                                      precision=ML_Bool)

    shift_amount_prec = ML_StdLogicVectorFormat(
        int(floor(log2(max_exp_diff)) + 1))

    # shift amount is exp_diff clamped to [0, max_exp_diff]
    mant_shift = Select(exp_diff_lt_0,
                        Constant(0, precision=shift_amount_prec),
                        Select(exp_diff_gt_max_diff,
                               Constant(max_exp_diff,
                                        precision=shift_amount_prec),
                               Truncate(exp_diff,
                                        precision=shift_amount_prec),
                               precision=shift_amount_prec),
                        precision=shift_amount_prec,
                        tag="mant_shift",
                        debug=ML_Debug(display_format="-radix 10"))

    mant_ext_size = max_exp_diff
    shift_prec = ML_StdLogicVectorFormat(datapath_full_width)
    shifted_mant_vy = BitLogicRightShift(rzext(mant_vy, mant_ext_size),
                                         mant_shift,
                                         precision=shift_prec,
                                         tag="shifted_mant_vy",
                                         debug=debug_std)
    # vx is right-extended by q+2 bits
    # and left extend by exp_offset
    mant_vx_ext = zext(rzext(mant_vx, q + 2), exp_offset + 1)

    add_prec = ML_StdLogicVectorFormat(datapath_full_width + 1)

    # vx's mantissa is negated when the effective operation is a subtraction
    mant_vx_add_op = Select(Comparison(effective_op,
                                       Constant(1, precision=ML_StdLogic),
                                       precision=ML_Bool,
                                       specifier=Comparison.Equal),
                            Negation(mant_vx_ext,
                                     precision=add_prec,
                                     tag="neg_mant_vx"),
                            mant_vx_ext,
                            precision=add_prec,
                            tag="mant_vx_add_op",
                            debug=ML_Debug(display_format=" "))

    mant_add = Addition(zext(shifted_mant_vy, 1),
                        mant_vx_add_op,
                        precision=add_prec,
                        tag="mant_add",
                        debug=ML_Debug(display_format=" -radix 2"))

    # if the addition overflows, then it meant vx has been negated and
    # the 2's complement addition cancelled the negative MSB, thus
    # the addition result is positive, and the result is of the sign of Y
    # else the result is of opposite sign to Y
    add_is_negative = BitLogicAnd(CopySign(mant_add,
                                           precision=ML_StdLogic),
                                  effective_op,
                                  precision=ML_StdLogic,
                                  tag="add_is_negative",
                                  debug=ML_Debug(" -radix 2"))
    # Negate mantissa addition result if it is negative
    mant_add_abs = Select(Comparison(add_is_negative,
                                     Constant(1, precision=ML_StdLogic),
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          Negation(mant_add,
                                   precision=add_prec,
                                   tag="neg_mant_add",
                                   debug=debug_std),
                          mant_add,
                          precision=add_prec,
                          tag="mant_add_abs",
                          debug=debug_std)

    res_sign = BitLogicXor(add_is_negative,
                           sign_vy,
                           precision=ML_StdLogic,
                           tag="res_sign")

    # Precision for leading zero count
    lzc_width = int(floor(log2(datapath_full_width + 1)) + 1)
    lzc_prec = ML_StdLogicVectorFormat(lzc_width)

    # instantiate a leading zero counter sub-entity sized for the adder width
    lzc_args = ML_LeadingZeroCounter.get_default_args(
        width=(datapath_full_width + 1))
    LZC_entity = ML_LeadingZeroCounter(lzc_args)
    lzc_entity_list = LZC_entity.generate_scheme()
    lzc_implementation = LZC_entity.get_implementation()

    lzc_component = lzc_implementation.get_component_object()

    #lzc_in = SubSignalSelection(mant_add, p+1, 2*p+3)
    lzc_in = mant_add_abs # SubSignalSelection(mant_add_abs, 0, 3*p+3, precision = ML_StdLogicVectorFormat(3*p+4))

    add_lzc = Signal("add_lzc",
                     precision=lzc_prec,
                     var_type=Signal.Local,
                     debug=debug_dec)
    # add_lzc is rebound to a PlaceHolder wrapping the signal and the
    # component instance which drives it
    add_lzc = PlaceHolder(
        add_lzc, lzc_component(io_map={
            "x": lzc_in,
            "vr_out": add_lzc
        }))

    # Index of output mantissa least significant bit
    mant_lsb_index = datapath_full_width - o + 1

    #add_lzc = CountLeadingZeros(mant_add, precision = lzc_prec)
    # CP stands for close path, the data path where X and Y are within 1 exp diff
    res_normed_mant = BitLogicLeftShift(mant_add_abs,
                                        add_lzc,
                                        precision=add_prec,
                                        tag="res_normed_mant",
                                        debug=debug_std)
    pre_mant_field = SubSignalSelection(
        res_normed_mant,
        mant_lsb_index,
        datapath_full_width - 1,
        precision=ML_StdLogicVectorFormat(o - 1))

    ## Helper function to extract a single bit
    #  from a vector of bits signal
    def BitExtraction(optree, index, **kw):
        return VectorElementSelection(optree,
                                      index,
                                      precision=ML_StdLogic,
                                      **kw)

    # Helper function to build an ML_Integer constant
    def IntCst(value):
        return Constant(value, precision=ML_Integer)

    # round bit: the bit just below the result mantissa LSB
    round_bit = BitExtraction(res_normed_mant, IntCst(mant_lsb_index - 1))
    mant_lsb = BitExtraction(res_normed_mant, IntCst(mant_lsb_index))
    # sticky bit: set when any of the low bits selected below is non zero
    sticky_prec = ML_StdLogicVectorFormat(datapath_full_width - o)
    sticky_input = SubSignalSelection(res_normed_mant,
                                      0,
                                      datapath_full_width - o - 1,
                                      precision=sticky_prec)
    sticky_bit = Select(Comparison(sticky_input,
                                   Constant(0, precision=sticky_prec),
                                   specifier=Comparison.NotEqual,
                                   precision=ML_Bool),
                        Constant(1, precision=ML_StdLogic),
                        Constant(0, precision=ML_StdLogic),
                        precision=ML_StdLogic,
                        tag="sticky_bit",
                        debug=debug_std)

    # increment selection for rounding to nearest (tie to even)
    round_increment_RN = BitLogicAnd(round_bit,
                                     BitLogicOr(sticky_bit,
                                                mant_lsb,
                                                precision=ML_StdLogic),
                                     precision=ML_StdLogic,
                                     tag="round_increment_RN",
                                     debug=debug_std)

    rounded_mant = Addition(zext(pre_mant_field, 1),
                            round_increment_RN,
                            precision=ML_StdLogicVectorFormat(o),
                            tag="rounded_mant",
                            debug=debug_std)
    # the extra top bit of rounded_mant flags a carry out of the rounding
    rounded_overflow = BitExtraction(rounded_mant,
                                     IntCst(o - 1),
                                     tag="rounded_overflow",
                                     debug=debug_std)
    res_mant_field = Select(Comparison(rounded_overflow,
                                       Constant(1, precision=ML_StdLogic),
                                       specifier=Comparison.Equal,
                                       precision=ML_Bool),
                            SubSignalSelection(rounded_mant, 1, o - 1),
                            SubSignalSelection(rounded_mant, 0, o - 2),
                            precision=ML_StdLogicVectorFormat(o - 1),
                            tag="final_mant",
                            debug=debug_std)

    res_exp_tmp_size = max(vx_precision.get_exponent_size(),
                           vy_precision.get_exponent_size()) + 2

    res_exp_tmp_prec = ML_StdLogicVectorFormat(res_exp_tmp_size)

    exp_vy_biased = Addition(zext(
        exp_vy, res_exp_tmp_size - vy_precision.get_exponent_size()),
                             Constant(vy_precision.get_bias() + 1,
                                      precision=res_exp_tmp_prec),
                             precision=res_exp_tmp_prec,
                             tag="exp_vy_biased",
                             debug=debug_dec)
    # vx's exponent is biased with the format bias
    # plus the exponent offset so it is left align to datapath MSB
    exp_vx_biased = Addition(
        zext(exp_vx,
             res_exp_tmp_size - vx_precision.get_exponent_size()),
        Constant(vx_precision.get_bias() + exp_offset + 1,
                 precision=res_exp_tmp_prec),
        precision=res_exp_tmp_prec,
        tag="exp_vx_biased",
        debug=debug_dec)

    # If exp diff is less than 0, then we must consider that vy's exponent is
    # the meaningful one and thus compute result exponent with respect
    # to vy's exponent value
    res_exp_base = Select(exp_diff_lt_0,
                          exp_vy_biased,
                          exp_vx_biased,
                          precision=res_exp_tmp_prec,
                          tag="res_exp_base",
                          debug=debug_dec)

    # Eventually we add the result exponent base
    # with the exponent offset and the leading zero count
    res_exp_ext = Addition(Subtraction(
        Addition(zext(res_exp_base, 0),
                 Constant(-result_precision.get_bias(),
                          precision=res_exp_tmp_prec),
                 precision=res_exp_tmp_prec),
        zext(add_lzc, res_exp_tmp_size - lzc_width),
        precision=res_exp_tmp_prec),
                           rounded_overflow,
                           precision=res_exp_tmp_prec,
                           tag="res_exp_ext",
                           debug=debug_std)

    res_exp_prec = ML_StdLogicVectorFormat(
        result_precision.get_exponent_size())

    res_exp = Truncate(res_exp_ext,
                       precision=res_exp_prec,
                       tag="res_exp",
                       debug=debug_dec_unsigned)

    # assemble sign, exponent and mantissa fields and expose the result
    vr_out = TypeCast(FloatBuild(
        res_sign,
        res_exp,
        res_mant_field,
        precision=self.precision,
    ),
                      precision=io_precision,
                      tag="result",
                      debug=debug_std)

    self.implementation.add_output_signal("vr_out", vr_out)

    return lzc_entity_list + [self.implementation]
def generate_scheme(self):
    """Build the operation graph of a mixed-precision fused multiply-add.

    The generated operator computes <x> . <y> + <z>, where x and y use
    ``self.precision`` and z and the result use ``self.acc_precision``.

    Datapath outline, in the order the nodes are built below:
      1. sign / raw exponent / mantissa extraction of x, y and z;
      2. mantissa product of x and y, in parallel with the right shift
         that aligns the mantissa of z on the product;
      3. addition of the product and of the (possibly bit-negated)
         aligned addend, followed by an absolute value;
      4. leading zero count through an ``ML_LeadingZeroCounter``
         sub-entity and normalization left shift;
      5. rounding to nearest (ties to even) and result exponent
         computation.

    When ``self.pipelined`` is set, a new pipeline stage is started
    between these steps; the stage a node belongs to depends on where it
    is built relative to the ``start_new_stage()`` calls, so statement
    order matters.

    NOTE(review): no explicit special-case path (NaN, infinity) is
    visible in this method -- confirm whether that is intended.

    Returns:
        the list of entities produced by the leading zero counter
        followed by ``self.implementation``.
    """
    Log.report(
        Log.Info,
        "generating MPFMA with acc precision {acc_precision} and precision {precision}"
        .format(acc_precision=self.acc_precision,
                precision=self.precision))

    # NOTE(review): this helper is not called anywhere in this method.
    def get_virtual_cst(prec, value, language):
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    ## convert @p value from an input floating-point precision
    #  @p in_precision to an output support format @p out_precision
    # NOTE(review): the description above does not seem to match the
    # statements that follow (plain aliases of the operator formats).
    prod_input_precision = self.precision
    accumulator_precision = self.acc_precision

    # declaring standard clock and reset input signal
    #clk = self.implementation.add_input_signal("clk", ML_StdLogic)
    # reset = self.implementation.add_input_signal("reset", ML_StdLogic)

    # declaring main input variables: x and y share the product input
    # format, z uses the accumulator format
    vx = self.implementation.add_input_signal("x", prod_input_precision)
    vy = self.implementation.add_input_signal("y", prod_input_precision)
    vz = self.implementation.add_input_signal("z", accumulator_precision)

    # extra reset input port (not referenced again in this method)
    reset = self.implementation.add_input_signal("reset", ML_StdLogic)

    # Inserting post-input pipeline stage
    if self.pipelined: self.implementation.start_new_stage()

    vx_precision = self.precision.get_base_format()
    vy_precision = self.precision.get_base_format()
    vz_precision = self.acc_precision.get_base_format()
    result_precision = self.acc_precision.get_base_format()

    # mantissa size of the first product operand vx
    p = vx_precision.get_mantissa_size()
    # mantissa size of the second product operand vy
    q = vy_precision.get_mantissa_size()
    # mantissa size of the addend vz (the operand which is dynamically
    # shifted further down)
    r = vz_precision.get_mantissa_size()
    # mantissa size of the output
    o = result_precision.get_mantissa_size()

    # formats of the raw exponent fields of x, y and z
    # NOTE(review): legacy remark kept from the original: "vx must be
    # aligned with vy, the largest shift amount (in absolute value) is
    # precision + 2 (1 guard bit and 1 rounding bit)" -- it seems to
    # predate this datapath, where z is the aligned operand.
    exp_vx_precision = ML_StdLogicVectorFormat(
        vx_precision.get_exponent_size())
    exp_vy_precision = ML_StdLogicVectorFormat(
        vy_precision.get_exponent_size())
    exp_vz_precision = ML_StdLogicVectorFormat(
        vz_precision.get_exponent_size())

    # MantissaExtraction performs the implicit
    # digit computation and concatenation
    mant_vx_precision = ML_StdLogicVectorFormat(p)
    mant_vy_precision = ML_StdLogicVectorFormat(q)
    mant_vz_precision = ML_StdLogicVectorFormat(r)

    mant_vx = MantissaExtraction(vx, precision=mant_vx_precision)
    mant_vy = MantissaExtraction(vy, precision=mant_vy_precision)
    mant_vz = MantissaExtraction(vz, precision=mant_vz_precision)

    exp_vx = RawExponentExtraction(vx, precision=exp_vx_precision)
    exp_vy = RawExponentExtraction(vy, precision=exp_vy_precision)
    exp_vz = RawExponentExtraction(vz, precision=exp_vz_precision)

    # Maximum number of leading zero for normalized <vx> mantissa
    L_x = 0
    # Maximum number of leading zero for normalized <vy> mantissa
    L_y = 0
    # Maximum number of leading zero for normalized <vz> mantissa
    L_z = 0
    # Maximum number of leading zero for the product of <x>.<y>
    # mantissa.
    L_xy = L_x + L_y + 1

    sign_vx = CopySign(vx, precision=ML_StdLogic)
    sign_vy = CopySign(vy, precision=ML_StdLogic)
    sign_vz = CopySign(vz, precision=ML_StdLogic)

    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    sign_xy = BitLogicXor(sign_vx,
                          sign_vy,
                          precision=ML_StdLogic,
                          tag="sign_xy",
                          debug=debug_std)
    effective_op = BitLogicXor(sign_xy,
                               sign_vz,
                               precision=ML_StdLogic,
                               tag="effective_op",
                               debug=debug_std)

    exp_vx_bias = vx_precision.get_bias()
    exp_vy_bias = vy_precision.get_bias()
    exp_vz_bias = vz_precision.get_bias()

    # x.y is statically positioned in the datapath
    # while z is shifted
    # This is justified by the fact that z alignment may be performed
    # in parallel with the multiplication of x and y mantissas
    # The product is positioned <exp_offset>-bit to the right of datapath MSB
    # (without including an extra carry bit)
    exp_offset = max(o + L_z, r) + 2
    exp_bias = exp_offset + (exp_vx_bias + exp_vy_bias) - exp_vz_bias

    # because of the mantissa range [1, 2[, the product exponent
    # is located one bit to the right (lower) of the product MSB
    prod_exp_offset = 1

    # Determine a working precision to accommodate exponent difference
    # FIXME: check interval and exponent operations size
    exp_precision_ext_size = max(vx_precision.get_exponent_size(),
                                 vy_precision.get_exponent_size(),
                                 vz_precision.get_exponent_size()) + 2
    exp_precision_ext = ML_StdLogicVectorFormat(exp_precision_ext_size)

    # Alignment shift amount of z, computed on zero-extended raw exponents:
    #   exp_diff = exp_vy + exp_vx + (exp_bias + prod_exp_offset) - exp_vz
    # NOTE(review): legacy remark kept from the original, which seems to
    # describe a two-operand adder rather than this datapath:
    #   Y is first aligned offset = max(o+L_y,q) + 2 bits to the left of x
    #   and then shifted right by
    #   exp_diff = exp_x - exp_y + offset
    #   exp_vx in [emin, emax]
    #   exp_vx - exp_vx + p +2 in [emin-emax + p + 2, emax - emin + p + 2]
    exp_diff = UnsignedSubtraction(
        UnsignedAddition(UnsignedAddition(
            zext(exp_vy,
                 exp_precision_ext_size - vy_precision.get_exponent_size()),
            zext(exp_vx,
                 exp_precision_ext_size - vx_precision.get_exponent_size()),
            precision=exp_precision_ext),
                         Constant(exp_bias + prod_exp_offset,
                                  precision=exp_precision_ext),
                         precision=exp_precision_ext),
        zext(exp_vz,
             exp_precision_ext_size - vz_precision.get_exponent_size()),
        precision=exp_precision_ext,
        tag="exp_diff",
        debug=debug_std)
    # signed view of exp_diff, used for the range comparisons below
    exp_precision_ext_signed = get_signed_precision(exp_precision_ext)
    signed_exp_diff = SignCast(exp_diff,
                               specifier=SignCast.Signed,
                               precision=exp_precision_ext_signed)
    datapath_full_width = exp_offset + max(o + L_xy, p + q) + 2 + r
    max_exp_diff = datapath_full_width - r

    exp_diff_lt_0 = Comparison(signed_exp_diff,
                               Constant(
                                   0, precision=exp_precision_ext_signed),
                               specifier=Comparison.Less,
                               precision=ML_Bool,
                               tag="exp_diff_lt_0",
                               debug=debug_std)
    exp_diff_gt_max_diff = Comparison(
        signed_exp_diff,
        Constant(max_exp_diff, precision=exp_precision_ext_signed),
        specifier=Comparison.Greater,
        precision=ML_Bool)

    # number of bits required to encode max_exp_diff
    shift_amount_prec = ML_StdLogicVectorFormat(
        int(floor(log2(max_exp_diff)) + 1))

    # shift amount clamped to [0, max_exp_diff]: 0 when exp_diff is
    # negative, max_exp_diff when exp_diff exceeds it, else exp_diff
    # truncated to the shift amount width
    mant_shift = Select(exp_diff_lt_0,
                        Constant(0, precision=shift_amount_prec),
                        Select(exp_diff_gt_max_diff,
                               Constant(max_exp_diff,
                                        precision=shift_amount_prec),
                               Truncate(exp_diff,
                                        precision=shift_amount_prec),
                               precision=shift_amount_prec),
                        precision=shift_amount_prec,
                        tag="mant_shift",
                        debug=debug_dec)

    prod_prec = ML_StdLogicVectorFormat(p + q)
    prod = UnsignedMultiplication(mant_vx,
                                  mant_vy,
                                  precision=prod_prec,
                                  tag="prod",
                                  debug=debug_std)

    # the mantissa of z is extended by max_exp_diff bits through rzext
    # (presumably a right zero-extension -- confirm) before being
    # right-shifted by mant_shift
    mant_ext_size = max_exp_diff
    shift_prec = ML_StdLogicVectorFormat(datapath_full_width)
    mant_vz_ext = rzext(mant_vz, mant_ext_size)
    shifted_mant_vz = BitLogicRightShift(mant_vz_ext,
                                         mant_shift,
                                         precision=shift_prec,
                                         tag="shifted_mant_vz",
                                         debug=debug_std)

    # Inserting pipeline stage
    # after product computation
    # and addend alignment shift
    if self.pipelined: self.implementation.start_new_stage()

    # the product is extended by r + 2 bits through rzext and by
    # exp_offset + 1 bits through zext (presumably right and left
    # zero-extension respectively -- confirm)
    prod_ext = zext(rzext(prod, r + 2), exp_offset + 1)

    add_prec = ML_StdLogicVectorFormat(datapath_full_width + 1)

    ## Here we make the supposition that
    #  the product is slower to compute than
    #  aligning <vz> and negating it if necessary
    #  which means that mant_add has the same sign as the product
    #prod_add_op = Select(
    #  Comparison(
    #    effective_op,
    #    Constant(1, precision = ML_StdLogic),
    #    precision = ML_Bool,
    #    specifier = Comparison.Equal
    #  ),
    #  Negation(prod_ext, precision = add_prec, tag = "neg_prod"),
    #  prod_ext,
    #  precision = add_prec,
    #  tag = "prod_add_op",
    #  debug = debug_cst_dec
    #)
    # the aligned addend is bit-negated when the effective operation is
    # a subtraction; the "+ 1" variant of the sum is built below as
    # mant_add_p1
    addend_op = Select(Comparison(effective_op,
                                  Constant(1, precision=ML_StdLogic),
                                  precision=ML_Bool,
                                  specifier=Comparison.Equal),
                       BitLogicNegate(zext(shifted_mant_vz, 1),
                                      precision=add_prec,
                                      tag="neg_addend_Op"),
                       zext(shifted_mant_vz, 1),
                       precision=add_prec,
                       tag="addend_op",
                       debug=debug_std)

    prod_add_op = prod_ext

    # Compound Addition: both (addend + product) and
    # (addend + product + 1) are built
    mant_add_p1 = UnsignedAddition(UnsignedAddition(addend_op,
                                                    prod_add_op,
                                                    precision=add_prec),
                                   Constant(1, precision=ML_StdLogic),
                                   precision=add_prec,
                                   tag="mant_add_p1",
                                   debug=debug_std)
    mant_add_p0 = UnsignedAddition(addend_op,
                                   prod_add_op,
                                   precision=add_prec,
                                   tag="mant_add_p0",
                                   debug=debug_std)

    # if the addition overflows, then it meant vx has been negated and
    # the 2's complement addition cancelled the negative MSB, thus
    # the addition result is positive, and the result is of the sign of Y
    # else the result is of opposite sign to Y
    # NOTE(review): the operand names in the remark above (vx, Y) look
    # inherited from another operator; here the addend is the negated
    # operand -- confirm.
    add_is_negative = BitLogicAnd(CopySign(mant_add_p1,
                                           precision=ML_StdLogic),
                                  effective_op,
                                  precision=ML_StdLogic,
                                  tag="add_is_negative",
                                  debug=debug_std)
    # Negate mantissa addition result if it is negative
    mant_add_abs = Select(Comparison(add_is_negative,
                                     Constant(1, precision=ML_StdLogic),
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          BitLogicNegate(mant_add_p0,
                                         precision=add_prec,
                                         tag="neg_mant_add_p0",
                                         debug=debug_std),
                          mant_add_p1,
                          precision=add_prec,
                          tag="mant_add_abs",
                          debug=debug_std)

    # determining result sign, mant_add
    # has the same sign as the product
    res_sign = BitLogicXor(add_is_negative,
                           sign_xy,
                           precision=ML_StdLogic,
                           tag="res_sign")

    # adding pipeline stage after addition computation
    if self.pipelined: self.implementation.start_new_stage()

    # Precision for leading zero count
    lzc_width = int(floor(log2(datapath_full_width + 1)) + 1)
    lzc_prec = ML_StdLogicVectorFormat(lzc_width)

    # only referenced by the commented-out set_current_stage call below
    current_stage = self.implementation.get_current_stage()

    # the leading zero counter is generated as a separate entity whose
    # input width matches the adder output (datapath_full_width + 1)
    lzc_args = ML_LeadingZeroCounter.get_default_args(
        width=(datapath_full_width + 1))
    LZC_entity = ML_LeadingZeroCounter(lzc_args)
    lzc_entity_list = LZC_entity.generate_scheme()
    lzc_implementation = LZC_entity.get_implementation()

    lzc_component = lzc_implementation.get_component_object()

    #self.implementation.set_current_stage(current_stage)
    # Attributes dynamic field (init_stage and init_op)
    # constructors must be initialized back after
    # building a sub-operator inside this operator
    self.implementation.instanciate_dyn_attributes()

    # lzc_in = mant_add_abs

    # local signal driven by the "vr_out" port of the LZC component
    # instance, whose "x" port is fed with mant_add_abs
    add_lzc_sig = Signal("add_lzc",
                         precision=lzc_prec,
                         var_type=Signal.Local,
                         debug=debug_dec)
    add_lzc = PlaceHolder(add_lzc_sig,
                          lzc_component(io_map={
                              "x": mant_add_abs,
                              "vr_out": add_lzc_sig
                          },
                                        tag="lzc_i"),
                          tag="place_holder")

    # adding pipeline stage after leading zero count
    if self.pipelined: self.implementation.start_new_stage()

    # Index of output mantissa least significant bit
    mant_lsb_index = datapath_full_width - o + 1

    #add_lzc = CountLeadingZeros(mant_add, precision = lzc_prec)
    # NOTE(review): legacy remark kept from the original: "CP stands for
    # close path, the data path where X and Y are within 1 exp diff" --
    # no close/far path split is visible in this method.
    # normalization: the absolute sum is left-shifted by its leading
    # zero count
    res_normed_mant = BitLogicLeftShift(mant_add_abs,
                                        add_lzc,
                                        precision=add_prec,
                                        tag="res_normed_mant",
                                        debug=debug_std)
    pre_mant_field = SubSignalSelection(
        res_normed_mant,
        mant_lsb_index,
        datapath_full_width - 1,
        precision=ML_StdLogicVectorFormat(o - 1))

    ## Helper function to extract a single bit
    #  from a vector of bits signal
    def BitExtraction(optree, index, **kw):
        return VectorElementSelection(optree,
                                      index,
                                      precision=ML_StdLogic,
                                      **kw)

    ## Helper function to build an ML_Integer constant
    def IntCst(value):
        return Constant(value, precision=ML_Integer)

    # adding pipeline stage after normalization shift
    if self.pipelined: self.implementation.start_new_stage()

    # round bit: the bit just below the mantissa LSB; sticky bit: set
    # when any of the bits [0, datapath_full_width - o - 1] is non-zero
    round_bit = BitExtraction(res_normed_mant, IntCst(mant_lsb_index - 1))
    mant_lsb = BitExtraction(res_normed_mant, IntCst(mant_lsb_index))
    sticky_prec = ML_StdLogicVectorFormat(datapath_full_width - o)
    sticky_input = SubSignalSelection(res_normed_mant,
                                      0,
                                      datapath_full_width - o - 1,
                                      precision=sticky_prec)
    sticky_bit = Select(Comparison(sticky_input,
                                   Constant(0, precision=sticky_prec),
                                   specifier=Comparison.NotEqual,
                                   precision=ML_Bool),
                        Constant(1, precision=ML_StdLogic),
                        Constant(0, precision=ML_StdLogic),
                        precision=ML_StdLogic,
                        tag="sticky_bit",
                        debug=debug_std)

    # increment selection for rounding to nearest (tie to even):
    # round_bit and (sticky_bit or mant_lsb)
    round_increment_RN = BitLogicAnd(round_bit,
                                     BitLogicOr(sticky_bit,
                                                mant_lsb,
                                                precision=ML_StdLogic),
                                     precision=ML_StdLogic,
                                     tag="round_increment_RN",
                                     debug=debug_std)

    # the mantissa field is extended by one bit so that a rounding
    # carry-out can be observed on bit (o - 1)
    rounded_mant = UnsignedAddition(zext(pre_mant_field, 1),
                                    round_increment_RN,
                                    precision=ML_StdLogicVectorFormat(o),
                                    tag="rounded_mant",
                                    debug=debug_std)
    rounded_overflow = BitExtraction(rounded_mant,
                                     IntCst(o - 1),
                                     tag="rounded_overflow",
                                     debug=debug_std)
    # on rounding overflow bits [1, o - 1] are selected, else [0, o - 2]
    res_mant_field = Select(Comparison(rounded_overflow,
                                       Constant(1, precision=ML_StdLogic),
                                       specifier=Comparison.Equal,
                                       precision=ML_Bool),
                            SubSignalSelection(rounded_mant, 1, o - 1),
                            SubSignalSelection(rounded_mant, 0, o - 2),
                            precision=ML_StdLogicVectorFormat(o - 1),
                            tag="final_mant",
                            debug=debug_std)

    res_exp_tmp_size = max(vx_precision.get_exponent_size(),
                           vy_precision.get_exponent_size(),
                           vz_precision.get_exponent_size()) + 2

    res_exp_tmp_prec = ML_StdLogicVectorFormat(res_exp_tmp_size)

    # Product biased exponent
    # is computed from both x and y exponent
    exp_xy_biased = UnsignedAddition(UnsignedAddition(
        UnsignedAddition(zext(
            exp_vy, res_exp_tmp_size - vy_precision.get_exponent_size()),
                         Constant(vy_precision.get_bias(),
                                  precision=res_exp_tmp_prec),
                         precision=res_exp_tmp_prec,
                         tag="exp_vy_biased",
                         debug=debug_dec),
        UnsignedAddition(zext(
            exp_vx, res_exp_tmp_size - vx_precision.get_exponent_size()),
                         Constant(vx_precision.get_bias(),
                                  precision=res_exp_tmp_prec),
                         precision=res_exp_tmp_prec,
                         tag="exp_vx_biased",
                         debug=debug_dec),
        precision=res_exp_tmp_prec),
                                     Constant(
                                         exp_offset + 1,
                                         precision=res_exp_tmp_prec,
                                     ),
                                     precision=res_exp_tmp_prec,
                                     tag="exp_xy_biased",
                                     debug=debug_dec)
    # vz's exponent is biased with the format bias
    # plus the exponent offset so it is left align to datapath MSB
    # NOTE(review): the exp_offset term is commented out in the constant
    # below, only the format bias + 1 is added.
    exp_vz_biased = UnsignedAddition(
        zext(exp_vz,
             res_exp_tmp_size - vz_precision.get_exponent_size()),
        Constant(
            vz_precision.get_bias() + 1,  # + exp_offset + 1,
            precision=res_exp_tmp_prec),
        precision=res_exp_tmp_prec,
        tag="exp_vz_biased",
        debug=debug_dec)

    # If exp diff is less than 0, then we must consider that vz's exponent is
    # the meaningful one and thus compute result exponent with respect
    # to vz's exponent value
    res_exp_base = Select(exp_diff_lt_0,
                          exp_vz_biased,
                          exp_xy_biased,
                          precision=res_exp_tmp_prec,
                          tag="res_exp_base",
                          debug=debug_dec)

    # Eventually the result exponent is built from the exponent base:
    # the opposite of the result format bias is added, the leading zero
    # count is subtracted and the rounding overflow bit is added
    res_exp_ext = UnsignedAddition(UnsignedSubtraction(
        UnsignedAddition(zext(res_exp_base, 0),
                         Constant(-result_precision.get_bias(),
                                  precision=res_exp_tmp_prec),
                         precision=res_exp_tmp_prec),
        zext(add_lzc, res_exp_tmp_size - lzc_width),
        precision=res_exp_tmp_prec),
                                   rounded_overflow,
                                   precision=res_exp_tmp_prec,
                                   tag="res_exp_ext",
                                   debug=debug_std)

    res_exp_prec = ML_StdLogicVectorFormat(
        result_precision.get_exponent_size())

    # the extended exponent is truncated to the result exponent width
    res_exp = Truncate(res_exp_ext,
                       precision=res_exp_prec,
                       tag="res_exp",
                       debug=debug_dec_unsigned)

    # result assembly from sign, exponent and mantissa fields
    vr_out = TypeCast(FloatBuild(
        res_sign,
        res_exp,
        res_mant_field,
        precision=accumulator_precision,
    ),
                      precision=accumulator_precision,
                      tag="result",
                      debug=debug_std)

    # adding pipeline stage after rounding
    if self.pipelined: self.implementation.start_new_stage()

    self.implementation.add_output_signal("vr_out", vr_out)

    return lzc_entity_list + [self.implementation]
def generate_scheme(self):
    """Build the operation graph of a multiply-accumulate on a fixed accumulator.

    The product of the floating-point inputs <x> and <y> (format
    ``self.precision``) is right-shifted to its position in a wide
    accumulator ``acc`` and added to it. The accumulator spans digit
    weights ``2 * emin_subnormal`` to ``2 * emax + 1 + self.extra_digit``.

    Two accumulator encodings are supported:
      * ``self.sign_magnitude`` set: the sign comes in on a separate
        ``sign_acc`` input and the outputs are ``vr_sign`` and ``vr_acc``;
      * otherwise: the sign is read from ``acc`` itself, the product is
        negated when its sign bit is set and the only output is
        ``vr_acc``.

    When ``self.pipelined`` is set, a new pipeline stage is started
    between the main steps; the stage a node belongs to depends on where
    it is built relative to the ``start_new_stage()`` calls, so statement
    order matters.

    Returns:
        a single-element list containing ``self.implementation``.
    """
    Log.report(
        Log.Info,
        "generating fixed MPFMA with {ed} extra digit(s) and sign-magnitude accumulator: {sm}"
        .format(ed=self.extra_digit, sm=self.sign_magnitude))

    # NOTE(review): this helper is not called anywhere in this method.
    def get_virtual_cst(prec, value, language):
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    ## convert @p value from an input floating-point precision
    #  @p in_precision to an output support format @p out_precision
    # NOTE(review): the description above does not seem to match the
    # statement that follows (format of the x and y input ports).
    io_precision = HdlVirtualFormat(self.precision)

    # declaring standard clock and reset input signal
    #clk = self.implementation.add_input_signal("clk", ML_StdLogic)
    # reset = self.implementation.add_input_signal("reset", ML_StdLogic)
    # declaring main input variable

    # maximum weight for a mantissa product digit
    max_prod_exp = self.precision.get_emax() * 2 + 1
    # minimum weight for a mantissa product digit
    min_prod_exp = self.precision.get_emin_subnormal() * 2

    ## Most and least significant digit index for the
    #  accumulator
    acc_msb_index = max_prod_exp + self.extra_digit
    acc_lsb_index = min_prod_exp

    acc_width = acc_msb_index - min_prod_exp + 1
    # precision of the accumulator
    acc_prec = ML_StdLogicVectorFormat(acc_width)

    # reset input port (not referenced again in this method)
    reset = self.implementation.add_input_signal("reset", ML_StdLogic)

    vx = self.implementation.add_input_signal("x", io_precision)
    vy = self.implementation.add_input_signal("y", io_precision)

    # Inserting post-input pipeline stage
    if self.pipelined: self.implementation.start_new_stage()

    # the accumulator input is declared after the post-input stage has
    # been opened
    acc = self.implementation.add_input_signal("acc", acc_prec)
    if self.sign_magnitude:
        # the accumulator is in sign-magnitude representation:
        # its sign is provided on a dedicated input
        sign_acc = self.implementation.add_input_signal(
            "sign_acc", ML_StdLogic)
    else:
        # otherwise the sign is extracted from the accumulator value
        sign_acc = CopySign(acc,
                            precision=ML_StdLogic,
                            tag="sign_acc",
                            debug=debug_std)

    vx_precision = self.precision
    vy_precision = self.precision
    # NOTE(review): result_precision is not referenced again in this
    # method.
    result_precision = acc_prec

    # mantissa size of the first product operand vx
    p = vx_precision.get_mantissa_size()
    # mantissa size of the second product operand vy
    q = vy_precision.get_mantissa_size()

    # formats of the exponent fields of x and y
    # NOTE(review): legacy remark kept from the original: "vx must be
    # aligned with vy, the largest shift amount (in absolute value) is
    # precision + 2 (1 guard bit and 1 rounding bit)" -- it seems to
    # predate this datapath, where the product is the shifted operand.
    exp_vx_precision = ML_StdLogicVectorFormat(
        vx_precision.get_exponent_size())
    exp_vy_precision = ML_StdLogicVectorFormat(
        vy_precision.get_exponent_size())

    # NOTE(review): mantissas are extracted on (p - 1) and (q - 1) bits
    # while the product below is declared on (p + q) bits -- confirm
    # the intended widths.
    mant_vx_precision = ML_StdLogicVectorFormat(p - 1)
    mant_vy_precision = ML_StdLogicVectorFormat(q - 1)

    mant_vx = MantissaExtraction(vx, precision=mant_vx_precision)
    mant_vy = MantissaExtraction(vy, precision=mant_vy_precision)

    exp_vx = ExponentExtraction(vx,
                                precision=exp_vx_precision,
                                tag="exp_vx",
                                debug=debug_dec)
    exp_vy = ExponentExtraction(vy,
                                precision=exp_vy_precision,
                                tag="exp_vy",
                                debug=debug_dec)

    # Maximum number of leading zero for normalized <vx> mantissa
    L_x = 0
    # Maximum number of leading zero for normalized <vy> mantissa
    L_y = 0
    # Maximum number of leading zero for the product of <x>.<y>
    # mantissa. (not referenced again in this method)
    L_xy = L_x + L_y + 1

    sign_vx = CopySign(vx, precision=ML_StdLogic)
    sign_vy = CopySign(vy, precision=ML_StdLogic)

    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    sign_xy = BitLogicXor(sign_vx,
                          sign_vy,
                          precision=ML_StdLogic,
                          tag="sign_xy",
                          debug=ML_Debug(display_format="-radix 2"))
    effective_op = BitLogicXor(sign_xy,
                               sign_acc,
                               precision=ML_StdLogic,
                               tag="effective_op",
                               debug=ML_Debug(display_format="-radix 2"))

    exp_vx_bias = vx_precision.get_bias()
    exp_vy_bias = vy_precision.get_bias()

    # <acc> is statically positioned in the datapath,
    # it may even constitute the whole datapath
    #
    # the product is shifted with respect to the fixed accumulator

    exp_bias = (exp_vx_bias + exp_vy_bias)

    # because of the mantissa range [1, 2[, the product exponent
    # is located one bit to the right (lower) of the product MSB
    prod_exp_offset = 1

    # Determine a working precision to accommodate exponent difference
    # FIXME: check interval and exponent operations size
    exp_precision_ext_size = max(
        vx_precision.get_exponent_size(),
        vy_precision.get_exponent_size(),
        abs(ceil(log2(abs(acc_msb_index)))),
        abs(ceil(log2(abs(acc_lsb_index)))),
        abs(ceil(log2(abs(exp_bias + prod_exp_offset)))),
    ) + 2
    Log.report(Log.Info,
               "exp_precision_ext_size={}".format(exp_precision_ext_size))
    exp_precision_ext = ML_StdLogicVectorFormat(exp_precision_ext_size)

    # static accumulator exponent
    exp_acc = Constant(acc_msb_index,
                       precision=exp_precision_ext,
                       tag="exp_acc",
                       debug=debug_cst_dec)

    # Shift amount of the product with respect to the accumulator MSB:
    #   exp_diff = exp_acc - (exp_vy + exp_vx + exp_bias + prod_exp_offset)
    # NOTE(review): legacy remark kept from the original, which seems to
    # describe a two-operand adder rather than this datapath:
    #   Y is first aligned offset = max(o+L_y,q) + 2 bits to the left of x
    #   and then shifted right by
    #   exp_diff = exp_x - exp_y + offset
    #   exp_vx in [emin, emax]
    #   exp_vx - exp_vx + p +2 in [emin-emax + p + 2, emax - emin + p + 2]
    exp_diff = Subtraction(
        exp_acc,
        Addition(Addition(zext(
            exp_vy,
            exp_precision_ext_size - vy_precision.get_exponent_size()),
                          zext(
                              exp_vx, exp_precision_ext_size -
                              vx_precision.get_exponent_size()),
                          precision=exp_precision_ext),
                 Constant(exp_bias + prod_exp_offset,
                          precision=exp_precision_ext,
                          tag="diff_bias",
                          debug=debug_cst_dec),
                 precision=exp_precision_ext,
                 tag="pre_exp_diff",
                 debug=debug_dec),
        precision=exp_precision_ext,
        tag="exp_diff",
        debug=debug_dec)
    # NOTE(review): the cast targets the same exp_precision_ext format
    # as exp_diff itself -- confirm this yields a signed comparison.
    signed_exp_diff = SignCast(exp_diff,
                               specifier=SignCast.Signed,
                               precision=exp_precision_ext)
    datapath_full_width = acc_width
    # the maximum exp diff is the size of the datapath
    # minus the bit size of the product
    max_exp_diff = datapath_full_width - (p + q)
    exp_diff_lt_0 = Comparison(signed_exp_diff,
                               Constant(0, precision=exp_precision_ext),
                               specifier=Comparison.Less,
                               precision=ML_Bool,
                               tag="exp_diff_lt_0",
                               debug=debug_std)
    exp_diff_gt_max_diff = Comparison(signed_exp_diff,
                                      Constant(
                                          max_exp_diff,
                                          precision=exp_precision_ext),
                                      specifier=Comparison.Greater,
                                      precision=ML_Bool)

    # number of bits required to encode max_exp_diff
    shift_amount_prec = ML_StdLogicVectorFormat(
        int(floor(log2(max_exp_diff)) + 1))

    # shift amount clamped to [0, max_exp_diff]: 0 when exp_diff is
    # negative, max_exp_diff when exp_diff exceeds it, else exp_diff
    # truncated to the shift amount width
    mant_shift = Select(exp_diff_lt_0,
                        Constant(0, precision=shift_amount_prec),
                        Select(exp_diff_gt_max_diff,
                               Constant(max_exp_diff,
                                        precision=shift_amount_prec),
                               Truncate(exp_diff,
                                        precision=shift_amount_prec),
                               precision=shift_amount_prec),
                        precision=shift_amount_prec,
                        tag="mant_shift",
                        debug=ML_Debug(display_format="-radix 10"))

    prod_prec = ML_StdLogicVectorFormat(p + q)
    prod = Multiplication(mant_vx,
                          mant_vy,
                          precision=prod_prec,
                          tag="prod",
                          debug=debug_std)

    # attempt at pipelining the operator
    # self.implementation.start_new_stage()

    # the product is extended to the accumulator width through rzext
    # (presumably a right zero-extension -- confirm) and then
    # right-shifted by mant_shift
    mant_ext_size = datapath_full_width - (p + q)
    shift_prec = ML_StdLogicVectorFormat(datapath_full_width)
    shifted_prod = BitLogicRightShift(rzext(prod, mant_ext_size),
                                      mant_shift,
                                      precision=shift_prec,
                                      tag="shifted_prod",
                                      debug=debug_std)

    ## Inserting a pipeline stage after the product shifting
    if self.pipelined: self.implementation.start_new_stage()

    if self.sign_magnitude:
        # the accumulator is in sign-magnitude representation

        # the accumulator is bit-negated when its sign differs from
        # the product sign
        acc_negated = Select(Comparison(sign_xy,
                                        sign_acc,
                                        specifier=Comparison.Equal,
                                        precision=ML_Bool),
                             acc,
                             BitLogicNegate(acc, precision=acc_prec),
                             precision=acc_prec)

        # one extra MSB bit is added to the final addition
        # to detect overflows
        add_width = acc_width + 1
        add_prec = ML_StdLogicVectorFormat(add_width)

        # FIXME: implement with a proper compound adder
        mant_add_p0_ext = Addition(zext(shifted_prod, 1),
                                   zext(acc_negated, 1),
                                   precision=add_prec)
        # NOTE(review): this node carries the same "mant_add" tag as
        # the final mant_add Select below -- confirm it is intended.
        mant_add_p1_ext = Addition(
            mant_add_p0_ext,
            Constant(1, precision=ML_StdLogic),
            precision=add_prec,
            tag="mant_add",
            debug=ML_Debug(display_format=" -radix 2"))
        # discarding carry overflow bit
        mant_add_p0 = SubSignalSelection(mant_add_p0_ext,
                                         0,
                                         acc_width - 1,
                                         precision=acc_prec)
        mant_add_p1 = SubSignalSelection(mant_add_p1_ext,
                                         0,
                                         acc_width - 1,
                                         precision=acc_prec)

        mant_add_pre_sign = CopySign(mant_add_p1_ext,
                                     precision=ML_StdLogic,
                                     tag="mant_add_pre_sign",
                                     debug=debug_std)
        # same signs: plain sum (p0); opposite signs: p1 when
        # mant_add_pre_sign is set, else the bit-negated p0
        mant_add = Select(Comparison(sign_xy,
                                     sign_acc,
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          mant_add_p0,
                          Select(
                              Comparison(mant_add_pre_sign,
                                         Constant(1,
                                                  precision=ML_StdLogic),
                                         specifier=Comparison.Equal,
                                         precision=ML_Bool),
                              mant_add_p1,
                              BitLogicNegate(mant_add_p0,
                                             precision=acc_prec),
                              precision=acc_prec,
                          ),
                          precision=acc_prec,
                          tag="mant_add")

        # if both operands had the same sign, then
        # mant_add is necessarily positive and the result
        # sign matches the input sign
        # if both operands had opposite signs, then
        # the result sign matches the product sign
        # if mant_add is positive, else the accumulator sign
        output_sign = Select(
            Comparison(effective_op,
                       Constant(1, precision=ML_StdLogic),
                       specifier=Comparison.Equal,
                       precision=ML_Bool),
            # if the effective op is a subtraction (prod - acc)
            BitLogicXor(sign_acc,
                        mant_add_pre_sign,
                        precision=ML_StdLogic),
            # the effective op is an addition, thus result and
            # acc share sign
            sign_acc,
            precision=ML_StdLogic,
            tag="output_sign")

        if self.pipelined: self.implementation.start_new_stage()

        # adding output
        self.implementation.add_output_signal("vr_sign", output_sign)
        self.implementation.add_output_signal("vr_acc", mant_add)

    else:
        # 2s complement encoding of the accumulator,
        # the accumulator is never negated, only the product
        # is negated if negative

        # negate shifted prod when required
        shifted_prod_op = Select(Comparison(sign_xy,
                                            Constant(
                                                1, precision=ML_StdLogic),
                                            specifier=Comparison.Equal,
                                            precision=ML_Bool),
                                 Negation(shifted_prod,
                                          precision=shift_prec),
                                 shifted_prod,
                                 precision=shift_prec)

        # NOTE(review): add_prec is not referenced in this branch, the
        # addition below uses acc_prec.
        add_prec = shift_prec  # ML_StdLogicVectorFormat(datapath_full_width + 1)

        mant_add = Addition(shifted_prod_op,
                            acc,
                            precision=acc_prec,
                            tag="mant_add",
                            debug=ML_Debug(display_format=" -radix 2"))

        if self.pipelined: self.implementation.start_new_stage()

        self.implementation.add_output_signal("vr_acc", mant_add)

    return [self.implementation]
def __init__(self, precision = ML_Binary32, abs_accuracy = S2**-24, libm_compliant = True, debug_flag = False, fuse_fma = True, fast_path_extract = True, target = GenericProcessor(), output_file = "expf.c", function_name = "expf"): # declaring target and instantiating optimization engine processor = target self.precision = precision opt_eng = OptimizationEngine(processor) gappacg = GappaCodeGenerator(processor, declare_cst = True, disable_debug = True) # declaring CodeFunction and retrieving input variable self.function_name = function_name exp_implementation = CodeFunction(self.function_name, output_format = self.precision) vx = exp_implementation.add_input_variable("x", self.precision) Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m") # local overloading of RaiseReturn operation def ExpRaiseReturn(*args, **kwords): kwords["arg_value"] = vx kwords["function_name"] = self.function_name return RaiseReturn(*args, **kwords) test_nan_or_inf = Test(vx, specifier = Test.IsInfOrNaN, likely = False, debug = True, tag = "nan_or_inf") test_nan = Test(vx, specifier = Test.IsNaN, debug = True, tag = "is_nan_test") test_positive = Comparison(vx, 0, specifier = Comparison.GreaterOrEqual, debug = True, tag = "inf_sign") test_signaling_nan = Test(vx, specifier = Test.IsSignalingNaN, debug = True, tag = "is_signaling_nan") return_snan = Statement(ExpRaiseReturn(ML_FPE_Invalid, return_value = FP_QNaN(self.precision))) # return in case of infinity input infty_return = Statement(ConditionBlock(test_positive, Return(FP_PlusInfty(self.precision)), Return(FP_PlusZero(self.precision)))) # return in case of specific value input (NaN or inf) specific_return = ConditionBlock(test_nan, ConditionBlock(test_signaling_nan, return_snan, Return(FP_QNaN(self.precision))), infty_return) # return in case of standard (non-special) input # exclusion of early overflow and underflow cases precision_emax = self.precision.get_emax() precision_max_value = S2 * S2**precision_emax 
exp_overflow_bound = ceil(log(precision_max_value)) early_overflow_test = Comparison(vx, exp_overflow_bound, likely = False, specifier = Comparison.Greater) early_overflow_return = Statement(ClearException(), ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Overflow, return_value = FP_PlusInfty(self.precision))) precision_emin = self.precision.get_emin_subnormal() precision_min_value = S2 ** precision_emin exp_underflow_bound = floor(log(precision_min_value)) early_underflow_test = Comparison(vx, exp_underflow_bound, likely = False, specifier = Comparison.Less) early_underflow_return = Statement(ClearException(), ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Underflow, return_value = FP_PlusZero(self.precision))) sollya_prec_map = {ML_Binary32: sollya.binary32, ML_Binary64: sollya.binary64} # constant computation invlog2 = round(1/log(2), sollya_prec_map[self.precision], RN) interval_vx = Interval(exp_underflow_bound, exp_overflow_bound) interval_fk = interval_vx * invlog2 interval_k = Interval(floor(inf(interval_fk)), ceil(sup(interval_fk))) log2_hi_precision = self.precision.get_field_size() - (ceil(log2(sup(abs(interval_k)))) + 2) Log.report(Log.Info, "log2_hi_precision: "), log2_hi_precision invlog2_cst = Constant(invlog2, precision = self.precision) log2_hi = round(log(2), log2_hi_precision, sollya.RN) log2_lo = round(log(2) - log2_hi, sollya_prec_map[self.precision], sollya.RN) # argument reduction unround_k = vx * invlog2 unround_k.set_attributes(tag = "unround_k", debug = ML_Debug(display_format = "%f")) k = NearestInteger(unround_k, precision = self.precision, debug = ML_Debug(display_format = "%f")) ik = NearestInteger(unround_k, precision = ML_Int32, debug = ML_Debug(display_format = "%d"), tag = "ik") ik.set_tag("ik") k.set_tag("k") exact_pre_mul = (k * log2_hi) exact_pre_mul.set_attributes(exact= True) exact_hi_part = vx - exact_pre_mul exact_hi_part.set_attributes(exact = True) r = exact_hi_part - k * log2_lo r.set_tag("r") r.set_attributes(debug = 
ML_Debug(display_format = "%f")) opt_r = opt_eng.optimization_process(r, self.precision, copy = True, fuse_fma = fuse_fma) tag_map = {} opt_eng.register_nodes_by_tag(opt_r, tag_map) cg_eval_error_copy_map = { vx: Variable("x", precision = self.precision, interval = interval_vx), tag_map["k"]: Variable("k", interval = interval_k, precision = self.precision) } #try: if 1: #eval_error = gappacg.get_eval_error(opt_r, cg_eval_error_copy_map, gappa_filename = "red_arg.g") eval_error = gappacg.get_eval_error_v2(opt_eng, opt_r, cg_eval_error_copy_map, gappa_filename = "red_arg.g") Log.report(Log.Info, "eval error: %s" % eval_error) #except: # Log.report(Log.Info, "gappa error evaluation failed") print r.get_str(depth = None, display_precision = True, display_attribute = True) print opt_r.get_str(depth = None, display_precision = True, display_attribute = True) approx_interval = Interval(-log(2)/2, log(2)/2) local_ulp = sup(ulp(exp(approx_interval), self.precision)) print "ulp: ", local_ulp error_goal = local_ulp #S2**-(self.precision.get_field_size()+1) error_goal_approx = S2**-1 * error_goal Log.report(Log.Info, "\033[33;1m building mathematical polynomial \033[0m\n") poly_degree = sup(guessdegree(exp(x), approx_interval, error_goal_approx)) #- 1 init_poly_degree = poly_degree return while 1: Log.report(Log.Info, "attempting poly degree: %d" % poly_degree) poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(exp(x), poly_degree, [self.precision]*(poly_degree+1), approx_interval, absolute) Log.report(Log.Info, "poly approx error: %s" % poly_approx_error) Log.report(Log.Info, "\033[33;1m generating polynomial evaluation scheme \033[0m") poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, r, unified_precision = self.precision) poly.set_tag("poly") # optimizing poly before evaluation error computation opt_poly = opt_eng.optimization_process(poly, self.precision) #print "poly: ", poly.get_str(depth = None, display_precision = True) 
#print "opt_poly: ", opt_poly.get_str(depth = None, display_precision = True) # evaluating error of the polynomial approximation r_gappa_var = Variable("r", precision = self.precision, interval = approx_interval) poly_error_copy_map = { r.get_handle().get_node(): r_gappa_var } gappacg = GappaCodeGenerator(target, declare_cst = False, disable_debug = True) poly_eval_error = gappacg.get_eval_error_v2(opt_eng, poly.get_handle().get_node(), poly_error_copy_map, gappa_filename = "gappa_poly.g") Log.report(Log.Info, "poly evaluation error: %s" % poly_eval_error) global_poly_error = poly_eval_error + poly_approx_error global_rel_poly_error = global_poly_error / exp(approx_interval) print "global_poly_error: ", global_poly_error, global_rel_poly_error flag = local_ulp > sup(abs(global_rel_poly_error)) print "test: ", flag if flag: break else: if poly_degree > init_poly_degree + 5: Log.report(Log.Error, "poly degree search did not converge") poly_degree += 1 late_overflow_test = Comparison(ik, self.precision.get_emax(), specifier = Comparison.Greater, likely = False, debug = True, tag = "late_overflow_test") overflow_exp_offset = (self.precision.get_emax() - self.precision.get_field_size() / 2) diff_k = ik - overflow_exp_offset diff_k.set_attributes(debug = ML_Debug(display_format = "%d"), tag = "diff_k") late_overflow_result = (ExponentInsertion(diff_k) * poly) * ExponentInsertion(overflow_exp_offset) late_overflow_result.set_attributes(silent = False, tag = "late_overflow_result", debug = debugf) late_overflow_return = ConditionBlock(Test(late_overflow_result, specifier = Test.IsInfty, likely = False), ExpRaiseReturn(ML_FPE_Overflow, return_value = FP_PlusInfty(self.precision)), Return(late_overflow_result)) late_underflow_test = Comparison(k, self.precision.get_emin_normal(), specifier = Comparison.LessOrEqual, likely = False) underflow_exp_offset = 2 * self.precision.get_field_size() late_underflow_result = (ExponentInsertion(ik + underflow_exp_offset) * poly) * 
ExponentInsertion(-underflow_exp_offset) late_underflow_result.set_attributes(debug = ML_Debug(display_format = "%e"), tag = "late_underflow_result", silent = False) test_subnormal = Test(late_underflow_result, specifier = Test.IsSubnormal) late_underflow_return = Statement(ConditionBlock(test_subnormal, ExpRaiseReturn(ML_FPE_Underflow, return_value = late_underflow_result)), Return(late_underflow_result)) std_result = poly * ExponentInsertion(ik, tag = "exp_ik", debug = debug_lftolx) std_result.set_attributes(tag = "std_result", debug = debug_lftolx) result_scheme = ConditionBlock(late_overflow_test, late_overflow_return, ConditionBlock(late_underflow_test, late_underflow_return, Return(std_result))) std_return = ConditionBlock(early_overflow_test, early_overflow_return, ConditionBlock(early_underflow_test, early_underflow_return, result_scheme)) # main scheme Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m") scheme = ConditionBlock(test_nan_or_inf, Statement(ClearException(), specific_return), std_return) #print scheme.get_str(depth = None, display_precision = True) # fusing FMA if fuse_fma: Log.report(Log.Info, "\033[33;1m MDL fusing FMA \033[0m") scheme = opt_eng.fuse_multiply_add(scheme, silence = True) Log.report(Log.Info, "\033[33;1m MDL abstract scheme \033[0m") opt_eng.instantiate_abstract_precision(scheme, None) Log.report(Log.Info, "\033[33;1m MDL instantiated scheme \033[0m") opt_eng.instantiate_precision(scheme, default_precision = self.precision) Log.report(Log.Info, "\033[33;1m subexpression sharing \033[0m") opt_eng.subexpression_sharing(scheme) Log.report(Log.Info, "\033[33;1m silencing operation \033[0m") opt_eng.silence_fp_operations(scheme) # registering scheme as function implementation exp_implementation.set_scheme(scheme) # check processor support Log.report(Log.Info, "\033[33;1m checking processor support \033[0m") opt_eng.check_processor_support(scheme) # factorizing fast path if fast_path_extract: Log.report(Log.Info, "\033[33;1m 
factorizing fast path\033[0m") opt_eng.factorize_fast_path(scheme) Log.report(Log.Info, "\033[33;1m generating source code \033[0m") cg = CCodeGenerator(processor, declare_cst = False, disable_debug = not debug_flag, libm_compliant = libm_compliant) self.result = exp_implementation.get_definition(cg, C_Code, static_cst = True) #self.result.add_header("support_lib/ml_types.h") self.result.add_header("support_lib/ml_special_values.h") self.result.add_header_comment("polynomial degree for exp(x): %d" % poly_degree) self.result.add_header_comment("sollya polynomial for exp(x): %s" % poly_object.get_sollya_object()) if debug_flag: self.result.add_header("stdio.h") self.result.add_header("inttypes.h") output_stream = open(output_file, "w")#"%s.c" % exp_implementation.get_name(), "w") output_stream.write(self.result.get(cg)) output_stream.close()
def generate_payne_hanek(vx, frac_pi, precision, n=100, k=4, chunk_num=None, debug=False):
    """Generate a Payne and Hanek argument reduction for frac_pi * vx.

    The constant frac_pi is cut into chunks stored in two tables (a
    normalized chunk table and a scale table).  The generated loop multiplies
    the high and low halves of vx by every relevant chunk and accumulates the
    fractional part in `acc` and the integer part (modulo 2**(k+1)) in
    `acc_int`.

    Args:
        vx: input operation node to be reduced.
        frac_pi: constant multiplicand (sollya value).
        precision: floating-point format of the computation; must be
            ML_Binary32 or ML_Binary64 (any other value raises KeyError).
        n: target accuracy is 2**-n; also added to the constant's exponent
            range to obtain the sollya working precision.
        k: the integer accumulator is kept modulo 2**(k + 1).
        chunk_num: unused, kept for interface compatibility.
        debug: when true, attaches debug display attributes to the
            "hi_mult_int" and "pre_exclude_hi" nodes.

    Returns:
        A tuple (result, acc, acc_int): the statement holding the reduction
        loop, the fractional accumulator variable and the integer
        accumulator variable.
    """
    # NOTE(review): this sets an attribute on the sollya module itself;
    # confirm it is not meant to be a sollya.settings entry.
    sollya.roundingwarnings = sollya.off
    debug_precision = debug_multi
    int_precision = {ML_Binary32: ML_Int32, ML_Binary64: ML_Int64}[precision]

    p = precision.get_field_size()

    # weight of the most significant digit of the constant
    cst_msb = floor(log2(abs(frac_pi)))
    # length of exponent range which must be covered by the approximation
    # of the constant
    cst_exp_range = cst_msb - precision.get_emin_subnormal() + 1

    # chunk size has to be such that the multiplication by a split <v>
    # (vx_hi or vx_lo) is exact.
    # Floor division keeps these sizes integral under Python 3 as well
    # (assumes get_field_size() returns an int -- TODO confirm).
    chunk_size = precision.get_field_size() // 2 - 2
    chunk_number = int(ceil((cst_exp_range + chunk_size - 1) / chunk_size))
    scaling_factor = S2**-(chunk_size // 2)

    chunk_size_cst = Constant(chunk_size, precision=ML_Int32)
    cst_msb_node = Constant(cst_msb, precision=ML_Int32)

    # Raise sollya's global precision while the constant is split into
    # chunks, and always restore it afterwards (even on error).
    old_global_prec = sollya.settings.prec
    sollya.settings.prec = cst_exp_range + n
    try:
        # table to store chunks of the constant multiplicand
        cst_table = ML_NewTable(dimensions=[chunk_number, 1],
                                storage_precision=precision,
                                tag="PH_cst_table")
        # table to store sqrt(scaling_factor) corresponding to the
        # cst multiplicand chunks
        scale_table = ML_NewTable(dimensions=[chunk_number, 1],
                                  storage_precision=precision,
                                  tag="PH_scale_table")
        tmp_cst = frac_pi

        # cst_table stores normalized constant chunks (they have been
        # scaled back close to the 1.0 interval)
        #
        # scale_table stores the scaling factors corresponding to the
        # denormalization of cst_table coefficients

        # this loop divides the digits of frac_pi into chunks;
        # the chunk lsb weight is given by a shift from
        # cst_msb, multiple of the chunk index
        for i in range(chunk_number):
            value_div_factor = S2**(chunk_size * (i + 1) - cst_msb)
            local_cst = int(tmp_cst * value_div_factor) / value_div_factor
            local_scale = (scaling_factor**i)
            # storing scaled constant chunks
            cst_table[i][0] = local_cst / (local_scale**2)
            scale_table[i][0] = local_scale
            # Updating constant value
            tmp_cst = tmp_cst - local_cst
    finally:
        # restoring sollya's global precision
        sollya.settings.prec = old_global_prec

    # Computing which part of the constant we do not need to multiply
    # In the following comments, vi represents the bit of frac_pi of weight 2**-i

    # Bits vi so that i <= (vx_exp - p + 1 -k) are not needed, because they result
    # in a multiple of 2pi and do not contribute to trig functions.
    vx_exp = ExponentExtraction(
        vx, precision=vx.get_precision().get_integer_format())
    vx_exp = Conversion(vx_exp, precision=ML_Int32)

    msb_exp = -(vx_exp - p + 1 - k)
    msb_exp.set_attributes(tag="msb_exp", debug=debug_multi)
    msb_exp = Conversion(msb_exp, precision=ML_Int32)

    # Select the highest index where the reduction should start
    msb_index = Select(cst_msb_node < msb_exp, 0,
                       (cst_msb_node - msb_exp) / chunk_size_cst)
    msb_index.set_attributes(tag="msb_index", debug=debug_multi)

    # For a desired accuracy of 2**-n, bits vi so that i >= (vx_exp + n + 4)
    # are not needed, because they contribute less than 2**-n to the result
    lsb_exp = -(vx_exp + n + 4)
    lsb_exp.set_attributes(tag="lsb_exp", debug=debug_multi)
    lsb_exp = Conversion(lsb_exp, precision=ML_Int32)

    # Index of the corresponding chunk
    lsb_index = (cst_msb_node - lsb_exp) / chunk_size_cst
    lsb_index.set_attributes(tag="lsb_index", debug=debug_multi)

    # Splitting vx
    half_size = precision.get_field_size() // 2 + 1

    # hi part (most significant digits) of vx input: the low half_size bits
    # of the encoding are masked out
    vx_hi = TypeCast(BitLogicAnd(
        TypeCast(vx, precision=int_precision),
        Constant(~int(2**half_size - 1), precision=int_precision)),
        precision=precision)
    vx_hi.set_attributes(tag="vx_hi_ph")

    vx_lo = vx - vx_hi
    vx_lo.set_attributes(tag="vx_lo_ph")

    # loop iterator variable
    vi = Variable("i", precision=ML_Int32, var_type=Variable.Local)
    i1 = Constant(1, precision=ML_Int32)

    # accumulator to the output precision
    acc = Variable("acc", precision=precision, var_type=Variable.Local)
    # integer accumulator
    acc_int = Variable("acc_int", precision=int_precision,
                       var_type=Variable.Local)

    init_loop = Statement(
        vx_hi,
        vx_lo,
        ReferenceAssign(vi, msb_index),
        ReferenceAssign(acc, Constant(0, precision=precision)),
        ReferenceAssign(acc_int, Constant(0, precision=int_precision)),
    )

    cst_load = TableLoad(cst_table, vi, 0, tag="cst_load",
                         debug=debug_precision)
    sca_load = TableLoad(scale_table, vi, 0, tag="sca_load",
                         debug=debug_precision)

    # loop body
    # hi_mult = vx_hi * <scale_factor> * <cst>
    hi_mult = (vx_hi * sca_load) * (cst_load * sca_load)
    hi_mult.set_attributes(tag="hi_mult", debug=debug_precision)
    pre_hi_mult_int = NearestInteger(hi_mult, precision=int_precision,
                                     tag="hi_mult_int",
                                     debug=(debuglld if debug else None))
    hi_mult_int_f = Conversion(pre_hi_mult_int, precision=precision,
                               tag="hi_mult_int_f", debug=debug_precision)
    pre_hi_mult_red = (hi_mult - hi_mult_int_f).modify_attributes(
        tag="hi_mult_red", debug=debug_precision)

    # for the first chunks (vx_hi * <constant chunk>) exceeds 2**k+1 and may
    # be discarded (whereas it may lead to overflow during integer conversion)
    pre_exclude_hi = ((cst_msb_node - (vi + i1) * chunk_size + i1) +
                      (vx_exp + Constant(-half_size + 1, precision=ML_Int32))
                      ).modify_attributes(tag="pre_exclude_hi",
                                          debug=(debugd if debug else None))
    pre_exclude_hi.propagate_precision(ML_Int32,
                                       [cst_msb_node, vi, vx_exp, i1])
    Ck = Constant(k, precision=ML_Int32)
    exclude_hi = pre_exclude_hi <= Ck
    exclude_hi.set_attributes(tag="exclude_hi", debug=debug_multi)

    hi_mult_red = Select(exclude_hi, pre_hi_mult_red,
                         Constant(0, precision=precision))
    hi_mult_int = Select(exclude_hi, pre_hi_mult_int,
                         Constant(0, precision=int_precision))

    # lo part of the chunk reduction
    lo_mult = (vx_lo * sca_load) * (cst_load * sca_load)
    lo_mult.set_attributes(tag="lo_mult")
    lo_mult_int = NearestInteger(lo_mult, precision=int_precision,
                                 tag="lo_mult_int")
    lo_mult_int_f = Conversion(lo_mult_int, precision=precision,
                               tag="lo_mult_int_f")
    lo_mult_red = (lo_mult - lo_mult_int_f).modify_attributes(
        tag="lo_mult_red")

    # accumulating fractional part
    acc_expr = (acc + hi_mult_red) + lo_mult_red
    # accumulating integer part
    int_expr = ((acc_int + hi_mult_int) + lo_mult_int) % 2**(k + 1)

    # extracting exceeding integer part in fractional accumulator
    acc_expr_int = NearestInteger(acc_expr, precision=int_precision)
    # normalizing integer and fractional accumulators by subtracting then
    # adding the exceeding integer part
    normalization = Statement(
        ReferenceAssign(
            acc, acc_expr - Conversion(acc_expr_int, precision=precision)),
        ReferenceAssign(acc_int, int_expr + acc_expr_int),
    )

    acc_expr.set_attributes(tag="acc_expr")
    int_expr.set_attributes(tag="int_expr")

    red_loop = Loop(
        init_loop,
        vi <= lsb_index,
        Statement(acc_expr, int_expr, normalization,
                  ReferenceAssign(vi, vi + 1)))

    result = Statement(lsb_index, msb_index, red_loop)

    return result, acc, acc_int
def eval_argument_reduction(self, size1, prec1, size2, prec2):
    """Evaluate a two-step table-based argument reduction with gappa.

    The reduction starts from x = 1 + dx (dx on 52 fractional bits).  x is
    truncated to size1 bits (x1), multiplied by an upward-rounded inverse of
    x1 stored on prec1 bits to give y; the same step is repeated on y with
    size2 / prec2 to give z.  Gappa is asked for the ranges of dz = z - 1,
    dy = y - 1 and of both table indexes.

    Args:
        size1: number of fractional bits of the first table index.
        prec1: number of fractional bits of the first table's inverse.
        size2: number of fractional bits of the second table index.
        prec2: number of fractional bits of the second table's inverse.

    Returns:
        A dict holding the arguments, the table lengths and byte sizes, and
        the input / intermediate (dy) / output (gappa 'goal') intervals.
    """
    one = Constant(1, precision = ML_Exact, tag = "one")
    dx = Variable("dx",
                  precision = ML_Custom_FixedPoint_Format(0, 52, False),
                  interval = Interval(0, 1 - S2**-52))

    # do the argument reduction
    x = Addition(dx, one, tag = "x", precision = ML_Exact)
    x1 = Conversion(x, tag = "x1",
                    precision = ML_Custom_FixedPoint_Format(0, size1, False),
                    rounding_mode = ML_RoundTowardMinusInfty)
    s = Multiplication(Subtraction(x1, one, precision = ML_Exact),
                       Constant(S2**size1, precision = ML_Exact),
                       precision = ML_Exact,
                       tag = "indexTableX")
    inv_x1 = Division(one, x1, tag = "ix1", precision = ML_Exact)
    inv_x = Conversion(inv_x1, tag = "ix",
                       precision = ML_Custom_FixedPoint_Format(1, prec1, False),
                       rounding_mode = ML_RoundTowardPlusInfty)
    y = Multiplication(x, inv_x, tag = "y", precision = ML_Exact)
    dy = Subtraction(y, one, tag = "dy", precision = ML_Exact)
    # NOTE(review): y1 reuses the tag "y" already given to y (x1 is tagged
    # "x1"); confirm whether "y1" was intended before changing it.
    y1 = Conversion(y, tag = "y",
                    precision = ML_Custom_FixedPoint_Format(0, size2, False),
                    rounding_mode = ML_RoundTowardMinusInfty)
    t = Multiplication(Subtraction(y1, one, precision = ML_Exact),
                       Constant(S2**size2, precision = ML_Exact),
                       precision = ML_Exact,
                       tag = "indexTableY")
    inv_y1 = Division(one, y1, tag = "iy1", precision = ML_Exact)
    inv_y = Conversion(inv_y1, tag = "iy",
                       precision = ML_Custom_FixedPoint_Format(1, prec2, False),
                       rounding_mode = ML_RoundTowardPlusInfty)
    z = Multiplication(y, inv_y, tag = "z", precision = ML_Exact)
    dz = Subtraction(z, one, tag = "dz", precision = ML_Exact)

    # add the necessary goals and hints
    dx_gappa = Variable("dx_gappa", interval = dx.get_interval(),
                        precision = dx.get_precision())
    swap_map = {dx: dx_gappa}

    # goals (main goal: dz, the result of the argument reduction)
    gappa_code = self.gappa_engine.get_interval_code_no_copy(
        dz.copy(swap_map), bound_list = [dx_gappa])
    self.gappa_engine.add_goal(gappa_code, dy.copy(swap_map))
    # range of index of table 1
    self.gappa_engine.add_goal(gappa_code, s.copy(swap_map))
    # range of index of table 2
    self.gappa_engine.add_goal(gappa_code, t.copy(swap_map))

    # hints. are the ones with isApprox=True really necessary ?
    self.gappa_engine.add_hint(gappa_code, x.copy(swap_map),
                               x1.copy(swap_map), isApprox = True)
    self.gappa_engine.add_hint(gappa_code, y.copy(swap_map),
                               y1.copy(swap_map), isApprox = True)
    self.gappa_engine.add_hint(gappa_code, inv_x1.copy(swap_map),
                               inv_x.copy(swap_map), isApprox = True)
    self.gappa_engine.add_hint(gappa_code, inv_y1.copy(swap_map),
                               inv_y.copy(swap_map), isApprox = True)
    # x1 * (1 / x1) -> 1 and y1 * (1 / y1) -> 1, under a non-zero condition.
    # swap_map[inv_x1] / swap_map[inv_y1] are read here, so the copies made
    # by the hints above must come first.
    self.gappa_engine.add_hint(
        gappa_code,
        Multiplication(x1, inv_x1, precision = ML_Exact).copy(swap_map),
        one,
        Comparison(swap_map[inv_x1], Constant(0, precision = ML_Exact),
                   specifier = Comparison.NotEqual, precision = ML_Bool))
    self.gappa_engine.add_hint(
        gappa_code,
        Multiplication(y1, inv_y1, precision = ML_Exact).copy(swap_map),
        one,
        Comparison(swap_map[inv_y1], Constant(0, precision = ML_Exact),
                   specifier = Comparison.NotEqual, precision = ML_Bool))
    toto = Variable("toto", precision = ML_Binary64)
    self.gappa_engine.add_hypothesis(gappa_code, toto, Interval(0, S2**-52))

    # execute and parse the result
    result = execute_gappa_script_extract(gappa_code.get(self.gappa_engine))
    self.gappa_engine.clear_memoization_map() # avoid memory leak

    length_table1 = 1 + floor(sup(result['indexTableX'])).getConstantAsInt()
    length_table2 = 1 + floor(sup(result['indexTableY'])).getConstantAsInt()

    # Byte size of one table entry: 16 bytes plus the inverse's storage.
    # Floor division keeps the sizes integral under Python 3 too (assumes
    # get_c_bit_size() returns an int -- TODO confirm).
    entry_size1 = 16 + ML_Custom_FixedPoint_Format(1, prec1, False).get_c_bit_size() // 8
    entry_size2 = 16 + ML_Custom_FixedPoint_Format(1, prec2, False).get_c_bit_size() // 8

    return {
        # arguments
        'size1': size1,
        'prec1': prec1,
        'size2': size2,
        'prec2': prec2,
        # size of the tables
        'length_table1': length_table1,
        'length_table2': length_table2,
        'sizeof_table1': length_table1 * entry_size1,
        'sizeof_table2': length_table2 * entry_size2,
        # intervals
        'in_interval': dx.get_interval(),
        'mid_interval': result['dy'],
        'out_interval': result['goal'],
    }
def generate_reduction_fptaylor(x):
    """Bound the total error of the reduced polynomial evaluation on x.

    x is a sollya interval on which floor(x * n_invlog2) must be constant
    (the pair k_low == -1 with sup(x) == 0 is also accepted).  An FPTaylor
    query modelling `poly(x - k*log2) * 2**k` in rnd64 gives the rounding
    error; sollya.supnorm gives the approximation error of the polynomial
    against exp on the reduced interval.  Both are summed.

    Args:
        x: sollya interval to analyse.

    Returns:
        A tuple (rel_err, abs_err).  rel_err is float("inf") when no relative
        rounding bound is available or supnorm reports "error"; abs_err is
        float("inf") when no absolute rounding bound could be obtained.

    Raises:
        AssertionError: if x straddles a multiple of log(2).
    """
    # get k, must be the same at endpoints
    unround_k = x * n_invlog2
    k_low = sollya.floor(sollya.inf(unround_k))
    k_high = sollya.floor(sollya.sup(unround_k))
    if not (k_low == k_high or (k_low == -1 and sollya.sup(x) == 0)):
        # Raised explicitly so the check survives `python -O`.
        raise AssertionError("Interval must not straddle multiples of log(2)")
    k = int(k_low)
    r = x - k * n_log2
    twok = 2**k

    query = "\n".join([
        "Variables",
        " real x in [{},{}];".format(sollya.inf(x), sollya.sup(x)),
        "Definitions",
        " whole rnd64= {} * {};".format(k, n_log2),
        " r rnd64= x - whole;",
        " poly rnd64= {};".format(poly_expr),
        " retval rnd64= poly*{};".format(twok),
        "Expressions",
        " retval;"
    ])

    def final_total(res, kind):
        # Read the "final_total" bound of the given error kind from a result.
        return float(res.result[kind]["final_total"]["value"])

    rnd_rel_err = None
    rnd_abs_err = None

    # First attempt: ask for both the relative and the absolute error.
    try:
        res = fptaylor.Result(query, {
            **config,
            "--rel-error": "true",
            "--abs-error": "true"
        })
        rnd_rel_err = final_total(res, "relative_errors")
        rnd_abs_err = final_total(res, "absolute_errors")
    except AssertionError:
        pass
    except KeyError:
        # The relative bound is missing; the absolute one may still be there.
        try:
            rnd_abs_err = final_total(res, "absolute_errors")
        except KeyError:
            pass

    # Second attempt: absolute error only.
    if rnd_abs_err is None:
        try:
            res = fptaylor.Result(query, {
                **config,
                "--rel-error": "false",
                "--abs-error": "true"
            })
            rnd_abs_err = final_total(res, "absolute_errors")
        except AssertionError:
            pass

    # Approximation error of the polynomial on the reduced interval r.
    err_int = sollya.supnorm(self.poly_object.get_sollya_object(),
                             sollya.exp(sollya.x), r, sollya.relative,
                             2**-100)
    algo_rel_err = sollya.sup(err_int)

    err_int = sollya.supnorm(self.poly_object.get_sollya_object(),
                             sollya.exp(sollya.x), r, sollya.absolute,
                             2**-100)
    algo_abs_err = sollya.sup(err_int)

    if rnd_rel_err is None or str(algo_rel_err) == "error":
        rel_err = float("inf")
    else:
        rel_err = rnd_rel_err + algo_rel_err

    if rnd_abs_err is None:
        # Both FPTaylor runs failed: report an unbounded error rather than
        # failing on `None + algo_abs_err`.
        abs_err = float("inf")
    else:
        abs_err = rnd_abs_err + algo_abs_err

    return rel_err, abs_err
def split_domain(starting_domain, slivers):
    """Cut starting_domain into sorted sub-intervals with a constant k.

    The domain is first split at 0 when it contains both signs.  Each piece
    is then split until floor(x * n_invlog2) takes a single value on it (the
    pair -1 / 0 is accepted as well).  Finally every piece is halved
    `slivers` times.

    Returns the distinct, non-degenerate intervals sorted by lower bound.
    """
    def lower_bound(interval):
        return float(sollya.inf(interval))

    # abs: separate the negative and the positive side
    if sollya.inf(starting_domain) < 0 and sollya.sup(starting_domain) > 0:
        pending = [
            sollya.Interval(sollya.inf(starting_domain), 0),
            sollya.Interval(0, sollya.sup(starting_domain)),
        ]
    else:
        pending = [starting_domain]

    # k: split until each piece maps to a single multiple
    accepted = []
    while pending:
        piece = pending.pop()
        scaled = piece * n_invlog2
        k_lo = sollya.floor(sollya.inf(scaled))
        k_hi = sollya.floor(sollya.sup(scaled))

        if k_lo == k_hi or (k_lo == -1 and k_hi == 0):
            accepted.append(piece)
            continue

        # Bisect (100 steps) for the point where floor(x * n_invlog2)
        # stops being k_lo.
        bracket = sollya.Interval(k_lo, k_lo + 1.5) * n_log2
        for _ in range(100):
            middle = sollya.mid(bracket)
            if sollya.floor(middle * n_invlog2) == k_lo:
                bracket = sollya.Interval(middle, sollya.sup(bracket))
            else:
                bracket = sollya.Interval(sollya.inf(bracket), middle)

        cut_above = sollya.sup(bracket)
        cut_below = sollya.inf(bracket)
        below = sollya.Interval(sollya.inf(piece), cut_below)
        above = sollya.Interval(cut_above, sollya.sup(piece))

        # The lower part is pushed last so it is examined first.
        pending.append(above)
        pending.append(below)

    # subdivide each section into 2**slivers sections
    pieces = accepted
    for _ in range(slivers):
        halves = []
        for piece in pieces:
            middle = sollya.mid(piece)
            halves.append(sollya.Interval(sollya.inf(piece), middle))
            halves.append(sollya.Interval(middle, sollya.sup(piece)))
        pieces = halves

    ordered = sorted(set(pieces), key=lower_bound)
    return [d for d in ordered if sollya.inf(d) != sollya.sup(d)]
def generate_scheme(self):
    """Build a small fixed-point graph and check its evaluated ranges.

    Three input signals with known intervals are combined through ceil,
    floor, multiplication, addition/subtraction, min/max and select nodes.
    For a list of these nodes the range returned by evaluate_range is
    printed and asserted equal to the interval computed directly.

    Returns [self.implementation].
    """
    int_size = 3
    frac_size = self.width - int_size
    input_precision = fixed_point(int_size, frac_size)
    # not used by the checks below
    output_precision = fixed_point(int_size, frac_size)

    expected_interval = {}

    def declare_input(name, bounds):
        # Declare one input signal and record its interval as expected range.
        signal = self.implementation.add_input_signal(name, input_precision)
        signal.set_interval(bounds)
        expected_interval[signal] = bounds
        return signal

    # declaring main input variables
    x_interval = Interval(-10.3, 10.7)
    var_x = declare_input("x", x_interval)
    y_interval = Interval(-17.9, 17.2)
    var_y = declare_input("y", y_interval)
    z_interval = Interval(-7.3, 7.7)
    var_z = declare_input("z", z_interval)

    cst = Constant(42.5, tag="cst")
    expected_interval[cst] = Interval(42.5)

    conv_ceil = Ceil(var_x, tag="ceil")
    expected_interval[conv_ceil] = sollya.ceil(x_interval)

    conv_floor = Floor(var_y, tag="floor")
    expected_interval[conv_floor] = sollya.floor(y_interval)

    mult = var_z * var_x
    mult.set_tag("mult")
    mult_interval = z_interval * x_interval
    expected_interval[mult] = mult_interval

    large_add = (var_x + var_y) - mult
    large_add.set_attributes(tag="large_add")
    large_add_interval = (x_interval + y_interval) - mult_interval
    expected_interval[large_add] = large_add_interval

    # clamp large_add into [0, 13]
    reduced_result = Max(0, Min(large_add, 13))
    reduced_result.set_tag("reduced_result")
    clamped_above = interval_min(large_add_interval, Interval(13))
    reduced_result_interval = interval_max(Interval(0), clamped_above)
    expected_interval[reduced_result] = reduced_result_interval

    select_result = Select(var_x > var_y, reduced_result, var_z,
                           tag="select_result")
    expected_interval[select_result] = interval_union(
        reduced_result_interval, z_interval)

    # checking interval evaluation
    checked_nodes = [
        cst, var_x, var_y, mult, large_add, reduced_result, select_result,
        conv_ceil, conv_floor,
    ]
    for node in checked_nodes:
        interval = evaluate_range(node)
        expected = expected_interval[node]
        print("{}: {} vs expected {}".format(node.get_tag(), interval, expected))
        assert interval is not None
        assert interval == expected

    return [self.implementation]
def generate_scheme(self):
    """Build the datapath of a floating-point adder computing x + y.

    Declares the "reset", "x" and "y" input signals, aligns the mantissa of
    y against the mantissa of x, adds (or subtracts) them, normalizes the
    sum with a leading-zero count, rounds it to nearest (ties to even) and
    rebuilds a floating-point value which is registered as output signal
    "vr_out".

    Returns [self.implementation].
    """
    # unused in this scheme
    def get_virtual_cst(prec, value, language):
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    io_precision = self.precision
    # declaring reset input signal (the clock declaration is disabled)
    #clk = self.implementation.add_input_signal("clk", ML_StdLogic)
    reset = self.implementation.add_input_signal("reset", ML_StdLogic)

    # declaring main input variables
    vx = self.implementation.add_input_signal("x", io_precision)
    vy = self.implementation.add_input_signal("y", io_precision)

    base_precision = self.precision.get_base_format()
    p = base_precision.get_mantissa_size()

    # vx must be aligned with vy
    # the largest shift amount (in absolute value) is precision + 2
    # (1 guard bit and 1 rounding bit)
    exp_precision = ML_StdLogicVectorFormat(base_precision.get_exponent_size())

    mant_precision = ML_StdLogicVectorFormat(base_precision.get_mantissa_size())

    mant_vx = MantissaExtraction(vx, precision = mant_precision)
    mant_vy = MantissaExtraction(vy, precision = mant_precision)

    exp_vx = RawExponentExtraction(vx, precision = exp_precision)
    exp_vy = RawExponentExtraction(vy, precision = exp_precision)

    sign_vx = CopySign(vx, precision = ML_StdLogic)
    sign_vy = CopySign(vy, precision = ML_StdLogic)

    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    effective_op = BitLogicXor(sign_vx, sign_vy, precision = ML_StdLogic, tag = "effective_op", debug=debug_std)

    ## Wrapper for zero extension
    # @param op the input operation tree
    # @param s integer size of the extension
    # @return the Zero extended operation node
    def zext(op,s):
        op_size = op.get_precision().get_bit_size()
        ext_precision = ML_StdLogicVectorFormat(op_size + s)
        return ZeroExt(op, s, precision = ext_precision)

    ## Generate the right zero extended output from @p optree
    # (@p ext_size zero bits are concatenated after @p optree)
    def rzext(optree, ext_size):
        op_size = optree.get_precision().get_bit_size()
        ext_format = ML_StdLogicVectorFormat(ext_size)
        out_format = ML_StdLogicVectorFormat(op_size + ext_size)
        return Concatenation(optree, Constant(0, precision = ext_format), precision = out_format)

    exp_bias = p + 2
    exp_precision_ext = fixed_point(base_precision.get_exponent_size() + 2, 0)
    # from here on exp_precision is an unsigned fixed-point format (it was a
    # std_logic_vector format for the raw exponent extractions above)
    exp_precision = fixed_point(base_precision.get_exponent_size(), 0, signed=False)
    # Y is first aligned p+2 bit to the left of x
    # and then shifted right by
    # exp_diff = exp_x - exp_y + precision + 2
    # exp_vx in [emin, emax]
    # exp_vx - exp_vx + p +2 in [emin-emax + p + 2, emax - emin + p + 2]
    exp_diff = Subtraction(
        Addition(
            TypeCast(exp_vx, precision=exp_precision),
            Constant(exp_bias, precision=exp_precision_ext),
        ),
        TypeCast(exp_vy, precision=exp_precision),
    )
    exp_diff_lt_0 = Comparison(exp_diff, Constant(0, precision=exp_precision_ext), specifier = Comparison.Less, precision = ML_Bool)
    exp_diff_gt_2pp4 = Comparison(exp_diff, Constant(2*p+4, precision = exp_precision_ext), specifier = Comparison.Greater, precision = ML_Bool)

    shift_amount_size = int(floor(log2(2*p+4))+1)
    shift_amount_prec = ML_StdLogicVectorFormat(shift_amount_size)

    # the shift amount is exp_diff clamped to [0, 2*p+4]
    mant_shift = Select(
        exp_diff_lt_0,
        0,
        Select(
            exp_diff_gt_2pp4,
            Constant(2*p+4),
            exp_diff,
        ),
        tag = "mant_shift",
        debug = debug_dec
    )
    mant_shift = TypeCast(
        Conversion(mant_shift, precision=fixed_point(shift_amount_size, 0, signed=False)),
        precision=shift_amount_prec
    )

    mant_ext_size = 2*p+4
    shift_prec = ML_StdLogicVectorFormat(3*p+4)
    shifted_mant_vy = BitLogicRightShift(rzext(mant_vy, mant_ext_size), mant_shift, precision = shift_prec, tag = "shifted_mant_vy", debug = debug_std)
    mant_vx_ext = zext(rzext(mant_vx, p+2), p+2+1)
    mant_vx_ext.set_attributes(tag="mant_vx_ext")

    add_prec = ML_StdLogicVectorFormat(3*p+5)

    # x's mantissa is negated when the effective operation is a subtraction
    mant_vx_add_op = Select(
        Comparison(
            effective_op,
            Constant(1, precision = ML_StdLogic),
            precision = ML_Bool,
            specifier = Comparison.Equal
        ),
        Negation(mant_vx_ext, precision = add_prec, tag = "neg_mant_vx"),
        mant_vx_ext,
        precision = add_prec,
        tag = "mant_vx_add_op",
        debug=debug_cst_dec
    )

    mant_add = UnsignedAddition(
        zext(shifted_mant_vy, 1),
        mant_vx_add_op,
        precision = add_prec,
        tag = "mant_add",
        debug=debug_std
    )

    # if the addition overflows, then it meant vx has been negated and
    # the 2's complement addition cancelled the negative MSB, thus
    # the addition result is positive, and the result is of the sign of Y
    # else the result is of opposite sign to Y
    add_is_negative = BitLogicAnd(
        CopySign(mant_add, precision = ML_StdLogic),
        effective_op,
        precision = ML_StdLogic,
        tag = "add_is_negative",
        debug = debug_std
    )
    # Negate mantissa addition result if it is negative
    mant_add_abs = Select(
        Comparison(
            add_is_negative,
            Constant(1, precision = ML_StdLogic),
            specifier = Comparison.Equal,
            precision = ML_Bool
        ),
        Negation(mant_add, precision = add_prec, tag = "neg_mant_add"),
        mant_add,
        precision = add_prec,
        tag = "mant_add_abs"
    )

    res_sign = BitLogicXor(add_is_negative, sign_vy, precision = ML_StdLogic, tag = "res_sign")

    # Precision for leading zero count
    lzc_width = int(floor(log2(3*p+5)) + 1)
    lzc_prec = ML_StdLogicVectorFormat(lzc_width)

    add_lzc = CountLeadingZeros(
        mant_add_abs,
        precision=lzc_prec,
        tag="add_lzc",
        debug=debug_dec_unsigned
    )

    #add_lzc = CountLeadingZeros(mant_add, precision = lzc_prec)
    # CP stands for close path, the data path where X and Y are within 1 exp diff
    # NOTE(review): the shift below is applied to mant_add while add_lzc is
    # counted on mant_add_abs; confirm that mant_add (and not mant_add_abs)
    # is the intended operand when add_is_negative is set.
    res_normed_mant = BitLogicLeftShift(mant_add, add_lzc, precision = add_prec, tag = "res_normed_mant", debug = debug_std)
    pre_mant_field = SubSignalSelection(res_normed_mant, 2*p+5, 3*p+3, precision = ML_StdLogicVectorFormat(p-1))

    ## Helper function to extract a single bit
    # from a vector of bits signal
    def BitExtraction(optree, index, **kw):
        return VectorElementSelection(optree, index, precision = ML_StdLogic, **kw)

    ## Helper function building an ML_Integer constant node
    def IntCst(value):
        return Constant(value, precision = ML_Integer)

    # rounding bit: just below the selected mantissa field; mant_lsb: the
    # field's least significant bit
    round_bit = BitExtraction(res_normed_mant, IntCst(2*p+4))
    mant_lsb = BitExtraction(res_normed_mant, IntCst(2*p+5))
    # sticky bit: set when any bit below the rounding bit is set
    sticky_prec = ML_StdLogicVectorFormat(2*p+4)
    sticky_input = SubSignalSelection(
        res_normed_mant, 0, 2*p+3,
        precision = sticky_prec
    )
    sticky_bit = Select(
        Comparison(
            sticky_input,
            Constant(0, precision = sticky_prec),
            specifier = Comparison.NotEqual,
            precision = ML_Bool
        ),
        Constant(1, precision = ML_StdLogic),
        Constant(0, precision = ML_StdLogic),
        precision = ML_StdLogic,
        tag = "sticky_bit",
        debug = debug_std
    )

    # increment selection for rounding to nearest (tie to even)
    round_increment_RN = BitLogicAnd(
        round_bit,
        BitLogicOr(
            sticky_bit,
            mant_lsb,
            precision = ML_StdLogic
        ),
        precision = ML_StdLogic,
        tag = "round_increment_RN",
        debug = debug_std
    )

    rounded_mant = UnsignedAddition(
        zext(pre_mant_field, 1),
        round_increment_RN,
        precision = ML_StdLogicVectorFormat(p),
        tag = "rounded_mant",
        debug = debug_std
    )
    # top bit of the rounded mantissa: set when the increment carried out of
    # the mantissa field
    rounded_overflow = BitExtraction(
        rounded_mant,
        IntCst(p-1),
        tag = "rounded_overflow",
        debug = debug_std
    )
    res_mant_field = Select(
        Comparison(
            rounded_overflow,
            Constant(1, precision = ML_StdLogic),
            specifier = Comparison.Equal,
            precision = ML_Bool
        ),
        SubSignalSelection(rounded_mant, 1, p-1),
        SubSignalSelection(rounded_mant, 0, p-2),
        precision = ML_StdLogicVectorFormat(p-1),
        tag = "final_mant",
        debug = debug_std
    )

    # result exponent = exp_vx + (3 + p) - add_lzc + rounded_overflow,
    # computed on two extra bits and then truncated to the exponent width
    res_exp_prec_size = base_precision.get_exponent_size() + 2
    res_exp_prec = ML_StdLogicVectorFormat(res_exp_prec_size)

    res_exp_ext = UnsignedAddition(
        UnsignedSubtraction(
            UnsignedAddition(
                zext(exp_vx, 2),
                Constant(3+p, precision = res_exp_prec),
                precision = res_exp_prec
            ),
            zext(add_lzc, res_exp_prec_size - lzc_width),
            precision = res_exp_prec
        ),
        rounded_overflow,
        precision = res_exp_prec,
        tag = "res_exp_ext",
        debug = debug_std
    )

    res_exp = Truncate(res_exp_ext, precision = ML_StdLogicVectorFormat(base_precision.get_exponent_size()), tag = "res_exp", debug = debug_dec)

    vr_out = TypeCast(
        FloatBuild(
            res_sign,
            res_exp,
            res_mant_field,
            precision = base_precision,
        ),
        precision = io_precision,
        tag = "result",
        debug = debug_std
    )

    self.implementation.add_output_signal("vr_out", vr_out)

    return [self.implementation]
def generate_scheme(self):
    """Produce an abstract scheme for the logarithm.

    This abstract scheme will be used by the code generation backend.

    Outline of the graph built below:
      1. constants and the (hi, lo) interleaved log table are created;
      2. unless ``self.no_subnormal`` is set, the input is renormalised with
         integer operations so that subnormals are handled uniformly;
      3. a reciprocal approximation ``rcp_m`` of the mantissa is obtained
         (division, ReciprocalSeed or table look-up depending on
         ``self.force_division`` / ``self.no_rcp``) and rounded to
         ``table_index_size`` bits (``ri_fast_rndn``);
      4. ``u = m * ri - 1`` is computed, a polynomial approximating
         ``log_radix(1 + u)`` is generated and evaluated;
      5. exponent contribution, polynomial and table value are added with
         double-word operators and the high word is returned.

    Returns:
        A ``Return`` node wrapping the high word of the result.
    """
    if self.precision not in [ML_Binary32, ML_Binary64]:
        Log.report(Log.Error, "The demanded precision is not supported")

    vx = self.implementation.add_input_variable("x", self.precision)

    # Boolean-to-integer conversion: "true" is encoded as -1 for vector
    # backends and as 1 otherwise.
    def default_bool_convert(optree, precision=None, **kw):
        return bool_convert(optree, precision, -1, 0, **kw) \
            if isinstance(self.processor, VectorBackend) \
            else bool_convert(optree, precision, 1, 0, **kw)

    precision = self.precision.sollya_object
    int_prec = self.precision.get_integer_format()
    Log.report(Log.Info, "int_prec is %s" % int_prec)
    uint_prec = self.precision.get_unsigned_integer_format()

    Log.report(Log.Info, "MDL constants")
    cgpe_scheme_idx = int(self.cgpe_index)
    table_index_size = int(self.tbl_index_size)
    #
    table_nb_elements = 2**(table_index_size)
    table_dimensions = [2*table_nb_elements]  # two values are stored for each element
    field_size = Constant(self.precision.get_field_size(),
                          precision = int_prec,
                          tag = 'field_size')
    if self.log_radix == EXP_1:
        log2_hi = Constant(
            round(log(2), precision, sollya.RN),
            precision = self.precision,
            tag = 'log2_hi')
        log2_lo = Constant(
            round(log(2) - round(log(2), precision, sollya.RN),
                  precision, sollya.RN),
            precision = self.precision,
            tag = 'log2_lo')
    elif self.log_radix == 10:
        log2_hi = Constant(
            round(log10(2), precision, sollya.RN),
            precision = self.precision,
            tag = 'log2_hi')
        log2_lo = Constant(
            round(log10(2) - round(log10(2), precision, sollya.RN),
                  precision, sollya.RN),
            precision = self.precision,
            tag = 'log2_lo')
    # ... if log_radix == '2' then log2(2) == 1

    # subnormal_mask aims at trapping positive subnormals except zero.
    # That's why we will subtract 1 to the integer bitstring of the input, and
    # then compare for Less (strict) the resulting integer bitstring to this
    # mask, e.g. 0x7fffff for binary32.
    if self.no_subnormal == False:
        subnormal_mask = Constant((1 << self.precision.get_field_size()) - 1,
                                  precision = int_prec,
                                  tag = 'subnormal_mask')
    fp_one = Constant(1.0, precision = self.precision, tag = 'fp_one')
    fp_one_as_uint = TypeCast(fp_one, precision = uint_prec,
                              tag = 'fp_one_as_uint')
    int_zero = Constant(0, precision = int_prec, tag = 'int_zero')
    int_one = Constant(1, precision = int_prec, tag = 'int_one')
    # Half a unit in the last place of a mantissa truncated to
    # table_index_size bits; added before masking to round to nearest.
    table_mantissa_half_ulp = Constant(
        1 << (self.precision.field_size - table_index_size - 1),
        precision = int_prec
    )
    # Mask clearing every mantissa bit below the table index bits.
    table_s_exp_index_mask = Constant(
        ~((table_mantissa_half_ulp.get_value() << 1) - 1),
        precision = uint_prec
    )

    Log.report(Log.Info, "MDL table")
    # The table holds approximations of -log(2^tau * r_i) so we first compute
    # the index value for which tau changes from 1 to 0.
    cut = sqrt(2.)
    tau_index_limit = floor(table_nb_elements * (2./cut - 1))
    sollya_logtbl = [
        (-log1p(float(i) / table_nb_elements)
         + (0 if i <= tau_index_limit else log(2.))) / log(self.log_radix)
        for i in range(table_nb_elements)
    ]
    # ...
    init_logtbl_hi = [
        round(sollya_logtbl[i],
              self.precision.get_mantissa_size(),
              sollya.RN)
        for i in range(table_nb_elements)
    ]
    init_logtbl_lo = [
        round(sollya_logtbl[i] - init_logtbl_hi[i],
              self.precision.get_mantissa_size(),
              sollya.RN)
        for i in range(table_nb_elements)
    ]
    # Interleave the two lists: [hi_0, lo_0, hi_1, lo_1, ...]
    init_logtbl = [tmp[i]
                   for i in range(len(init_logtbl_hi))
                   for tmp in [init_logtbl_hi, init_logtbl_lo]]
    log1p_table = ML_NewTable(dimensions = table_dimensions,
                              storage_precision = self.precision,
                              init_data = init_logtbl,
                              tag = 'ml_log1p_table')

    # ...
    # The reciprocal table is only built when no_rcp is set; it is read by the
    # table look-up branch further down.
    if self.no_rcp:
        sollya_rcptbl = [
            (1/((1+float(i)/table_nb_elements)+2**(-1-int(self.tbl_index_size))))
            for i in range(table_nb_elements)
        ]
        init_rcptbl = [
            round(sollya_rcptbl[i],
                  int(self.tbl_index_size)+1,  # self.precision.get_mantissa_size(),
                  sollya.RN)
            for i in range(table_nb_elements)
        ]
        rcp_table = ML_NewTable(dimensions = [table_nb_elements],
                                storage_precision = self.precision,
                                init_data = init_rcptbl,
                                tag = 'ml_rcp_table')
    # ...

    Log.report(Log.Info, 'MDL unified subnormal handling')
    vx_as_int = TypeCast(vx, precision = int_prec, tag = 'vx_as_int')
    if self.no_subnormal == False:
        vx_as_uint = TypeCast(vx, precision = uint_prec, tag = 'vx_as_uint')
        # Avoid the 0.0 case by subtracting 1 from vx_as_int
        tmp = Comparison(vx_as_int - 1, subnormal_mask,
                         specifier = Comparison.Less)
        is_subnormal = default_bool_convert(
            tmp,  # Will catch negative values as well as NaNs with sign bit set
            precision = int_prec)
        is_subnormal.set_attributes(tag = "is_subnormal")
        # For non-vector backends the boolean is 0/1; negate it so that it can
        # be used as an all-zeros / all-ones bit mask below.
        if not(isinstance(self.processor, VectorBackend)):
            is_subnormal = Subtraction(Constant(0, precision = int_prec),
                                       is_subnormal,
                                       precision = int_prec)

        #################################################
        # Vectorizable integer based subnormal handling #
        #################################################
        # 1. lzcnt
        # custom lzcount-like for subnormal numbers using FPU (see draft article)
        Zi = BitLogicOr(vx_as_uint, fp_one_as_uint, precision = uint_prec, tag="Zi")
        Zf = Subtraction(
            TypeCast(Zi, precision = self.precision),
            fp_one,
            precision = self.precision,
            tag="Zf")
        # Zf exponent is -(nlz(x) - exponent_size).
        # 2. compute shift value
        # Vectorial comparison on x86+sse/avx is going to look like
        # '|0x00|0xff|0x00|0x00|' and that's why we use Negate.
        # But for scalar code generation, comparison will rather be either 0 or 1
        # in C. Thus mask below won't be correct for a scalar implementation.
        # FIXME: Can we know the backend that will be called and choose in
        # consequence? Should we make something arch-agnostic instead?
        #
        n_value = BitLogicAnd(
            Addition(
                DirtyExponentExtraction(Zf, self.precision),
                Constant(
                    self.precision.get_bias(),
                    precision = int_prec),
                precision = int_prec),
            is_subnormal,
            precision = int_prec,
            tag = "n_value")
        alpha = Negation(n_value, tag="alpha")
        #
        # 3. shift left
        # renormalized_mantissa = BitLogicLeftShift(vx_as_int, value)
        normal_vx_as_int = BitLogicLeftShift(vx_as_int, alpha)
        # 4. set exponent to the right value
        # Compute the exponent to add : (p-1)-(value) + 1 = p-1-value
        # The final "+ 1" comes from the fact that once renormalized, the
        # floating-point datum has a biased exponent of 1
        #tmp0 = Subtraction(
        #        field_size,
        #        value,
        #        precision = int_prec,
        #        tag="tmp0")
        # Set the value to 0 if the number is not subnormal
        #tmp1 = BitLogicAnd(tmp0, is_subnormal)
        #renormalized_exponent = BitLogicLeftShift(
        #        tmp1,
        #        field_size
        #        )
    else:  # no_subnormal == True
        normal_vx_as_int = vx_as_int

    #normal_vx_as_int = renormalized_mantissa + renormalized_exponent
    normal_vx = TypeCast(normal_vx_as_int, precision = self.precision,
                         tag = 'normal_vx')

    # alpha = BitLogicAnd(field_size, is_subnormal, tag = 'alpha')
    # XXX Extract the mantissa, see if this is supported in the x86 vector
    # backend or if it still uses the support_lib.
    vx_mantissa = MantissaExtraction(normal_vx, precision = self.precision)

    Log.report(Log.Info, "MDL scheme")
    if self.force_division == True:
        rcp_m = Division(fp_one, vx_mantissa, precision = self.precision)
    elif self.no_rcp == False:
        rcp_m = ReciprocalSeed(vx_mantissa, precision = self.precision)
        if not self.processor.is_supported_operation(rcp_m):
            if self.precision == ML_Binary64:
                # Try using a binary32 FastReciprocal
                binary32_m = Conversion(vx_mantissa, precision = ML_Binary32)
                rcp_m = ReciprocalSeed(binary32_m, precision = ML_Binary32)
                rcp_m = Conversion(rcp_m, precision = ML_Binary64)
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; confirm this fallback sits at this level.
            if not self.processor.is_supported_operation(rcp_m):
                # FIXME An approximation table could be used instead but for vector
                # implementations another GATHER would be required.
                # However this may well be better than a division...
                rcp_m = Division(fp_one, vx_mantissa, precision = self.precision)
    else:
        # ... use a look-up table
        rcp_shift = BitLogicLeftShift(normal_vx_as_int,
                                      self.precision.get_exponent_size() + 1)
        rcp_idx = BitLogicRightShift(rcp_shift,
                                     self.precision.get_exponent_size() + 1 + self.precision.get_field_size() - int(self.tbl_index_size))
        rcp_m = TableLoad(rcp_table, rcp_idx, tag = 'rcp_idx', debug = debug_multi)
    #
    # NOTE(review): reconstructed from whitespace-collapsed source; confirm the
    # call below is live code and not a commented-out line.
    rcp_m.set_attributes(tag = 'rcp_m')

    # exponent is normally either 0 or -1, since m is in [1, 2). Possible
    # optimization?
    # exponent = ExponentExtraction(rcp_m, precision = self.precision,
    #         tag = 'exponent')

    # Round rcp_m to table_index_size mantissa bits: add half an ulp of the
    # truncated format, then clear the low bits.
    ri_round = TypeCast(
        Addition(
            TypeCast(rcp_m, precision = int_prec),
            table_mantissa_half_ulp,
            precision = int_prec
        ),
        precision = uint_prec
    )
    ri_fast_rndn = BitLogicAnd(
        ri_round,
        table_s_exp_index_mask,
        tag = 'ri_fast_rndn',
        precision = uint_prec
    )
    # u = m * ri - 1
    ul = None  # low word of u; only set when u needs two words
    if self.no_rcp == True:
        # ... u does not fit on a single word
        tmp_u, tmp_ul = Mul211(vx_mantissa,
                               TypeCast(ri_fast_rndn, precision = self.precision),
                               fma = (self.no_fma == False))
        fp_minus_one = Constant(-1.0, precision = self.precision, tag = 'fp_minus_one')
        u, ul = Add212(fp_minus_one, tmp_u, tmp_ul)
        u.set_attributes(tag='uh')
        ul.set_attributes(tag='ul')
    elif self.no_fma == False:
        u = FusedMultiplyAdd(
            vx_mantissa,
            TypeCast(ri_fast_rndn, precision = self.precision),
            fp_one,
            specifier = FusedMultiplyAdd.Subtract,
            tag = 'u')
    else:
        # disable FMA
        # tmph + tmpl = m * ri, where tmph ~ 1
        tmph, tmpl = Mul211(vx_mantissa,
                            TypeCast(ri_fast_rndn, precision = self.precision),
                            fma = False)
        # u_tmp = tmph - 1 ... exact due to Sterbenz
        u_tmp = Subtraction(tmph, fp_one, precision = self.precision)
        # u = u_tmp - tmpl ... exact since the result u is representable as a single word
        u = Addition(u_tmp, tmpl, precision = self.precision, tag = 'u')

    unneeded_bits = Constant(
        self.precision.field_size - table_index_size,
        precision=uint_prec,
        tag="unneeded_bits"
    )
    assert self.precision.field_size - table_index_size >= 0
    ri_bits = BitLogicRightShift(
        ri_fast_rndn,
        unneeded_bits,
        precision = uint_prec,
        tag = "ri_bits"
    )
    # Retrieve mantissa's MSBs + first bit of exponent, for tau computation in case
    # exponent is 0 (i.e. biased 127, i.e. first bit of exponent is set.).
    # In this particular case, i = 0 but tau is 1
    # table_index does not need to be as long as uint_prec might be,
    # try and keep it the size of size_t.
    size_t_prec = ML_UInt32
    signed_size_t_prec = ML_Int32
    table_index_mask = Constant(
        (1 << (table_index_size + 1)) - 1,
        precision = size_t_prec
    )
    table_index = BitLogicAnd(
        Conversion(ri_bits, precision = size_t_prec),
        table_index_mask,
        tag = 'table_index',
        precision = size_t_prec
    )
    # Compute tau using the tau_index_limit value.
    # The comparison direction depends on the backend because the boolean
    # encoding differs (see default_bool_convert above).
    tmp = default_bool_convert(
        Comparison(
            TypeCast(table_index, precision = signed_size_t_prec),
            Constant(tau_index_limit, precision = signed_size_t_prec),
            specifier = Comparison.Greater
            if isinstance(self.processor, VectorBackend)
            else Comparison.LessOrEqual
        ),
        precision = signed_size_t_prec,
        tag="tmp"
    )
    # A true tmp will typically be -1 for VectorBackends, but 1 for standard C.
    tau = Conversion(
        Addition(tmp, Constant(1, precision=signed_size_t_prec),
                 precision = signed_size_t_prec, tag="pre_add")
        if isinstance(self.processor, VectorBackend)
        else tmp,
        precision=int_prec,
        tag="pre_tau"
    )
    tau.set_attributes(tag = 'tau')
    # Update table_index: keep only table_index_size bits
    table_index_hi = BitLogicAnd(
        table_index,
        Constant((1 << table_index_size) - 1, precision = size_t_prec),
        precision = size_t_prec
    )
    # table_index_hi = table_index_hi << 1
    table_index_hi = BitLogicLeftShift(
        table_index_hi,
        Constant(1, precision = size_t_prec),
        precision = size_t_prec,
        tag = "table_index_hi"
    )
    # table_index_lo = table_index_hi + 1
    table_index_lo = Addition(
        table_index_hi,
        Constant(1, precision = size_t_prec),
        precision = size_t_prec,
        tag = "table_index_lo"
    )

    tbl_hi = TableLoad(log1p_table, table_index_hi, tag = 'tbl_hi',
                       debug = debug_multi)
    tbl_lo = TableLoad(log1p_table, table_index_lo, tag = 'tbl_lo',
                       debug = debug_multi)
    # Compute exponent e + tau - alpha, but first subtract the bias.
    if self.no_subnormal == False:
        tmp_eptau = Addition(
            Addition(
                BitLogicRightShift(
                    normal_vx_as_int,
                    field_size,
                    tag = 'exponent',
                    interval = self.precision.get_exponent_interval(),
                    precision = int_prec),
                Constant(
                    self.precision.get_bias(),
                    precision = int_prec)),
            tau,
            tag = 'tmp_eptau',
            precision = int_prec)
        exponent = Subtraction(tmp_eptau, alpha, precision = int_prec)
    else:
        # alpha does not exist when subnormal handling is disabled
        exponent = Addition(
            Addition(
                BitLogicRightShift(
                    normal_vx_as_int,
                    field_size,
                    tag = 'exponent',
                    interval = self.precision.get_exponent_interval(),
                    precision = int_prec),
                Constant(
                    self.precision.get_bias(),
                    precision = int_prec)),
            tau,
            tag = 'tmp_eptau',
            precision = int_prec)
    #
    fp_exponent = Conversion(exponent, precision = self.precision,
                             tag = 'fp_exponent')

    Log.report(Log.Info, 'MDL polynomial approximation')
    if self.log_radix == EXP_1:
        sollya_function = log(1 + sollya.x)
    elif self.log_radix == 2:
        sollya_function = log2(1 + sollya.x)
    elif self.log_radix == 10:
        sollya_function = log10(1 + sollya.x)
    # ...
    # boundrcp: bound on the relative error of the reciprocal approximation.
    if self.force_division == True:
        # rcp accuracy is 2^(-p)
        boundrcp = 2**(-self.precision.get_precision())
    else:
        boundrcp = 1.5 * 2**(-12)  # ... see Intel intrinsics guide
        # NOTE(review): if/else pairing below reconstructed from
        # whitespace-collapsed source; confirm against the original file.
        if self.precision in [ML_Binary64]:
            if not self.processor.is_supported_operation(rcp_m):
                boundrcp = (1+boundrcp)*(1+2**(-24)) - 1
            else:
                boundrcp = 2**(-14)  # ... see Intel intrinsics guide
    arg_red_mag = boundrcp + 2**(-table_index_size-1) + boundrcp * 2**(-table_index_size-1)
    if self.no_rcp == False:
        approx_interval = Interval(-arg_red_mag, arg_red_mag)
    else:
        approx_interval = Interval(-2**(-int(self.tbl_index_size)+1),2**(-int(self.tbl_index_size)+1))
    max_eps = 2**-(2*(self.precision.get_field_size()))
    Log.report(Log.Info, "max acceptable error for polynomial = {}".format(float.hex(max_eps)))
    poly_degree = sup(
        guessdegree(
            sollya_function,
            approx_interval,
            max_eps,
        )
    )
    # NOTE(review): the message has no "{}" placeholder; confirm that
    # Log.report actually displays the extra argument.
    Log.report(Log.Info, "poly degree is ", poly_degree)
    if self.log_radix == EXP_1:
        poly_object = Polynomial.build_from_approximation(
            sollya_function,
            range(2, int(poly_degree) + 1),  # Force 1st 2 coeffs to 0 and 1, resp.
            # Emulate double-self.precision coefficient formats
            [self.precision.get_mantissa_size()*2 + 1]*(poly_degree - 1),
            approx_interval,
            sollya.absolute,
            0 + sollya._x_)  # Force the first 2 coefficients to 0 and 1, resp.
    else:
        # ... == '2' or '10'
        poly_object = Polynomial.build_from_approximation(
            sollya_function,
            range(1, int(poly_degree) + 1),  # Force 1st coeff to 0
            # Emulate double-self.precision coefficient formats
            [self.precision.get_mantissa_size()*2 + 1]*(poly_degree),
            approx_interval,
            sollya.absolute,
            0)  # Force the first coefficients to 0

    Log.report(Log.Info, str(poly_object))

    constant_precision = ML_SingleSingle if self.precision == ML_Binary32 \
        else ML_DoubleDouble if self.precision == ML_Binary64 \
        else None
    if is_cgpe_available():
        log1pu_poly = PolynomialSchemeEvaluator.generate_cgpe_scheme(
            poly_object,
            u,
            unified_precision = self.precision,
            constant_precision = constant_precision, scheme_id = cgpe_scheme_idx
        )
    else:
        Log.report(Log.Warning,
                   "CGPE not available, falling back to std poly evaluator")
        log1pu_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object,
            u,
            unified_precision = self.precision,
            constant_precision = constant_precision
        )

    # XXX Dirty implementation of double-(self.precision) poly
    def dirty_poly_node_conversion(node, variable_h, variable_l, use_fma):
        return dirty_multi_node_expand(
            node, self.precision, mem_map={variable_h: (variable_h, variable_l)}, fma=use_fma)
    log1pu_poly_hi, log1pu_poly_lo = dirty_poly_node_conversion(log1pu_poly, u, ul,
                                                                 use_fma=(self.no_fma == False))

    log1pu_poly_hi.set_attributes(tag = 'log1pu_poly_hi')
    log1pu_poly_lo.set_attributes(tag = 'log1pu_poly_lo')

    # Compute log(2) * (e + tau - alpha)
    if self.log_radix != 2:  # 'e' or '10'
        log2e_hi, log2e_lo = Mul212(fp_exponent, log2_hi, log2_lo,
                                    fma = (self.no_fma == False))

    # Add log1p(u)
    if self.log_radix != 2:  # 'e' or '10'
        tmp_res_hi, tmp_res_lo = Add222(log2e_hi, log2e_lo,
                                        log1pu_poly_hi, log1pu_poly_lo)
    else:
        tmp_res_hi, tmp_res_lo = Add212(fp_exponent,
                                        log1pu_poly_hi, log1pu_poly_lo)

    # Add -log(2^(tau)/m) approximation retrieved by two table lookups
    logx_hi = Add122(tmp_res_hi, tmp_res_lo, tbl_hi, tbl_lo)[0]
    logx_hi.set_attributes(tag = 'logx_hi')

    scheme = Return(logx_hi, precision = self.precision)

    return scheme
def generate_scheme(self):
    """Build a small operation graph and self-check interval evaluation.

    Every node of interest is registered together with the interval it is
    expected to cover; ``evaluate_range`` is then run on a fixed list of
    nodes and its result is asserted to be non-None and equal to the
    registered expectation.

    Returns:
        A one-element list containing ``self.implementation``.
    """
    integer_bits = 3
    fraction_bits = self.width - integer_bits
    in_format = fixed_point(integer_bits, fraction_bits)
    out_format = fixed_point(integer_bits, fraction_bits)

    # node -> interval that evaluate_range is expected to return
    reference = {}

    def register(node, bounds):
        """Record the expected interval of ``node`` and hand the node back."""
        reference[node] = bounds
        return node

    # --- fixed-point input signals, each with a declared interval ---------
    sig_x = self.implementation.add_input_signal("x", in_format)
    range_x = Interval(-10.3,10.7)
    sig_x.set_interval(range_x)
    register(sig_x, range_x)

    sig_y = self.implementation.add_input_signal("y", in_format)
    range_y = Interval(-17.9,17.2)
    sig_y.set_interval(range_y)
    register(sig_y, range_y)

    sig_z = self.implementation.add_input_signal("z", in_format)
    range_z = Interval(-7.3,7.7)
    sig_z.set_interval(range_z)
    register(sig_z, range_z)

    # --- constant and rounding nodes --------------------------------------
    const_node = register(Constant(42.5, tag = "cst"), Interval(42.5))
    ceil_node = register(Ceil(sig_x, tag = "ceil"), sollya.ceil(range_x))
    floor_node = register(Floor(sig_y, tag = "floor"), sollya.floor(range_y))

    # --- arithmetic nodes -------------------------------------------------
    product = sig_z * sig_x
    product.set_tag("mult")
    range_product = range_z * range_x
    register(product, range_product)

    wide_sum = (sig_x + sig_y) - product
    wide_sum.set_attributes(tag = "large_add")
    range_wide_sum = (range_x + range_y) - range_product
    register(wide_sum, range_wide_sum)

    lzc_node = register(
        CountLeadingZeros(sig_x, tag="var_x_lzc"),
        Interval(0, in_format.get_bit_size()))

    clamped = Max(0, Min(wide_sum, 13))
    clamped.set_tag("reduced_result")
    range_clamped = interval_max(
        Interval(0),
        interval_min(range_wide_sum, Interval(13)))
    register(clamped, range_clamped)

    chosen = Select(sig_x > sig_y, clamped, sig_z, tag = "select_result")
    register(chosen, interval_union(range_clamped, range_z))

    # --- floating-point operation on mantissa and exponents ---------------
    range_fp = Interval(-0.01, 100)
    fp_input = Variable("fp_x", precision=ML_Binary32, interval=range_fp)
    mantissa_node = MantissaExtraction(fp_input, tag="mant_fp_x", precision=ML_Binary32)
    exponent_node = ExponentExtraction(fp_input, tag="exp_fp_x", precision=ML_Int32)
    exp_insert_node = ExponentInsertion(exponent_node, tag="ins_exp_fp_x", precision=ML_Binary32)

    register(fp_input, range_fp)
    range_exponent = Interval(
        sollya.floor(sollya.log2(sollya.inf(abs(range_fp)))),
        sollya.floor(sollya.log2(sollya.sup(abs(range_fp)))))
    register(exponent_node, range_exponent)
    register(mantissa_node, Interval(1, 2))
    register(
        exp_insert_node,
        Interval(S2**sollya.inf(range_exponent), S2**sollya.sup(range_exponent)))

    # --- checking interval evaluation -------------------------------------
    nodes_to_check = [
        lzc_node, exponent_node, fp_input, mantissa_node, exp_insert_node,
        const_node, sig_x, sig_y, product, wide_sum, clamped, chosen,
        ceil_node, floor_node,
    ]
    for node in nodes_to_check:
        computed = evaluate_range(node)
        wanted = reference[node]
        print("{}: {}".format(node.get_tag(), computed))
        print(" vs expected {}".format(wanted))
        assert computed is not None
        assert computed == wanted

    return [self.implementation]
def generate_argument_reduction(self, memory_limit):
    """Return the argument-reduction parameters used by the scheme.

    As written, this function evaluates one hard-coded parameter set
    (``eval_argument_reduction(6, 10, 12, 13)``), adds the 'sizeof_tables',
    'degree_poly1' (4) and 'degree_poly2' (8) entries and returns it
    immediately.  ``memory_limit`` is not consulted on that path.

    Everything after the first ``return`` is an exhaustive parameter search
    that is currently unreachable; it is kept as reference.

    NOTE(review): the block uses Python 2 syntax (``xrange``, ``<>``, print
    statements).
    """
    best_arg_reduc = None

    # Hard-coded shortcut: skip the search below and use fixed parameters.
    best_arg_reduc = self.eval_argument_reduction(6,10,12,13)
    best_arg_reduc['sizeof_tables'] = best_arg_reduc['sizeof_table1'] + best_arg_reduc['sizeof_table2']
    best_arg_reduc['degree_poly1'] = 4
    best_arg_reduc['degree_poly2'] = 8
    return best_arg_reduc

    # ----- unreachable from here on (see the return above) -----------------
    # iterate through all possible parameters, and return the best argument reduction
    # the order of importance of the characteristics of a good argument reduction is:
    #   1- the argument reduction is valid
    #   2- the degrees of the polynomials obtained are minimal
    #   3- the memory used is minimal
    # An argument reduction is valid iff:
    #   - the memory used is less than memory_limit
    #   - y-1 and z-1 fit into a uint64_t
    #   - the second argument reduction should be useful (ie: it should add at least 1 bit to the argument reduction)
    # From those validity constraints we deduce some bounds on the parameters to reduce the space of values searched:
    # (note that those bounds are implied by, but not equivalent to, the constraints)
    #   size1 <= log2(memory_limit/17)                                       (memory_limit on the first table)
    #   prec1 < 13 + size1                                                   (y-1 fits into a uint64_t)
    #   size2 <= log2((memory_limit - sizeof_table1)/17/midinterval)         (memory_limit on both tables)
    #   size2 >= 1 - log2(midinterval)                                       (second arg red should be useful)
    #   prec2 < 12 - prec1 - log2((y-y1)/y1), for all possible y             (z-1 fits into a uint64_t)
    # note: it is hard to deduce a tight bound on prec2 from the last inequality
    # a good approximation is size2 ~= max[for y]( - log2((y-y1)/y1)), but using it may eliminate valid arg reduc

    #self.eval_argument_reduction(12, 20, 22, 14)

    min_size1 = 1
    max_size1 = floor(log(memory_limit/17)/log(2)).getConstantAsInt()
    for size1 in xrange(max_size1, min_size1-1, -1):

        min_prec1 = size1
        max_prec1 = 12 + size1
        for prec1 in xrange(min_prec1,max_prec1+1):

            # we need sizeof_table1 and mid_interval for the bound on size2 and prec2
            first_arg_reduc = self.eval_argument_reduction(size1, prec1, prec1, prec1)
            mid_interval = first_arg_reduc['mid_interval']
            sizeof_table1 = first_arg_reduc['sizeof_table1']

            if not(0 <= inf(mid_interval) and sup(mid_interval) < S2**(64 - 52 - prec1)):
                continue
            if not(first_arg_reduc['sizeof_table1'] < memory_limit):
                continue

            min_size2 = 1 - ceil(log(sup(mid_interval))/log(2)).getConstantAsInt()
            max_size2 = floor(log((memory_limit - sizeof_table1)/(17 * sup(mid_interval)))/log(2)).getConstantAsInt()
            # during execution of the prec2 loop, it can reduce the interval of valid values for prec2
            # so min_prec2 and max_prec2 are set here and not before the prec2 loop
            # (because they are modified inside the body of the loop, for the next iteration of size2)
            min_prec2 = 0
            max_prec2 = 12 + max_size2 - prec1
            for size2 in xrange(max_size2,min_size2-1,-1):

                max_prec2 = min(max_prec2, 12 + size2 - prec1)
                for prec2 in xrange(max_prec2,min_prec2-1,-1):

                    #print '=====\t\033[1m{}\033[0m({}/{}),\t\033[1m{}\033[0m({}/{}),\t\033[1m{}\033[0m({}/{}),\t\033[1m{}\033[0m({}/{})\t====='.format(size1,min_size1,max_size1,prec1,min_prec1,max_prec1,size2,min_size2,max_size2,prec2,min_prec2,max_prec2)
                    #print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss #memory used by the program

                    arg_reduc = self.eval_argument_reduction(size1, prec1, size2, prec2)
                    mid_interval = arg_reduc['mid_interval']
                    out_interval = arg_reduc['out_interval']
                    sizeof_tables = arg_reduc['sizeof_table1'] + arg_reduc['sizeof_table2']
                    # out_interval too large for this prec2: tighten the upper
                    # bound used by the following size2 iterations.
                    if not(0 <= inf(out_interval) and sup(out_interval) < S2**(64-52-prec1-prec2)):
                        max_prec2 = prec2 - 1
                        continue
                    if memory_limit < sizeof_tables:
                        continue
                    #assert(prec2 < 12 + size2 - prec1) # test the approximation size2 ~= max[for y]( - log2((y-y1)/y1))

                    # guess the degree of the two polynomials (relative error <= 2^-52 and absolute error <= 2^-120)
                    # note: we exclude zero from out_interval to not perturb sollya (log(1+x)/x is not well defined on 0)
                    sollya_out_interval = Interval(S2**(-52-prec1-prec2), sup(out_interval))
                    guess_degree_poly1 = guessdegree(log(1+sollya.x)/sollya.x, sollya_out_interval, S2**-52)
                    guess_degree_poly2 = guessdegree(log(1+sollya.x), sollya_out_interval, S2**-120)
                    # TODO: detect when guessdegree return multiple possible degree, and find the right one
                    if False and inf(guess_degree_poly1) <> sup(guess_degree_poly1):
                        print "improvable guess_degree_poly1:", guess_degree_poly1
                    if False and inf(guess_degree_poly2) <> sup(guess_degree_poly2):
                        print "improvable guess_degree_poly2:", guess_degree_poly2
                    degree_poly1 = sup(guess_degree_poly1).getConstantAsInt() + 1
                    degree_poly2 = sup(guess_degree_poly2).getConstantAsInt()

                    # Worse than the current best on either degree: stop
                    # exploring smaller prec2 values and raise the lower bound.
                    if ((best_arg_reduc is not None)
                        and (best_arg_reduc['degree_poly1'] < degree_poly1 or best_arg_reduc['degree_poly2'] < degree_poly2)):
                        min_prec2 = prec2 + 1
                        break

                    # Lexicographic comparison: degree_poly1, then
                    # degree_poly2, then total table size.
                    if ((best_arg_reduc is None)
                        or (best_arg_reduc['degree_poly1'] > degree_poly1)
                        or (best_arg_reduc['degree_poly1'] == degree_poly1 and best_arg_reduc['degree_poly2'] > degree_poly2)
                        or (best_arg_reduc['degree_poly1'] == degree_poly1 and best_arg_reduc['degree_poly2'] == degree_poly2 and best_arg_reduc['sizeof_tables'] > sizeof_tables)):
                        arg_reduc['degree_poly1'] = degree_poly1
                        arg_reduc['degree_poly2'] = degree_poly2
                        arg_reduc['sizeof_tables'] = sizeof_tables
                        best_arg_reduc = arg_reduc
                        #print "\n --new best-- \n", arg_reduc, "\n"
    #print "\nBest arg reduc: \n", best_arg_reduc, "\n"
    return best_arg_reduc
def generate_scalar_scheme(self, vx):
    """Build the scalar evaluation scheme of erf on ``abs(vx)``.

    The scheme is a cascade of four ranges, tested in this order:

    * ``abs_vx < eps``: a polynomial in odd powers approximating
      ``erf(x) - x``, recombined as ``poly * x + x`` with an FMA;
    * ``abs_vx < near_indexing.get_max_bound()``: piecewise polynomials
      produced by ``generic_poly_split`` over ``near_indexing``;
    * ``abs_vx < medium_indexing.get_max_bound()``: piecewise polynomials
      over ``medium_indexing``;
    * otherwise: the constant 1.0.

    Args:
        vx: input operation node.

    Returns:
        The ``ConditionBlock`` implementing the cascade.

    Raises:
        KeyError: if ``self.precision`` is neither ML_Binary32 nor
            ML_Binary64 (lookup of the empirical ``eps`` exponent).
    """
    abs_vx = Abs(vx, precision=self.precision)

    FCT_LIMIT = 1.0

    # Threshold search for erf against FCT_LIMIT in [1.0, 10.0]; only its
    # exponent is used, as the upper exponent of the medium indexing.
    one_limit = search_bound_threshold(sollya.erf, FCT_LIMIT, 1.0, 10.0, self.precision)
    one_limit_exp = int(sollya.floor(sollya.log2(one_limit)))
    Log.report(Log.Debug, "erf(x) = 1.0 limit is {}, with exp={}", one_limit, one_limit_exp)

    # empirical numbers
    eps_exp = {ML_Binary32: -3, ML_Binary64: -5}[self.precision]
    eps = S2**eps_exp

    # Error target shared by the degree guess and the piecewise splits.
    eps_target = S2**-(self.precision.get_field_size() + 5)

    Log.report(Log.Info, "building mathematical polynomial")
    approx_interval = Interval(0, eps)
    # The function approximated near zero is erf(x) - x; only odd monomials
    # are requested (degree list 1, 3, 5, ...).
    approx_fct = sollya.erf(sollya.x) - (sollya.x)

    poly_degree = int(sup(guessdegree(approx_fct, approx_interval, eps_target))) + 1

    poly_degree_list = list(range(1, poly_degree, 2))
    Log.report(Log.Debug, "poly_degree is {} and list {}", poly_degree, poly_degree_list)
    global_poly_object = Polynomial.build_from_approximation(
        approx_fct, poly_degree_list,
        [self.precision] * len(poly_degree_list),
        approx_interval, sollya.relative)
    Log.report(
        Log.Debug, "inform is {}",
        dirtyinfnorm(approx_fct - global_poly_object.get_sollya_object(),
                     approx_interval))
    # NOTE(review): presumably sub_poly(start_index=1, offset=1) yields
    # global_poly(x) / x, so that the FMA below rebuilds global_poly(x) + x;
    # confirm against Polynomial.sub_poly.
    poly_object = global_poly_object.sub_poly(start_index=1, offset=1)

    pre_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, abs_vx, unified_precision=self.precision)

    result = FMA(pre_poly, abs_vx, abs_vx)
    result.set_attributes(tag="result", debug=debug_multi)

    def offset_function(fct):
        """Return a builder mapping an offset to the expression fct(x + offset)."""
        return lambda offset: fct(sollya.x + offset)

    # NOTE(review): the field-size argument is the literal 6 for every
    # precision; confirm whether it should vary with self.precision.
    near_indexing = SubFPIndexing(eps_exp, 0, 6, self.precision)
    near_approx = generic_poly_split(offset_function(sollya.erf),
                                     near_indexing, eps_target,
                                     self.precision, abs_vx)
    near_approx.set_attributes(tag="near_approx", debug=debug_multi)

    medium_indexing = SubFPIndexing(1, one_limit_exp, 7, self.precision)
    medium_approx = generic_poly_split(offset_function(sollya.erf),
                                       medium_indexing, eps_target,
                                       self.precision, abs_vx)
    medium_approx.set_attributes(tag="medium_approx", debug=debug_multi)

    # approximation for positive values
    scheme = ConditionBlock(
        abs_vx < eps,
        Return(result),
        ConditionBlock(
            abs_vx < near_indexing.get_max_bound(),
            Return(near_approx),
            ConditionBlock(
                abs_vx < medium_indexing.get_max_bound(),
                Return(medium_approx),
                Return(Constant(1.0, precision=self.precision))
            )
        )
    )
    return scheme
def generate_scheme(self):
    """Build the evaluation scheme of a table-driven fixed-point logarithm.

    Steps performed: compute log(2) constants, obtain argument-reduction
    parameters from ``generate_argument_reduction``, fill two pairs of
    inverse/log tables, build the special-case handling, apply the two-level
    argument reduction in fixed-point formats and evaluate a first
    polynomial on the reduced argument.

    Returns:
        A ``ConditionBlock`` that runs the special-case handling when the
        input is not a normal positive number, and otherwise returns the
        first polynomial evaluation.

    NOTE(review): this block uses Python 2 syntax (print statements,
    ``xrange``) and looks like work in progress; several lines are flagged
    below for confirmation.
    """
    memory_limit = 2500

    # local overloading of RaiseReturn operation
    # (input_var is bound later in this function, before any call)
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = input_var
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    ### Constants computations ###

    # NOTE(review): the scaling exponents look swapped (2**-52 before
    # rounding, 2**52 after) — confirm the intended split of log(2).
    v_log2_hi = nearestint(log(2) * 2**-52) * 2**52
    v_log2_lo = round(log(2) - v_log2_hi, 64+53, sollya.RN)
    log2_hi = Constant(v_log2_hi, precision = self.precision, tag = "log2_hi")
    log2_lo = Constant(v_log2_lo, precision = self.precision, tag = "log2_lo")

    print "\n\033[1mSearch parameters for the argument reduction:\033[0m (this can take a while)"
    arg_reduc = self.generate_argument_reduction(memory_limit)

    print "\n\033[1mArgument reduction found:\033[0m [({},{}),({},{})] -> polynomials of degree {},{}, using {} bytes of memory".format(arg_reduc['size1'],arg_reduc['prec1'],arg_reduc['size2'],arg_reduc['prec2'],arg_reduc['degree_poly1'],arg_reduc['degree_poly2'],arg_reduc['sizeof_tables'])

    print "\n\033[1mGenerate the first logarithm table:\033[0m containing {} elements, using {} bytes of memory".format(arg_reduc['length_table1'], arg_reduc['sizeof_table1'])
    inv_table_1 = ML_Table(dimensions = [arg_reduc['length_table1']],
                           storage_precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec1'], False),
                           tag = self.uniquify_name("inv_table_1"))
    log_table_1 = ML_Table(dimensions = [arg_reduc['length_table1']],
                           storage_precision = ML_Custom_FixedPoint_Format(11, 128-11, False),
                           tag = self.uniquify_name("log_table_1"))
    # NOTE(review): the range stops at length_table1 - 2, so the last table
    # entry is never written — confirm.
    for i in xrange(0, arg_reduc['length_table1']-1):
        # NOTE(review): `S2*size1` here vs `S2**size2` in the second table
        # loop — presumably `**` was intended; confirm.
        x1 = 1 + i/S2*arg_reduc['size1']
        # NOTE(review): result is scaled by 2**prec1 after the ceil; a
        # fixed-point inverse would presumably use 2**-prec1 — confirm.
        inv_x1 = ceil(S2**arg_reduc['prec1']/x1)*S2**arg_reduc['prec1']
        log_x1 = floor(log(x1) * S2**(128-11))*S2**(11-128)
        inv_table_1[i] = inv_x1 #Constant(inv_x1, precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec1'], False))
        log_table_1[i] = log_x1 #Constant(log_x1, precision = ML_Custom_FixedPoint_Format(11, 128-11, False))

    print "\n\033[1mGenerate the second logarithm table:\033[0m containing {} elements, using {} bytes of memory".format(arg_reduc['length_table2'], arg_reduc['sizeof_table2'])
    inv_table_2 = ML_Table(dimensions = [arg_reduc['length_table2']],
                           storage_precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec2'], False),
                           tag = self.uniquify_name("inv_table_2"))
    log_table_2 = ML_Table(dimensions = [arg_reduc['length_table2']],
                           storage_precision = ML_Custom_FixedPoint_Format(11, 128-11, False),
                           tag = self.uniquify_name("log_table_2"))
    for i in xrange(0, arg_reduc['length_table2']-1):
        y1 = 1 + i/S2**arg_reduc['size2']
        # NOTE(review): y1 is computed but the next line divides by x1, the
        # value left over from the first table loop — confirm.
        inv_y1 = ceil(S2**arg_reduc['prec2']/x1) * S2**arg_reduc['prec2']
        # NOTE(review): takes log(inv_y1) whereas table 1 takes log(x1) —
        # confirm which is intended.
        log_y1 = floor(log(inv_y1) * S2**(128-11))*S2**(11-128)
        inv_table_2[i] = inv_y1 #Constant(inv_y1, precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec2'], False))
        log_table_2[i] = log_y1 #Constant(log_y1, precision = ML_Custom_FixedPoint_Format(11, 128-11, False))

    ### Evaluation Scheme ###

    print "\n\033[1mGenerate the evaluation scheme:\033[0m"
    input_var = self.implementation.add_input_variable("input_var", self.precision)
    ve = ExponentExtraction(input_var, tag = "x_exponent", debug = debugd)
    vx = MantissaExtraction(input_var, tag = "x_mantissa", precision = ML_Custom_FixedPoint_Format(0,52,False), debug = debug_lftolx)
    #vx = MantissaExtraction(input_var, tag = "x_mantissa", precision = self.precision, debug = debug_lftolx)

    print "filtering and handling special cases"
    # Anything that is not a normal positive number goes to the special path.
    test_is_special_cases = LogicalNot(Test(input_var, specifier = Test.IsIEEENormalPositive, likely = True, debug = debugd, tag = "is_special_cases"))
    handling_special_cases = Statement(
        ConditionBlock(
            Test(input_var, specifier = Test.IsSignalingNaN, debug = True),
            ExpRaiseReturn(ML_FPE_Invalid, return_value = FP_QNaN(self.precision))
        ),
        ConditionBlock(
            Test(input_var, specifier = Test.IsNaN, debug = True),
            Return(input_var)
        )#,
        # TODO: add tests for x == 0 (raise DivideByZero, return -Inf), x < 0 (raise InvalidOperation, return qNaN)
        # all that remains is x is a subnormal positive
        #Statement(
        #  ReferenceAssign(Dereference(ve), Subtraction(ve, Subtraction(CountLeadingZeros(input_var, tag = 'subnormal_clz', precision = ve.get_precision()), Constant(12, precision = ve.get_precision())))),
        #  ReferenceAssign(Dereference(vx), BitLogicLeftShift(vx, Addition(CountLeadingZeros(input_var, tag = 'subnormal_clz', precision = ve.get_precision()), Constant(1, precision = ve.get_precision()))))
        #)
    )

    print "doing the argument reduction"
    # First reduction: index table 1 with the top size1 bits of the mantissa.
    v_dx = vx
    v_x1 = Conversion(v_dx, tag = 'x1', precision = ML_Custom_FixedPoint_Format(0,arg_reduc['size1'],False), rounding_mode = ML_RoundTowardMinusInfty)
    v_index_x = TypeCast(v_x1, tag = 'index_x', precision = ML_Int32) #ML_Custom_FixedPoint_Format(v_x1.get_precision().get_c_bit_size(), 0, False))
    v_inv_x = TableLoad(inv_table_1, v_index_x, tag = 'inv_x')
    v_x = Addition(v_dx, 1, tag = 'x', precision = ML_Custom_FixedPoint_Format(1,52,False))
    v_dy = Multiplication(v_x, v_inv_x, tag = 'dy', precision = ML_Custom_FixedPoint_Format(0,52+arg_reduc['prec1'],False))
    # Second reduction: same pattern on dy with table 2.
    v_y1 = Conversion(v_dy, tag = 'y1', precision = ML_Custom_FixedPoint_Format(0,arg_reduc['size2'],False), rounding_mode = ML_RoundTowardMinusInfty)
    v_index_y = TypeCast(v_y1, tag = 'index_y', precision = ML_Int32) #ML_Custom_FixedPoint_Format(v_y1.get_precision().get_c_bit_size(), 0, False))
    v_inv_y = TableLoad(inv_table_2, v_index_y, tag = 'inv_y')
    v_y = Addition(v_dy, 1, tag = 'y', precision = ML_Custom_FixedPoint_Format(1,52+arg_reduc['prec2'],False))
    # note that we limit the number of bits used to represent dz to 64.
    # we proved during the arg reduction that we can do that (sup(out_interval) < 2^(64-52-prec1-prec2))
    v_dz = Multiplication(v_y, v_inv_y, tag = 'z', precision = ML_Custom_FixedPoint_Format(64-52-arg_reduc['prec1']-arg_reduc['prec2'],52+arg_reduc['prec1']+arg_reduc['prec2'],False))
    # reduce the number of bits used to represent dz. we can do that

    print "doing the first polynomial evaluation"
    global_poly1_object = Polynomial.build_from_approximation(log(1+sollya.x)/sollya.x, arg_reduc['degree_poly1']-1, [64] * (arg_reduc['degree_poly1']), arg_reduc['out_interval'], fixed, sollya.absolute)
    poly1_object = global_poly1_object.sub_poly(start_index = 1)
    print global_poly1_object
    print poly1_object
    poly1 = PolynomialSchemeEvaluator.generate_horner_scheme(poly1_object, v_dz, unified_precision = v_dz.get_precision())
    return ConditionBlock(test_is_special_cases, handling_special_cases, Return(poly1))

    #approx_interval = Interval(0, 27021597764222975*S2**-61)

    #poly_degree = 1+sup(guessdegree(log(1+x)/x, approx_interval, S2**-(self.precision.get_field_size())))
    #global_poly_object = Polynomial.build_from_approximation(log(1+x)/x, poly_degree, [1] + [self.precision]*(poly_degree), approx_interval, sollya.absolute)
    #poly_object = global_poly_object.sub_poly(start_index = 1)

    #_poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, _red_vx, unified_precision = self.precision)
    #_poly.set_attributes(tag = "poly", debug = debug_lftolx)
    # NOTE(review): the triple quote below opens a string literal that is not
    # closed inside this block; it is kept exactly as found.
    """
def generate_reduction_fptaylor(x):
    """Bound the evaluation error of the reduced sine scheme on interval ``x``.

    An FPTaylor query is built that models, with 64-bit rounding on every
    step, the absolute value of ``x``, the subtraction of ``k * n_pi``, the
    evaluation of ``poly_expr`` and the final sign corrections.  The round-off
    bounds reported by FPTaylor are then added to the approximation error of
    ``self.poly_object`` against ``sin`` on the reduced interval, obtained
    with ``sollya.supnorm``.

    Args:
        x: Sollya interval.  It must not straddle 0, and
            ``floor(abs(x) * n_invpi)`` must be identical at both endpoints.

    Returns:
        A ``(rel_err, abs_err)`` pair.  A component is ``float("inf")`` when
        no bound could be obtained for it.

    Raises:
        AssertionError: if ``x`` straddles 0 or yields two different values
            of ``k``.
    """
    # get sign and abs_x, must be the same at endpoints
    if sollya.sup(x) <= 0:
        sign_x_expr = "-1.0"
        abs_x_expr = "-x"
        abs_x = -x
    elif sollya.inf(x) >= 0:
        sign_x_expr = "1.0"
        abs_x_expr = "x"
        abs_x = x
    else:
        # raised explicitly so the check survives ``python -O``
        raise AssertionError("Interval must not straddle 0")

    # get k, must be the same at endpoints
    unround_k = abs_x * n_invpi
    k_low = sollya.floor(sollya.inf(unround_k))
    k_high = sollya.floor(sollya.sup(unround_k))
    if k_low != k_high:
        raise AssertionError("Interval must not straddle multples of pi")
    k = int(k_low)
    part = k % 2

    # reduced argument: distance of abs_x to the k-th multiple of n_pi
    r = abs_x - k * n_pi

    z_expr = "r"
    z = r

    # odd values of k flip the sign of the polynomial result
    if part == 1:
        flipped_poly_expr = "-poly"
    else:
        flipped_poly_expr = "poly"

    x_low = sollya.inf(x)
    x_high = sollya.sup(x)
    query = "\n".join([
        "Variables",
        "  real x in [{},{}];".format(x_low, x_high),
        "Definitions",
        "  abs_x rnd64= {};".format(abs_x_expr),
        "  whole rnd64= {} * {};".format(k, n_pi),
        "  r rnd64= abs_x - whole;",
        "  z rnd64= {};".format(z_expr),
        "  poly rnd64= {};".format(poly_expr),
        "  flipped_poly rnd64= {};".format(flipped_poly_expr),
        "  retval rnd64= flipped_poly*{};".format(sign_x_expr),
        "Expressions",
        "  retval;"
    ])

    rnd_rel_err = None
    rnd_abs_err = None
    # ``res`` is pre-bound so the KeyError handler below never reads an
    # unbound name when fptaylor.Result itself is the call that raised.
    res = None
    try:
        res = fptaylor.Result(query, {
            **config,
            "--rel-error": "true",
            "--abs-error": "true"
        })
        rnd_rel_err = float(
            res.result["relative_errors"]["final_total"]["value"])
        rnd_abs_err = float(
            res.result["absolute_errors"]["final_total"]["value"])
    except AssertionError:
        pass
    except KeyError:
        # the relative error entry is missing; the absolute one may still
        # be present in the same result
        if res is not None:
            try:
                rnd_abs_err = float(
                    res.result["absolute_errors"]["final_total"]["value"])
            except KeyError:
                pass

    if rnd_abs_err is None:
        # second attempt, asking for the absolute error only
        try:
            res = fptaylor.Result(query, {
                **config,
                "--rel-error": "false",
                "--abs-error": "true"
            })
            rnd_abs_err = float(
                res.result["absolute_errors"]["final_total"]["value"])
        except AssertionError:
            pass

    # approximation error of the polynomial against sin on the reduced range
    err_int = sollya.supnorm(self.poly_object.get_sollya_object(),
                             sollya.sin(sollya.x), z, sollya.relative,
                             2**-100)
    algo_rel_err = sollya.sup(err_int)

    err_int = sollya.supnorm(self.poly_object.get_sollya_object(),
                             sollya.sin(sollya.x), z, sollya.absolute,
                             2**-100)
    algo_abs_err = sollya.sup(err_int)

    if rnd_rel_err is None or str(algo_rel_err) == "error":
        rel_err = float("inf")
    else:
        rel_err = rnd_rel_err + algo_rel_err

    # Same guard as for the relative error: without it a double FPTaylor
    # failure ended in ``None + algo_abs_err``.
    if rnd_abs_err is None or str(algo_abs_err) == "error":
        abs_err = float("inf")
    else:
        abs_err = rnd_abs_err + algo_abs_err

    return rel_err, abs_err
# Build and report a Horner implementation of exp(x) for one error target.
# NOTE(review): eps_target and implem_results are defined outside this
# excerpt; the statements up to implem_results.append(...) may be the body of
# an enclosing loop over error targets -- confirm against the full file.
approx_interval = Interval(-S2**-5, S2**-5)
ctx = MLL_Context(ML_Binary64, approx_interval)
vx = Variable("x", precision=ctx.variableFormat, interval=approx_interval)
# guessing the best degree
poly_degree = int(
    sup(
        sollya.guessdegree(sollya.exp(sollya.x), approx_interval,
                           eps_target)))
# asking sollya to provide the approximation
# (one sollya.doubledouble format per coefficient, poly_degree + 1 in total)
poly_object = Polynomial.build_from_approximation(
    sollya.exp(sollya.x), poly_degree,
    [sollya.doubledouble] * (poly_degree + 1), vx.interval)
print("poly object is {}".format(poly_object))
poly_graph, poly_epsilon = mll_implementpoly_horner(
    ctx, poly_object, eps_target, vx)
print("poly_graph is {}".format(
    poly_graph.get_str(depth=None, display_precision=True)))
print("poly epsilon is {}".format(float(poly_epsilon)))
print("poly accuracy is {}".format(
    get_accuracy_from_epsilon(poly_epsilon)))
# keep everything needed for the summary printed below
implem_results.append(
    (eps_target, poly_degree, poly_object, poly_graph, poly_epsilon))

# summary: floor(log2(epsilon)) obtained for every recorded target
for result in implem_results:
    eps_target, poly_degree, poly_object, poly_graph, poly_epsilon = result
    epsilon_log2 = int(sollya.floor(sollya.log2(poly_epsilon)))
    print("epsilon for eps_target={} (degree={}) is {}".format(
        eps_target, poly_degree, epsilon_log2))
def generate_scheme(self):
    """Build the evaluation scheme of the exponential of input ``x``.

    The scheme is assembled in the following order:

    * special inputs (NaN, signaling NaN, infinities) get a dedicated
      return block;
    * inputs beyond the early overflow / underflow bounds return
      +infinity / +0 directly;
    * the argument is reduced as ``r = x - k * log(2)``, with ``log(2)``
      split into ``log2_hi`` and ``log2_lo`` and ``k`` the nearest integer
      to ``x / log(2)``;
    * a polynomial of ``expm1`` is fitted on ``[-log(2)/2, log(2)/2]``; its
      degree is increased until approximation plus evaluation error (from
      gappa, when installed) is below the goal derived from
      ``self.accuracy``;
    * the result ``2**k * poly`` is built, with separate paths for late
      overflow and late underflow.

    Returns:
        The top-level ``ConditionBlock`` of the scheme.
    """
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info,
               "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        if self.libm_compliant:
            return RaiseReturn(*args, precision=self.precision, **kwords)
        else:
            return Return(kwords["return_value"], precision=self.precision)

    test_nan_or_inf = Test(vx,
                           specifier=Test.IsInfOrNaN,
                           likely=False,
                           debug=debug_multi,
                           tag="nan_or_inf")
    test_nan = Test(vx,
                    specifier=Test.IsNaN,
                    debug=debug_multi,
                    tag="is_nan_test")
    test_positive = Comparison(vx,
                               0,
                               specifier=Comparison.GreaterOrEqual,
                               debug=debug_multi,
                               tag="inf_sign")

    test_signaling_nan = Test(vx,
                              specifier=Test.IsSignalingNaN,
                              debug=debug_multi,
                              tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid,
                       return_value=FP_QNaN(self.precision)))

    # return in case of infinity input
    infty_return = Statement(
        ConditionBlock(
            test_positive,
            Return(FP_PlusInfty(self.precision), precision=self.precision),
            Return(FP_PlusZero(self.precision), precision=self.precision)))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(
            test_signaling_nan, return_snan,
            Return(FP_QNaN(self.precision), precision=self.precision)),
        infty_return)
    # return in case of standard (non-special) input

    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = sollya.ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx,
                                     exp_overflow_bound,
                                     likely=False,
                                     specifier=Comparison.Greater)
    early_overflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact,
                       ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2**precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx,
                                      exp_underflow_bound,
                                      likely=False,
                                      specifier=Comparison.Less)
    early_underflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact,
                       ML_FPE_Underflow,
                       return_value=FP_PlusZero(self.precision)))

    # constant computation
    invlog2 = self.precision.round_sollya_object(1 / log(2), sollya.RN)

    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)),
                          sollya.ceil(sup(interval_fk)))

    # log2_hi keeps few enough bits for k * log2_hi to be flagged exact below
    log2_hi_precision = self.precision.get_field_size() - (
        sollya.ceil(log2(sup(abs(interval_k)))) + 2)
    Log.report(Log.Info, "log2_hi_precision: %d" % log2_hi_precision)
    invlog2_cst = Constant(invlog2, precision=self.precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = self.precision.round_sollya_object(
        log(2) - log2_hi, sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag="unround_k", debug=debug_multi)
    k = NearestInteger(unround_k,
                       precision=self.precision,
                       debug=debug_multi)
    ik = NearestInteger(unround_k,
                        precision=self.precision.get_integer_format(),
                        debug=debug_multi,
                        tag="ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact=True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact=True,
                                 tag="exact_hi",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    exact_lo_part = -k * log2_lo
    exact_lo_part.set_attributes(tag="exact_lo",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    r = exact_hi_part + exact_lo_part
    r.set_tag("r")
    r.set_attributes(debug=debug_multi)

    approx_interval = Interval(-log(2) / 2, log(2) / 2)

    # three sub-intervals used for the dichotomy of the evaluation error
    approx_interval_half = approx_interval / 2
    approx_interval_split = [
        Interval(-log(2) / 2, inf(approx_interval_half)),
        approx_interval_half,
        Interval(sup(approx_interval_half),
                 log(2) / 2)
    ]

    # TODO: should be computed automatically
    exact_hi_interval = approx_interval
    exact_lo_interval = -interval_k * log2_lo

    opt_r = self.optimise_scheme(r, copy={})

    tag_map = {}
    self.opt_engine.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx:
        Variable("x", precision=self.precision, interval=interval_vx),
        tag_map["k"]:
        Variable("k", interval=interval_k, precision=self.precision)
    }

    if is_gappa_installed():
        eval_error = self.gappa_engine.get_eval_error_v2(
            self.opt_engine,
            opt_r,
            cg_eval_error_copy_map,
            gappa_filename="red_arg.g")
    else:
        eval_error = 0.0
        Log.report(Log.Warning,
                   "gappa is not installed in this environnement")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    local_ulp = sup(ulp(sollya.exp(approx_interval), self.precision))
    # FIXME refactor error_goal from accuracy
    Log.report(Log.Info, "accuracy: %s" % self.accuracy)
    if isinstance(self.accuracy, ML_Faithful):
        error_goal = local_ulp
    elif isinstance(self.accuracy, ML_CorrectlyRounded):
        error_goal = S2**-1 * local_ulp
    elif isinstance(self.accuracy, ML_DegradedAccuracyAbsolute):
        error_goal = self.accuracy.goal
    elif isinstance(self.accuracy, ML_DegradedAccuracyRelative):
        error_goal = self.accuracy.goal
    else:
        # NOTE(review): error_goal stays unset on this branch; presumably
        # Log.report(Log.Error, ...) aborts -- confirm.
        Log.report(Log.Error, "unknown accuracy: %s" % self.accuracy)

    # half of the goal is granted to the mathematical approximation
    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info,
               "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = max(
        sup(
            guessdegree(
                expm1(sollya.x) / sollya.x, approx_interval,
                error_goal_approx)) - 1, 2)
    init_poly_degree = poly_degree

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme
    #polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme

    # degree search: retry with a higher degree until the goal is met
    while True:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        precision_list = [1] + [self.precision] * (poly_degree)
        poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
            expm1(sollya.x),
            poly_degree,
            precision_list,
            approx_interval,
            sollya.absolute,
            error_function=error_function)
        Log.report(Log.Info, "polynomial: %s " % poly_object)
        sub_poly = poly_object.sub_poly(start_index=2)
        Log.report(Log.Info, "polynomial: %s " % sub_poly)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(
            Log.Info,
            "\033[33;1m generating polynomial evaluation scheme \033[0m")
        pre_poly = polynomial_scheme_builder(
            poly_object, r, unified_precision=self.precision)
        pre_poly.set_attributes(tag="pre_poly", debug=debug_multi)

        pre_sub_poly = polynomial_scheme_builder(
            sub_poly, r, unified_precision=self.precision)
        pre_sub_poly.set_attributes(tag="pre_sub_poly", debug=debug_multi)

        # 1 + r + (terms of degree >= 2), with r kept as its hi/lo parts
        poly = 1 + (exact_hi_part + (exact_lo_part + pre_sub_poly))
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation
        opt_poly = self.optimise_scheme(poly)
        opt_sub_poly = self.optimise_scheme(pre_sub_poly)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r",
                               precision=self.precision,
                               interval=approx_interval)
        exact_hi_gappa_var = Variable("exact_hi",
                                      precision=self.precision,
                                      interval=exact_hi_interval)
        exact_lo_gappa_var = Variable("exact_lo",
                                      precision=self.precision,
                                      interval=exact_lo_interval)
        vx_gappa_var = Variable("x",
                                precision=self.precision,
                                interval=interval_vx)
        k_gappa_var = Variable("k",
                               interval=interval_k,
                               precision=self.precision)

        sub_poly_error_copy_map = {
            #r.get_handle().get_node(): r_gappa_var,
            #vx.get_handle().get_node():  vx_gappa_var,
            exact_hi_part.get_handle().get_node():
            exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node():
            exact_lo_gappa_var,
            #k.get_handle().get_node(): k_gappa_var,
        }

        poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        if is_gappa_installed():
            sub_poly_eval_error = self.gappa_engine.get_eval_error_v2(
                self.opt_engine,
                opt_sub_poly,
                sub_poly_error_copy_map,
                gappa_filename="%s_gappa_sub_poly.g" % self.function_name)

            dichotomy_map = [
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[0],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[1],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[2],
                },
            ]
            poly_eval_error_dico = self.gappa_engine.get_eval_error_v3(
                self.opt_engine,
                opt_poly,
                poly_error_copy_map,
                gappa_filename="gappa_poly.g",
                dichotomy=dichotomy_map)

            poly_eval_error = max(
                [sup(abs(err)) for err in poly_eval_error_dico])
        else:
            poly_eval_error = 0.0
            sub_poly_eval_error = 0.0
            Log.report(Log.Warning,
                       "gappa is not installed in this environnement")
            Log.report(Log.Info, "stopping autonomous degree research")
            # incrementing polynomial degree to counteract initial decrementation effect
            poly_degree += 1
            break
        Log.report(Log.Info, "poly evaluation error: %s" % poly_eval_error)
        Log.report(Log.Info,
                   "sub poly evaluation error: %s" % sub_poly_eval_error)

        global_poly_error = None
        global_rel_poly_error = None

        # worst relative error over the three dichotomy sub-intervals
        for case_index in range(3):
            poly_error = poly_approx_error + poly_eval_error_dico[
                case_index]
            rel_poly_error = sup(
                abs(poly_error /
                    sollya.exp(approx_interval_split[case_index])))
            if global_rel_poly_error is None or rel_poly_error > global_rel_poly_error:
                global_rel_poly_error = rel_poly_error
                global_poly_error = poly_error
        flag = error_goal > global_rel_poly_error

        if flag:
            break
        else:
            poly_degree += 1

    late_overflow_test = Comparison(ik,
                                    self.precision.get_emax(),
                                    specifier=Comparison.Greater,
                                    likely=False,
                                    debug=debug_multi,
                                    tag="late_overflow_test")
    # Floor division keeps the offset an integer: it is used both as an
    # integer-format Constant and as an ExponentInsertion operand, and true
    # division would produce a float under Python 3.
    overflow_exp_offset = (self.precision.get_emax() -
                           self.precision.get_field_size() // 2)
    diff_k = Subtraction(
        ik,
        Constant(overflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        debug=debug_multi,
        tag="diff_k",
    )
    # the scaling is split in two factors: 2**diff_k and 2**overflow_exp_offset
    late_overflow_result = (ExponentInsertion(
        diff_k, precision=self.precision) * poly) * ExponentInsertion(
            overflow_exp_offset, precision=self.precision)
    late_overflow_result.set_attributes(silent=False,
                                        tag="late_overflow_result",
                                        debug=debug_multi,
                                        precision=self.precision)
    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier=Test.IsInfty, likely=False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)),
        Return(late_overflow_result, precision=self.precision))

    late_underflow_test = Comparison(k,
                                     self.precision.get_emin_normal(),
                                     specifier=Comparison.LessOrEqual,
                                     likely=False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    corrected_exp = Addition(
        ik,
        Constant(underflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        tag="corrected_exp")
    # scale with an exponent raised by underflow_exp_offset, then undo it
    late_underflow_result = (
        ExponentInsertion(corrected_exp, precision=self.precision) *
        poly) * ExponentInsertion(-underflow_exp_offset,
                                  precision=self.precision)
    late_underflow_result.set_attributes(debug=debug_multi,
                                         tag="late_underflow_result",
                                         silent=False)
    test_subnormal = Test(late_underflow_result,
                          specifier=Test.IsSubnormal)
    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value=late_underflow_result)),
        Return(late_underflow_result, precision=self.precision))

    twok = ExponentInsertion(ik,
                             tag="exp_ik",
                             debug=debug_multi,
                             precision=self.precision)
    std_result = twok * poly
    std_result.set_attributes(tag="std_result", debug=debug_multi)
    result_scheme = ConditionBlock(
        late_overflow_test, late_overflow_return,
        ConditionBlock(late_underflow_test, late_underflow_return,
                       Return(std_result, precision=self.precision)))
    std_return = ConditionBlock(
        early_overflow_test, early_overflow_return,
        ConditionBlock(early_underflow_test, early_underflow_return,
                       result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(
        test_nan_or_inf,
        Statement(ClearException() if self.libm_compliant else Statement(),
                  specific_return), std_return)

    return scheme
def generate_scheme(self):
    """Build a two-table approximation of ``self.function`` on ``self.interval``.

    The input is reduced to an unsigned fixed-point index of
    ``alpha + beta + gamma`` bits, sliced into three fields:

    * ``alpha`` (most significant bits) selects the tabulated slope;
    * ``alpha`` and ``beta`` together index the table of initial values;
    * ``alpha`` and ``gamma`` together index the table of linear offsets.

    The output signal ``vr_out`` is the sum of both table reads converted
    to ``self.precision``.  The worst error of that sum against
    ``self.function`` over every index and the total number of table
    entries are printed.

    Returns:
        A one-element list holding ``self.implementation``.
    """
    io_precision = self.precision
    # declaring main input variable
    vx = self.implementation.add_input_signal("x", io_precision)
    # rounding mode input
    rnd_mode = self.implementation.add_input_signal(
        "rnd_mode", rnd_mode_format)

    # size of most significant table index (for linear slope tabulation)
    alpha = self.alpha  # 6
    # size of medium significant table index (for initial value table index LSB)
    beta = self.beta  # 5
    # size of least significant table index (for linear offset tabulation)
    gamma = self.gamma  # 5

    guard_bits = self.guard_bits  # 3

    vx.set_interval(self.interval)

    range_hi = sollya.sup(self.interval)
    range_lo = sollya.inf(self.interval)
    f_hi = self.function(range_hi)
    f_lo = self.function(range_lo)
    # fixed by format used for reduced_x
    range_size = range_hi - range_lo
    range_size_log2 = int(sollya.log2(range_size))
    # the width of the range must be an exact power of two
    assert 2**range_size_log2 == range_size

    print("range_size_log2={}".format(range_size_log2))

    reduced_x = Conversion(BitLogicRightShift(vx - range_lo,
                                              range_size_log2),
                           precision=fixed_point(0,
                                                 alpha + beta + gamma,
                                                 signed=False),
                           tag="reduced_x",
                           debug=debug_fixed)

    alpha_index = get_fixed_slice(reduced_x,
                                  0,
                                  alpha - 1,
                                  align_hi=FixedPointPosition.FromMSBToLSB,
                                  align_lo=FixedPointPosition.FromMSBToLSB,
                                  tag="alpha_index",
                                  debug=debug_std)
    gamma_index = get_fixed_slice(reduced_x,
                                  gamma - 1,
                                  0,
                                  align_hi=FixedPointPosition.FromLSBToLSB,
                                  align_lo=FixedPointPosition.FromLSBToLSB,
                                  tag="gamma_index",
                                  debug=debug_std)

    beta_index = get_fixed_slice(reduced_x,
                                 alpha,
                                 gamma,
                                 align_hi=FixedPointPosition.FromMSBToLSB,
                                 align_lo=FixedPointPosition.FromLSBToLSB,
                                 tag="beta_index",
                                 debug=debug_std)

    # Assuming monotonic function
    f_absmax = max(abs(f_hi), abs(f_lo))
    f_absmin = min(abs(f_hi), abs(f_lo))

    f_msb = int(sollya.ceil(sollya.log2(f_absmax))) + 1
    f_lsb = int(sollya.floor(sollya.log2(f_absmin)))
    storage_lsb = f_lsb - io_precision.get_bit_size() - guard_bits

    f_int_size = f_msb
    f_frac_size = -storage_lsb

    storage_format = fixed_point(f_int_size, f_frac_size, signed=False)
    Log.report(Log.Info, "storage_format is {}".format(storage_format))

    # table of initial value index
    tiv_index = Concatenation(alpha_index,
                              beta_index,
                              tag="tiv_index",
                              debug=debug_std)
    # table of offset value index
    to_index = Concatenation(alpha_index,
                             gamma_index,
                             tag="to_index",
                             debug=debug_std)

    tiv_index_size = alpha + beta
    to_index_size = alpha + gamma

    Log.report(Log.Info, "initial table structures")
    table_iv = ML_NewTable(dimensions=[2**tiv_index_size],
                           storage_precision=storage_format,
                           tag="tiv")
    table_offset = ML_NewTable(dimensions=[2**to_index_size],
                               storage_precision=storage_format,
                               tag="to")

    slope_table = [None] * (2**alpha)
    slope_delta = 1.0 / sollya.SollyaObject(2**alpha)
    delta_u = range_size * slope_delta * 2**-15
    Log.report(Log.Info, "computing slope value")
    for i in range(2**alpha):
        # slope is computed at the middle of range_size interval
        slope_x = range_lo + (i + 0.5) * range_size * slope_delta
        # TODO: gross approximation of derivatives
        f_xpu = self.function(slope_x + delta_u / 2)
        f_xmu = self.function(slope_x - delta_u / 2)
        slope = (f_xpu - f_xmu) / delta_u
        slope_table[i] = slope

    range_rcp_steps = 1.0 / sollya.SollyaObject(2**tiv_index_size)
    Log.report(Log.Info, "computing value for initial-value table")
    for i in range(2**tiv_index_size):
        # the alpha field of the index selects the slope
        slope_index = i // 2**beta
        iv_x = range_lo + i * range_rcp_steps * range_size
        offset_x = 0.5 * range_rcp_steps * range_size
        # initial value is computed so that the piecewise linear
        # approximation intersects the function at iv_x + offset_x
        iv_y = self.function(
            iv_x + offset_x) - offset_x * slope_table[slope_index]
        initial_value = storage_format.round_sollya_object(iv_y)
        table_iv[i] = initial_value

    # determining table of initial value interval
    tiv_min = table_iv[0]
    tiv_max = table_iv[0]
    for i in range(1, 2**tiv_index_size):
        tiv_min = min(tiv_min, table_iv[i])
        tiv_max = max(tiv_max, table_iv[i])
    table_iv.set_interval(Interval(tiv_min, tiv_max))

    offset_step = range_size / S2**(alpha + beta + gamma)
    for i in range(2**alpha):
        Log.report(Log.Info,
                   "computing offset value for sub-table {}".format(i))
        for j in range(2**gamma):
            to_i = i * 2**gamma + j
            offset = slope_table[i] * j * offset_step
            table_offset[to_i] = offset

    # determining table of offset interval
    to_min = table_offset[0]
    to_max = table_offset[0]
    for i in range(1, 2**(alpha + gamma)):
        to_min = min(to_min, table_offset[i])
        to_max = max(to_max, table_offset[i])
    offset_interval = Interval(to_min, to_max)
    table_offset.set_interval(offset_interval)

    initial_value = TableLoad(table_iv,
                              tiv_index,
                              precision=storage_format,
                              tag="initial_value",
                              debug=debug_fixed)

    offset_precision = get_fixed_type_from_interval(offset_interval, 16)
    print("offset_precision is {} ({} bits)".format(
        offset_precision, offset_precision.get_bit_size()))
    table_offset.get_precision().storage_precision = offset_precision

    # rounding table value
    # (entry 0 is skipped: it was computed with j == 0, i.e. a zero offset)
    for i in range(1, 2**(alpha + gamma)):
        table_offset[i] = offset_precision.round_sollya_object(
            table_offset[i])

    offset_value = TableLoad(table_offset,
                             to_index,
                             precision=offset_precision,
                             tag="offset_value",
                             debug=debug_fixed)

    Log.report(
        Log.Verbose,
        "initial_value's interval: {}, offset_value's interval: {}".format(
            evaluate_range(initial_value), evaluate_range(offset_value)))

    final_add = initial_value + offset_value
    round_bit = final_add  # + FixedPointPosition(final_add, io_precision.get_bit_size(), align=FixedPointPosition.FromMSBToLSB)

    vr_out = Conversion(initial_value + offset_value,
                        precision=io_precision,
                        tag="vr_out",
                        debug=debug_fixed)

    self.implementation.add_output_signal("vr_out", vr_out)

    # Approximation error evaluation
    approx_error = 0.0
    for i in range(2**alpha):
        for j in range(2**beta):
            tiv_i = (i * 2**beta + j)
            # = range_lo + tiv_i * range_rcp_steps * range_size
            iv = table_iv[tiv_i]
            for k in range(2**gamma):
                to_i = i * 2**gamma + k
                offset = table_offset[to_i]
                approx_value = offset + iv
                table_x = range_lo + range_size * (
                    (i * 2**beta + j) * 2**gamma + k) / S2**(alpha + beta +
                                                             gamma)
                # The reference is the tabulated function itself, as for
                # every table above, not a hard-coded reciprocal.
                local_error = abs(self.function(table_x) - approx_value)
                approx_error = max(approx_error, local_error)
    error_log2 = float(sollya.log2(approx_error))
    print("approx_error is {}, error_log2 is {}".format(
        float(approx_error), error_log2))

    # table size
    table_iv_size = 2**(alpha + beta)
    table_offset_size = 2**(alpha + gamma)
    print("tables' size are {} entries".format(table_iv_size +
                                               table_offset_size))

    return [self.implementation]