def __init__(self,
             multiply_by_parameter_scale=True,
             learning_rate=None,
             decay_rate=None,
             beta1=0.0,
             clipping_threshold=1.0,
             factored=True,
             simulated_quantize_bits=None,
             parameter_encoding=None,
             use_locking=False,
             name="Adafactor",
             epsilon1=1e-30,
             epsilon2=1e-3):
  """Construct a new Adafactor optimizer.

  See class comment.

  Args:
    multiply_by_parameter_scale: a boolean - if True, scale the update by the
      root-mean-square of the parameter.
    learning_rate: an optional Scalar or callable.
    decay_rate: an optional Scalar.
    beta1: a float value between 0 and 1 - momentum coefficient for the
      first-moment estimator (0.0 disables momentum).
    clipping_threshold: an optional float >= 1 - threshold for update clipping.
    factored: a boolean - whether to use the factored second-moment estimator
      for 2d variables.
    simulated_quantize_bits: train with simulated quantized parameters
      (experimental).
    parameter_encoding: a ParameterEncoding object to use in the case of
      bfloat16 variables.
    use_locking: If True, use locks for update operations.
    name: Optional name for the operations created when applying gradients.
      Defaults to "Adafactor".
    epsilon1: Regularization constant for the squared gradient.
    epsilon2: Regularization constant for the parameter scale.
  """
  super(AdafactorOptimizer, self).__init__(use_locking, name)
  self._multiply_by_parameter_scale = multiply_by_parameter_scale
  if learning_rate is None:
    # Fall back to the built-in default schedule, which depends on whether
    # updates are scaled by parameter scale.
    learning_rate = self._learning_rate_default(multiply_by_parameter_scale)
  self._learning_rate = learning_rate
  if decay_rate is None:
    decay_rate = self._decay_rate_default()
  self._decay_rate = decay_rate
  self._beta1 = beta1
  self._clipping_threshold = clipping_threshold
  self._factored = factored
  self._simulated_quantize_bits = simulated_quantize_bits
  self._parameter_encoding = parameter_encoding
  # Deterministic pseudo-random noise derived from the step number, used for
  # randomized rounding when simulating quantization or encoding parameters.
  self._quantization_noise = quantization.noise_from_step_num()
  self._epsilon1 = epsilon1
  self._epsilon2 = epsilon2
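# Usage sketch (illustrative, not part of the original class): this assumes the
# constructor above belongs to AdafactorOptimizer in
# tensor2tensor.utils.adafactor and that a TF1-style graph/session workflow is
# used; the toy loss and variable names below are hypothetical.
#
#   import tensorflow.compat.v1 as tf
#   from tensor2tensor.utils import adafactor
#
#   tf.disable_eager_execution()
#
#   # Toy objective: drive a single weight matrix toward zero.
#   w = tf.get_variable("w", shape=[4, 4],
#                       initializer=tf.random_normal_initializer())
#   loss = tf.reduce_sum(tf.square(w))
#
#   optimizer = adafactor.AdafactorOptimizer(
#       multiply_by_parameter_scale=True,  # scale updates by parameter RMS
#       learning_rate=None,                # use the built-in default schedule
#       beta1=0.0,                         # no first-moment accumulator
#       clipping_threshold=1.0,
#       factored=True)                     # factored 2nd moments for 2d vars
#   train_op = optimizer.minimize(loss)
#
#   with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for _ in range(5):
#       sess.run(train_op)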