def testScaleGradientsCheckNumerics(self):
  """Checks ScaleGradients when enable_check_numerics=True.

  Builds a task with one scalar variable 'a', pairs it with a NaN gradient,
  runs the first learner's ScaleGradients on it, and expects evaluating the
  results to raise InvalidArgumentError matching 'is not finite'.
  """
  # FLAGS is process-global. Restore the previous value once the test
  # finishes so the override does not leak into tests that run afterwards.
  self.addCleanup(setattr, FLAGS, 'enable_check_numerics',
                  FLAGS.enable_check_numerics)
  FLAGS.enable_check_numerics = True

  p = self.TestParams()
  p.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task = p.Instantiate()
  task.CreateVariable(
      'a',
      py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
  var_a = task.theta.a
  # Make a NaN gradient.
  var_grads = py_utils.NestedMap(a=(var_a, 0. * tf.log(0.)))
  scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    tf.global_variables_initializer().run()
    # The context manager exits at the first statement that raises, so any
    # statement after that one inside the block is not executed.
    with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
                                 'is not finite'):
      self.assertTrue(scaled_grads_map.has_nan_or_inf.eval())
      self.assertEqual(0., scaled_grads_map.grad_scale.eval())
      # The final gradient must be finite.
      self.assertFalse(
          tf.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
      self.assertTrue(
          tf.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
def QuantizeTensors(self, t_name, ts, eval_only=False):
  """Fake-quantizes a group of tensors that share a single min/max range.

  When p.is_eval is set, every tensor is quantized against the 'min' and
  'max' state variables looked up for t_name. Otherwise the range is computed
  from the tensors themselves (always including zero), pushed to the
  accumulator registered for t_name, and used for quantization unless
  eval_only is set.

  Args:
    t_name: name of the tensor group; used to look up the state variables and
      the accumulator, and to name the histogram summaries.
    ts: list of tensors to quantize together.
    eval_only: if True, the training path only records the range and returns
      the tensors unquantized. Not consulted when p.is_eval is set.

  Returns:
    A list with one output per element of ts, in the same order.
  """
  p = self.params

  # Always straddle a real zero point.
  if p.is_eval:
    # At eval/inference time, use the memorized range.
    # Important: these variables are only fetched on this path so that
    # training mode does not capture them unnecessarily.
    range_min = self._GetQStateVar(t_name, 'min')
    range_max = self._GetQStateVar(t_name, 'max')
    outputs = []
    for tensor in ts:
      outputs.append(
          self._MaybeFakeQuant(tensor, range_min, range_max, num_bits=p.bits))
    return outputs

  # Training: derive the range from the current batch.
  accumulator_name = self._GetAccumulatorNameForTensor(t_name)

  # Starting both bounds at 0.0 guarantees min <= 0 <= max across all tensors.
  lo = 0.0
  hi = 0.0
  for tensor in ts:
    lo = tf.minimum(tf.reduce_min(tensor), lo)
    hi = tf.maximum(tf.reduce_max(tensor), hi)

  # Record the observed range as new accumulator state.
  new_state = tf.stack([1.0, lo, hi])
  self.accumulators[accumulator_name].Update(new_state)

  outputs = []
  for index, tensor in enumerate(ts):
    # With eval_only the range is still recorded above, but the tensor is
    # passed through unquantized.
    result = tensor
    if not eval_only:
      # Early in training, unstable ranges can make fake_quant produce NaNs;
      # fall back to the unquantized value wherever that happens.
      fake_quant = self._MaybeFakeQuant(tensor, lo, hi, num_bits=p.bits)
      # TODO(laurenzo): Plumb the NaN mask through state and report.
      nan_mask = tf.is_nan(fake_quant)
      result = tf.where(nan_mask, tensor, fake_quant)
    outputs.append(result)
    # The histogram is taken of the input tensor, not the quantized result.
    summary_utils.histogram(
        '%s/%s_%d' % (self._qvars_scope.name, t_name, index), tensor)
  return outputs
def QuantizeWeight(self, w):
  """Fake-quantizes a weight tensor using its own value range.

  The range is the min/max of w, widened so that it always covers
  [-p.quantize_weight_epsilon, p.quantize_weight_epsilon].

  Args:
    w: the weight tensor to quantize.

  Returns:
    The fake-quantized tensor when p.is_eval is set. Otherwise the
    fake-quantized tensor with every NaN element replaced by the
    corresponding element of w.
  """
  p = self.params
  epsilon = p.quantize_weight_epsilon

  range_min = tf.reduce_min(w)
  range_max = tf.reduce_max(w)
  # NOTE: We force a small, non-zero range because otherwise, zero weights
  # can cause downstream inference engines to blow up.
  range_min = tf.minimum(range_min, -epsilon)
  range_max = tf.maximum(range_max, epsilon)

  fake_quant = self._MaybeFakeQuant(w, range_min, range_max, num_bits=p.bits)
  if not p.is_eval:
    # Early in training, unstable ranges can make fake_quant produce NaNs;
    # keep the original weight wherever that happens.
    nan_mask = tf.is_nan(fake_quant)
    return tf.where(nan_mask, w, fake_quant)
  return fake_quant
def testScaleGradients(self):
  """Checks ScaleGradients on a finite gradient, check_numerics disabled.

  Builds a task with one scalar variable 'a', pairs it with a gradient of
  ones, and verifies that the first learner's ScaleGradients reports a
  grad_scale of 1.0 and a finite, non-NaN final gradient.
  """
  # FLAGS is process-global. Restore the previous value once the test
  # finishes so the override does not leak into tests that run afterwards.
  self.addCleanup(setattr, FLAGS, 'enable_check_numerics',
                  FLAGS.enable_check_numerics)
  # NOTE(review): set before the graph is constructed, as in
  # testScaleGradientsCheckNumerics; presumably the flag is consulted while
  # ScaleGradients builds its ops, so assigning it afterwards would have no
  # effect on them -- confirm against py_utils.
  FLAGS.enable_check_numerics = False

  p = self.TestParams()
  p.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task = p.Instantiate()
  task.CreateVariable(
      'a',
      py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
  var_a = task.theta.a
  var_grads = py_utils.NestedMap(a=(var_a, tf.ones_like(var_a)))
  scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    tf.global_variables_initializer().run()
    self.assertEqual(1.0, scaled_grads_map.grad_scale.eval())
    # The final gradient must be finite.
    self.assertFalse(tf.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
    self.assertTrue(
        tf.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
def ScaleGradients(self, var_grads, gradient_adjuster=None):
  """Scales or clips gradients according to the training params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_adjuster: if not None, a callable that takes var_grads and
      returns the var_grads to use from that point on. It is applied after
      the global gradient norm has been computed.

  Returns:
    A `.NestedMap` containing:

    - has_nan_or_inf: a boolean tensor that is true when
      py_utils.HasNanOrInfGradient flags the (possibly adjusted) gradients,
      or when the global gradient norm is NaN or Inf.
    - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
      where gradients have already been scaled or clipped.
    - grad_scale: the gradient scale. 0 if gradient updates should be
      skipped for the step. (Optional, only returned in case global norm
      clipping is used, i.e. when p.clip_gradient_single_norm_to_value is
      not set.)

  Raises:
    ValueError: if both p.clip_gradient_single_norm_to_value and
      p.clip_gradient_norm_to_value are set.
  """
  p = self.params

  # Add a norm summary for every (var, grad) pair.
  for name, vg in var_grads.FlattenItems():
    summary_utils.AddNormSummary(name + '/' + p.name,
                                 py_utils.NestedMap(s=vg))

  # Global norms over all gradients and all variables. Note that
  # all_grad_norm may be nan, which may cause grad_scale to be nan.
  grads = [g for (_, g) in py_utils.NestedMap(child=var_grads).Flatten()]
  all_grad_norm = tf.sqrt(py_utils.SumSquared(grads))
  variables = [
      v for (v, _) in py_utils.NestedMap(child=var_grads).Flatten()
  ]
  all_var_norm = tf.sqrt(py_utils.SumSquared(variables))

  norm_is_nan = tf.is_nan(all_grad_norm)
  norm_is_inf = tf.is_inf(all_grad_norm)
  grad_norm_is_nan_or_inf = tf.logical_or(norm_is_nan, norm_is_inf)

  # Optional gradient adjustment. Note that this happens after computing
  # all_grad_norm.
  if gradient_adjuster is not None:
    tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
    var_grads = gradient_adjuster(var_grads)

  # Handles NaN/Inf gradients.
  any_grad_bad = py_utils.HasNanOrInfGradient(var_grads)
  # Grad norm can still be inf even if none of the individual grad is inf.
  has_nan_or_inf = tf.logical_or(any_grad_bad, grad_norm_is_nan_or_inf)

  result = py_utils.NestedMap()
  if not p.clip_gradient_single_norm_to_value:
    # Global scaling: one multiplier applied to every gradient.
    grad_scale = self._GetGlobalGradScale(all_grad_norm, has_nan_or_inf)
    unit_weight = tf.constant
    self._AddEvalMetric('grad_norm/all', all_grad_norm, unit_weight(1.0))
    self._AddEvalMetric('var_norm/all', all_var_norm, unit_weight(1.0))
    self._AddEvalMetric('grad_scale_all', grad_scale, unit_weight(1.0))
    final_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
    result.grad_scale = grad_scale
  elif p.clip_gradient_norm_to_value:
    # Currently using both types of clipping simultaneously is unsupported.
    raise ValueError(
        'Cannot use clip_gradient_single_norm_to_value=%f and '
        'clip_gradient_norm_to_value=%f.' %
        (p.clip_gradient_single_norm_to_value, p.clip_gradient_norm_to_value))
  else:
    # Clipping driven by p.clip_gradient_single_norm_to_value; no grad_scale
    # is produced on this path.
    final_var_grads = py_utils.ApplyGradNormCliping(
        var_grads, p.clip_gradient_single_norm_to_value)

  result.has_nan_or_inf = has_nan_or_inf
  result.final_var_grads = final_var_grads
  return result