def testDiscreteBottleneckVQCond(self):
    """Checks bottleneck output shapes when `cond` is a false boolean tensor.

    Feeds an all-zero input and a zero-initialised codebook through
    `discretization.discrete_bottleneck`, then verifies the shapes of the
    dense and hot outputs and that the codebook still evaluates to zeros.
    """
    hidden_size = 60
    z_size = 4
    codebook_size = 2**z_size
    x = tf.zeros(shape=[100, 1, hidden_size], dtype=tf.float32)
    with tf.variable_scope("test2", reuse=tf.AUTO_REUSE):
        means = tf.get_variable(
            "means",
            shape=[1, 1, codebook_size, hidden_size],
            initializer=tf.constant_initializer(0.),
            dtype=tf.float32)
        # Single-element lists: the bottleneck is called with num_blocks=1
        # and receives one EMA count / EMA means variable.
        ema_count = [
            tf.get_variable(
                "ema_count", [1, codebook_size],
                initializer=tf.constant_initializer(0),
                trainable=False)
        ]
        with tf.colocate_with(means):
            ema_means = [
                tf.get_variable(
                    "ema_means",
                    initializer=means.initialized_value()[0],
                    trainable=False)
            ]
        cond = tf.cast(0.0, tf.bool)
        x_means_dense, x_means_hot, _, _, _ = (
            discretization.discrete_bottleneck(
                x,
                hidden_size,
                z_size,
                32,
                means=means,
                num_blocks=1,
                cond=cond,
                ema_means=ema_means,
                ema_count=ema_count,
                name="test2"))
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            fetched = sess.run([x_means_dense, x_means_hot])
            x_means_dense_eval, x_means_hot_eval = fetched
            means_eval = sess.run(means)
        self.assertEqual(x_means_dense_eval.shape, (100, 1, hidden_size))
        self.assertEqual(x_means_hot_eval.shape, (100, 1))
        expected_means = np.zeros((1, 1, codebook_size, hidden_size))
        self.assertAllClose(means_eval, expected_means)
def preprocess_device_grads(self, device_grads):
    """All-reduces the per-device gradients and re-pairs them with variables.

    Args:
      device_grads: A list with one entry per device; each entry is a list
        of `(gradient, variable)` pairs.

    Returns:
      A tuple `(self.benchmark_cnn.devices, reduced_device_grads)` where
      `reduced_device_grads` has the same layout as `device_grads` but with
      each gradient replaced by its all-reduced counterpart.

    Side effects:
      Stores the warm-up ops returned by the all-reduce algorithm in
      `self._warmup_ops`. When auto loss scaling is enabled, also sets
      `self.grad_has_inf_nan` to a boolean tensor that is true if any
      reduced gradient contains an inf or NaN.
    """
    params = self.benchmark_cnn.params
    compact_grads = params.use_fp16 and params.compact_gradient_transfer
    defer_grads = params.variable_consistency == 'relaxed'

    # Strip the variables; only the gradients take part in the all-reduce.
    grads_to_reduce = []
    for grad_vars in device_grads:
        grads_to_reduce.append([grad for grad, _ in grad_vars])

    algorithm = batch_allreduce.algorithm_from_params(params)
    reduced_grads, self._warmup_ops = algorithm.batch_all_reduce(
        grads_to_reduce, params.gradient_repacking, compact_grads,
        defer_grads, params.xla_compile)

    if self.benchmark_cnn.enable_auto_loss_scale:
        # Check for infs or nans
        is_finite_list = []
        with tf.name_scope('check_for_inf_and_nan'):
            for tower_grads in reduced_grads:
                with tf.colocate_with(tower_grads[0]):
                    # TODO(tanmingxing): Create fused op that takes in a list
                    # of tensors as input and returns scalar boolean True if
                    # there are any infs/nans.
                    per_grad_finite = [
                        tf.reduce_all(tf.is_finite(grad))
                        for grad in tower_grads
                    ]
                    is_finite_list.append(tf.reduce_all(per_grad_finite))
            all_finite = tf.reduce_all(is_finite_list)
            self.grad_has_inf_nan = tf.logical_not(all_finite)

    # Put each reduced gradient back next to the variable it belongs to.
    reduced_device_grads = []
    for grads, grad_vars in zip(reduced_grads, device_grads):
        reduced_device_grads.append(
            [(grad, var) for grad, (_, var) in zip(grads, grad_vars)])
    return self.benchmark_cnn.devices, reduced_device_grads
def learning_rate_schedule(params, global_step):
    """Builds the learning rate: batch scaling, linear warmup, step decay.

    The base rate is scaled by `global_batch_size / DEFAULT_BATCH_SIZE`.
    Before `lr_warmup_step` the rate ramps linearly from zero; afterwards it
    is the scaled rate multiplied by 1.0, then 0.1 from `first_lr_drop_step`,
    then 0.01 from `second_lr_drop_step`.

    Args:
      params: A dictionary of hyperparameters. Reads the keys
        'base_learning_rate', 'lr_warmup_step', 'first_lr_drop_step',
        'second_lr_drop_step' and 'global_batch_size'.
      global_step: A tensor representing current global step.

    Returns:
      A tensor representing current learning rate.
    """
    scaling_factor = params['global_batch_size'] / constants.DEFAULT_BATCH_SIZE
    adjusted_learning_rate = params['base_learning_rate'] * scaling_factor
    lr_warmup_step = params['lr_warmup_step']

    # (step boundary, multiplier used from that boundary on, op name).
    # Applied in order; each stage overrides the previous one once the
    # global step reaches its boundary.
    stages = (
        (lr_warmup_step, 1.0, "learning_rate_schedule_1"),
        (params['first_lr_drop_step'], 0.1, "learning_rate_schedule_2"),
        (params['second_lr_drop_step'], 0.01, "learning_rate"),
    )

    with tf.colocate_with(global_step):
        warmup_fraction = (
            tf.cast(global_step, dtype=tf.float32) / lr_warmup_step)
        learning_rate = warmup_fraction * adjusted_learning_rate
        for boundary, multiplier, op_name in stages:
            learning_rate = tf.where(
                global_step < boundary,
                learning_rate,
                adjusted_learning_rate * multiplier,
                name=op_name)
    return learning_rate
def _finish(self, update_ops, name_scope):
    """Updates beta_power variables every n batches and incrs counter.

    Args:
      update_ops: List of ops that must run before the accumulators and the
        iteration counter are touched.
      name_scope: Name given to the returned grouped op.

    Returns:
      An op grouping `update_ops`, the counter update and the conditional
      beta-power update.
    """
    iter_ = self._get_iter_variable()
    beta1_power, beta2_power = self._get_beta_accumulators()

    def update_beta_op():
        # Multiply each accumulator by its beta, mirroring a regular Adam step.
        beta_updates = []
        for power, beta_t in ((beta1_power, self._beta1_t),
                              (beta2_power, self._beta2_t)):
            beta_updates.append(
                power.assign(power * beta_t, use_locking=self._use_locking))
        return tf.group(*beta_updates)

    with tf.control_dependencies(update_ops):
        with tf.colocate_with(iter_):
            # The accumulators only advance when the counter reads zero.
            counter_is_zero = tf.equal(iter_, 0)
            maybe_update_beta = tf.cond(
                counter_is_zero, update_beta_op, tf.no_op)
            with tf.control_dependencies([maybe_update_beta]):
                # TODO(cuong): It is suboptimal here because we have to cast
                # twice (float to int, and then int to float)
                next_iter_int = tf.cast(iter_ + 1.0, tf.int32)
                wrapped_iter = tf.mod(next_iter_int, self._n_t)
                update_iter = iter_.assign(
                    tf.cast(wrapped_iter, tf.float32),
                    use_locking=self._use_locking)
    grouped = update_ops + [update_iter, maybe_update_beta]
    return tf.group(*grouped, name=name_scope)
def __init__(self, hparams):
    """Derives block sizes from `hparams` and creates the codebook variables.

    Writes `z_size_per_residual`, `block_dim` and `block_v_size` back onto
    `hparams`, creates the `means` variable, and, when `hparams.ema` is
    truthy, the non-trainable `ema_count` and `ema_means` variables.

    Args:
      hparams: Hyperparameter object. Reads `z_size`, `num_residuals`,
        `model_d`, `num_blocks` and `ema`.
    """
    self.hparams = hparams
    hp = self.hparams
    print("self.hparams.z_size", hp.z_size)
    # Set the discretization bottleneck specific things here
    hp.z_size_per_residual = hp.z_size // hp.num_residuals
    print("self.hparams.num_residuals", hp.num_residuals)
    hp.block_dim = int(hp.model_d // hp.num_blocks)
    # True division in the exponent, truncated to an int afterwards.
    hp.block_v_size = int(2**(hp.z_size_per_residual / hp.num_blocks))

    codebook_shape = [hp.num_blocks, hp.block_v_size, hp.block_dim]
    self.means = tf.get_variable(
        name="means",
        shape=codebook_shape,
        initializer=tf.initializers.variance_scaling(distribution="uniform"))

    # Create the shadow variables if we are using EMA
    if not hp.ema:
        return
    self.ema_count = tf.get_variable(
        "ema_count", [hp.num_blocks, hp.block_v_size],
        initializer=tf.constant_initializer(0),
        trainable=False)
    with tf.colocate_with(self.means):
        # Shadow copy starts from the initial value of `means`.
        self.ema_means = tf.get_variable(
            "ema_means",
            initializer=self.means.initialized_value(),
            trainable=False)
def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True): """Applies a function to each tensor in `all_device_tensors`. A new list of lists of tensors is returned, where every tensor in `all_device_tensors` has had `apply_func` called on it. `all_device_tensors` is not modified. Args: all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is a tensor where `i` is the device index and `j` is the tensor index. apply_func: A function taking in three arguments: tensor, device_index, tensor_index, and returning a modified tensor. `tensor` is `all_device_tensors[device_index][tensor_index]`. colocate: If True, apply_func will be run under context manager colocated with it's input tensor. Returns: A list in the same form as `all_device_tensors`, except each tensor has had `apply_func` called on it. """ new_all_device_tensors = [] for device_index, device_tensors in enumerate(all_device_tensors): new_device_tensors = [] for tensor_index, t in enumerate(device_tensors): if colocate: with tf.colocate_with(t): new_t = apply_func(t, device_index, tensor_index) else: new_t = apply_func(t, device_index, tensor_index) new_device_tensors.append(new_t) new_all_device_tensors.append(new_device_tensors) return new_all_device_tensors
def _transform_2(image):
    """Decodes a JPEG, converts it to float32 and resizes it bicubically.

    `height` and `width` are read from the enclosing scope.

    Args:
      image: Encoded JPEG input passed to `tf.image.decode_jpeg`.

    Returns:
      A tuple `(resized_image, crop_default)` where `crop_default` is the
      constant `[0.0, 0.0, 1.0, 1.0]`.
    """
    with tf.colocate_with(image):
        crop_default = tf.constant([0.0, 0.0, 1.0, 1.0])
        decoded = tf.image.decode_jpeg(image, channels=3)
        as_float = tf.image.convert_image_dtype(decoded, dtype=tf.float32)
        # resize_bicubic works on a batch, so wrap and unwrap a single image.
        resized_batch = tf.image.resize_bicubic([as_float], [height, width])
        return resized_batch[0], crop_default
def _get_grads_lists_exact(self, tensors): if self.mat_type == "Fisher": # pylint: disable=g-long-lambda mult_func = (lambda loss, index: loss. multiply_fisher_factor_replicated_one_hot(index)) inner_shape_func = lambda loss: loss.fisher_factor_inner_static_shape elif self.mat_type == "GGN": # pylint: disable=g-long-lambda mult_func = (lambda loss, index: loss. multiply_ggn_factor_replicated_one_hot(index)) inner_shape_func = lambda loss: loss.fisher_ggn_inner_static_shape # Loop over all coordinates of all losses. grads_all = [] for loss in self.layers.losses: with tf.colocate_with(self.layers.loss_colocation_ops[loss]): for index in np.ndindex(*inner_shape_func(loss)[1:]): value = mult_func(loss, index) coeff = tf.cast(self.layers.loss_coeffs[loss], dtype=value.dtype) transformed_one_hot = tf.sqrt(coeff) * value grads_flat = tf.gradients(loss.inputs, nest.flatten(tensors), grad_ys=transformed_one_hot, colocate_gradients_with_ops=self. _colocate_gradients_with_ops) grads_all.append(nest.pack_sequence_as( tensors, grads_flat)) return tuple(zip(*grads_all))
def _finish(self, update_ops, name_scope):
    """Advances the beta power accumulators after the variable updates.

    Args:
      update_ops: List of ops that must run before the accumulators are
        updated.
      name_scope: Name given to the returned grouped op.

    Returns:
      An op grouping `update_ops` with the two accumulator updates.
    """
    # Update the power accumulators.
    with tf.control_dependencies(update_ops):
        beta1_power, beta2_power = self._get_beta_accumulators()
        with tf.colocate_with(beta1_power):
            power_updates = []
            for power, beta_t in ((beta1_power, self._beta1_t),
                                  (beta2_power, self._beta2_t)):
                power_updates.append(
                    power.assign(
                        power * beta_t, use_locking=self._use_locking))
    return tf.group(*update_ops + power_updates, name=name_scope)
def _multiply_across_losses(self, mult_func, vecs, coeff_mode="regular"): products = [] for loss, vec in zip(self._losses, vecs): with tf.colocate_with(self._loss_colocation_ops[loss]): if coeff_mode == "regular": multiplier = self._get_loss_coeff(loss) elif coeff_mode == "sqrt": multiplier = tf.sqrt(self._get_loss_coeff(loss)) val = mult_func(loss, vec) products.append(tf.cast(multiplier, dtype=val.dtype) * val) return tuple(products)
def model_fn(features, labels, mode, params):
    """Builds the TPUEstimatorSpec for evaluation or training.

    Args:
      features: Input features forwarded to the loss function.
      labels: Unused.
      mode: A `tf.estimator.ModeKeys` value.
      params: Unused.

    Returns:
      A `TPUEstimatorSpec` for `mode`.
    """
    del labels
    del params

    # NOTE(review): `eval` on a flag value executes arbitrary code; this is
    # only acceptable while FLAGS.loss_type is operator-controlled.
    loss_fn = eval(FLAGS.loss_type)
    total_loss, monitor_dict = loss_fn(features, mode)

    #### Check model parameters
    trainable_vars = tf.trainable_variables()
    num_params = sum(np.prod(v.shape) for v in trainable_vars)
    tf.logging.info("#params: %d", num_params)

    if FLAGS.verbose:
        # Left-align the variable names to the width of the longest one.
        name_width = max(len(v.name) for v in trainable_vars)
        format_str = "{{:<{0}s}}\t{{}}".format(name_width)
        for v in trainable_vars:
            tf.logging.info(format_str.format(v.name, v.get_shape()))

    #### Evaluation mode
    if mode == tf.estimator.ModeKeys.EVAL:
        #### Reduce sum losses from all TPU cores
        with tf.colocate_with(total_loss):
            total_loss = tf.tpu.cross_replica_sum(total_loss)
            total_loss = (
                total_loss / FLAGS.num_hosts / FLAGS.num_core_per_host)
        metric_loss = tf.reshape(total_loss, [1])

        #### Constructing evaluation TPUEstimatorSpec with new cache.
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=(metric_fn, [metric_loss]))

    #### Get the train op
    train_op, optim_dict = optimization.get_train_op(total_loss)
    monitor_dict.update(optim_dict)

    #### Customized initial checkpoint
    scaffold_fn = model_utils.custom_initialization(FLAGS.init_global_vars)

    #### Creating host calls
    host_call = model_utils.construct_scalar_host_call(
        monitor_dict=monitor_dict,
        model_dir=FLAGS.model_dir,
        prefix="train/",
        reduce_fn=tf.reduce_mean)

    #### Constructing training TPUEstimatorSpec with new cache.
    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
def init_vq_bottleneck(bottleneck_size, hidden_size):
    """Creates the lookup-table variables for a VQ bottleneck.

    Args:
      bottleneck_size: Number of rows of the `means` table and length of
        `ema_count`.
      hidden_size: Number of columns of the `means` table.

    Returns:
      A tuple `(means, ema_means, ema_count)`. `ema_means` and `ema_count`
      are non-trainable; `ema_means` starts from the initial value of
      `means` and `ema_count` starts at zero.
    """
    table_shape = [bottleneck_size, hidden_size]
    means = tf.get_variable(
        name="means",
        shape=table_shape,
        initializer=tf.uniform_unit_scaling_initializer())
    ema_count = tf.get_variable(
        name="ema_count",
        shape=[bottleneck_size],
        initializer=tf.constant_initializer(0),
        trainable=False)
    with tf.colocate_with(means):
        initial_means = means.initialized_value()
        ema_means = tf.get_variable(
            name="ema_means", initializer=initial_means, trainable=False)
    return means, ema_means, ema_count
def _create_slots(self, var_list): for v in var_list: with tf.colocate_with(v): if self._momentum > 0: self._zeros_slot(v, "momentum", self._name) shape = np.array(v.get_shape()) var_rank = len(shape) # We special case vectors and scalars as we can run the diagonal adagrad # update for those parameters. if var_rank > 1: for i, d in enumerate(shape): d_tensor = tf.convert_to_tensor(d) diag_init = tf.zeros([d_tensor]) _ = self._get_or_make_slot(v, diag_init, "accumulator_" + str(i), self._name) else: _ = self._zeros_slot(v, "accumulator", self._name)
def _prepare_variables(self):
    """Prepare Variables for YellowFin.

    Creates the moving averager, the per-variable squared gradients and
    squared norms, and the moving average of those norms. On return
    `self._grad_norm_squared` and `self._grad_norm_squared_avg` hold the
    sums over all variables.

    Returns:
      Grad**2, Norm, Norm**2, Mean(Norm**2) ops
    """
    self._moving_averager = tf.train.ExponentialMovingAverage(
        decay=self._beta, zero_debias=self._zero_debias)

    # Ops collected here are grouped into the returned op.
    ops_to_group = []

    # Gradient squared, skipping variables that have no gradient.
    squared_grads = []
    for var, grad in zip(self._vars, self._grad):
        if grad is not None:
            with tf.colocate_with(var):
                squared_grads.append(tf.square(grad))
    self._grad_squared = squared_grads

    # Norm squared.
    self._grad_norm_squared = [
        tf.reduce_sum(squared) for squared in self._grad_squared
    ]

    if self._sparsity_debias:
        ops_to_group.append(self._grad_sparsity())

    # The following running average on squared norm of gradient
    # is shared by grad_var and dist_to_opt
    per_var_norms = self._grad_norm_squared
    avg_op = self._moving_averager.apply(per_var_norms)
    with tf.control_dependencies([avg_op]):
        per_var_norm_avgs = [
            self._moving_averager.average(norm) for norm in per_var_norms
        ]
        self._grad_norm_squared = tf.add_n(per_var_norms)
        self._grad_norm_squared_avg = tf.add_n(per_var_norm_avgs)
    ops_to_group.append(avg_op)
    return tf.group(*ops_to_group)
def _get_transformed_random_signs(self): if self.mat_type == "Fisher": mult_func = lambda loss, index: loss.multiply_fisher_factor(index) inner_shape_func = lambda loss: loss.fisher_factor_inner_shape elif self.mat_type == "GGN": mult_func = lambda loss, index: loss.multiply_ggn_factor(index) inner_shape_func = lambda loss: loss.ggn_factor_inner_shape transformed_random_signs = [] for loss in self.layers.losses: with tf.colocate_with(self.layers.loss_colocation_ops[loss]): value = mult_func( loss, utils.generate_random_signs(inner_shape_func(loss), dtype=loss.dtype)) coeff = tf.cast(self.layers.loss_coeffs[loss], dtype=value.dtype) transformed_random_signs.append(tf.sqrt(coeff) * value) return transformed_random_signs
def _finish(self, update_ops, name_scope):
    """Updates beta_power variables every n batches and incrs counter.

    Args:
      update_ops: List of ops that must run before the accumulators and the
        iteration counter are touched.
      name_scope: Name given to the returned grouped op.

    Returns:
      An op grouping `update_ops`, the counter update and the conditional
      beta-power update.
    """
    iter_ = self._get_iter_variable()
    beta1_power, beta2_power = self._get_beta_accumulators()

    def update_beta_op():
        # Multiply each accumulator by its beta.
        beta_updates = []
        for power, beta_t in ((beta1_power, self._beta1_t),
                              (beta2_power, self._beta2_t)):
            beta_updates.append(
                power.assign(power * beta_t, use_locking=self._use_locking))
        return tf.group(*beta_updates)

    with tf.control_dependencies(update_ops):
        with tf.colocate_with(iter_):
            # The accumulators only advance when the counter reads zero.
            counter_is_zero = tf.equal(iter_, 0)
            maybe_update_beta = tf.cond(
                counter_is_zero, update_beta_op, tf.no_op)
            with tf.control_dependencies([maybe_update_beta]):
                # Counter wraps around modulo `self._n_t`.
                next_iter = tf.mod(iter_ + 1, self._n_t)
                update_iter = iter_.assign(
                    next_iter, use_locking=self._use_locking)
    grouped = update_ops + [update_iter, maybe_update_beta]
    return tf.group(*grouped, name=name_scope)
def __init__(self, *args, **kwargs):
    """Configures the discrete bottleneck function stored on the hparams.

    Wraps `discretization.discrete_bottleneck` in a `functools.partial`
    carrying the bottleneck hyperparameters. For the "dvq" and
    "gumbel-softmax-dvq" bottleneck kinds it also creates the codebook
    (`means`), the optional projection tensors and the optional EMA shadow
    variables, and binds them into the partial.

    Args:
      *args: Forwarded to the parent constructor.
      **kwargs: Forwarded to the parent constructor.

    Raises:
      ValueError: If a DVQ bottleneck is requested with a `reshape_method`
        other than "project" or "slice".
    """
    super(TransformerAE, self).__init__(*args, **kwargs)
    self.predict_mask = 1.0

    # Define bottleneck function
    self._hparams.bottleneck = functools.partial(
        discretization.discrete_bottleneck,
        hidden_size=self._hparams.hidden_size,
        z_size=self._hparams.z_size,
        filter_size=self._hparams.filter_size,
        bottleneck_kind=self._hparams.bottleneck_kind,
        num_blocks=self._hparams.num_blocks,
        num_residuals=self.hparams.num_residuals,
        reshape_method=self._hparams.reshape_method,
        beta=self._hparams.beta,
        ema=self._hparams.ema,
        epsilon=self._hparams.epsilon,
        decay=self._hparams.decay,
        random_top_k=self._hparams.random_top_k,
        soft_em=self.hparams.soft_em,
        num_samples=self.hparams.num_samples,
        softmax_k=self._hparams.softmax_k,
        temperature_warmup_steps=self._hparams.temperature_warmup_steps,
        do_hard_gumbel_softmax=self._hparams.do_hard_gumbel_softmax,
        num_flows=self._hparams.num_flows,
        approximate_gs_entropy=self._hparams.approximate_gs_entropy,
        discrete_mix=self._hparams.d_mix,
        noise_dev=self._hparams.noise_dev,
        startup_steps=self.hparams.startup_steps,
        summary=_DO_SUMMARIES)

    # Set the discretization bottleneck specific things here
    if self._hparams.bottleneck_kind in ["dvq", "gumbel-softmax-dvq"]:
        z_size_per_residual = (
            self._hparams.z_size / self._hparams.num_residuals)
        block_dim = int(
            self._hparams.hidden_size // self._hparams.num_blocks)
        block_v_size = 2**(z_size_per_residual / self._hparams.num_blocks)
        block_v_size = int(block_v_size)

        if self._hparams.reshape_method == "project":
            tf.logging.info("Using projections for DVQ")
            tf.logging.info("Trainable projections = {}".format(
                self._hparams.trainable_projections))

            projection_tensors = tf.get_variable(
                name="projection",
                shape=[
                    self._hparams.num_residuals, self._hparams.num_blocks,
                    self._hparams.hidden_size, block_dim
                ],
                initializer=tf.initializers.glorot_uniform(),
                trainable=self._hparams.trainable_projections)

            self._hparams.bottleneck = functools.partial(
                self._hparams.bottleneck,
                projection_tensors=projection_tensors)
        elif self._hparams.reshape_method == "slice":
            tf.logging.info("Using slices for DVQ")
        else:
            raise ValueError("Unknown reshape method")

        means = tf.get_variable(
            name="means",
            shape=[
                self._hparams.num_residuals, self._hparams.num_blocks,
                block_v_size, block_dim
            ],
            initializer=tf.uniform_unit_scaling_initializer())

        def _ema_means_initializer(residual_index):
            """Returns an initializer pinned to one residual's slice of means.

            Binding the index through a factory avoids the late-binding
            closure pitfall: a lambda defined in the loop would read the
            loop variable at call time, so any deferred invocation would
            see the final index for every variable.
            """

            def _initializer(shape, dtype=None, partition_info=None,
                             verify_shape=None):
                # The arguments are ignored; the value comes from `means`.
                del shape, dtype, partition_info, verify_shape
                return means.initialized_value()[residual_index]

            return _initializer

        # Create the shadow variables if we are using EMA
        ema_count = None
        ema_means = None
        if self._hparams.ema:
            ema_count = []
            for i in range(self._hparams.num_residuals):
                ema_count_i = tf.get_variable(
                    "ema_count_{}".format(i),
                    [self._hparams.num_blocks, block_v_size],
                    initializer=tf.constant_initializer(0),
                    trainable=False)
                ema_count.append(ema_count_i)
            with tf.colocate_with(means):
                ema_means = []
                for i in range(self._hparams.num_residuals):
                    ema_means_i = tf.get_variable(
                        "ema_means_{}".format(i),
                        [self._hparams.num_blocks, block_v_size, block_dim],
                        initializer=_ema_means_initializer(i),
                        trainable=False)
                    ema_means.append(ema_means_i)

        # Update bottleneck
        self._hparams.bottleneck = functools.partial(
            self._hparams.bottleneck,
            means=means,
            ema_count=ema_count,
            ema_means=ema_means)
def _clip_by_global_norm(t_list, clip_norm, use_norm, name=None): """Clips values of multiple tensors by the ratio of the sum of their norms. Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`, this operation returns a list of clipped tensors `list_clipped` and the global norm (`global_norm`) of all tensors in `t_list`. The global norm is expected to be pre-computed and passed as use_norm. To perform the clipping, the values `t_list[i]` are set to: t_list[i] * clip_norm / max(global_norm, clip_norm) where: global_norm = sqrt(sum([l2norm(t)**2 for t in t_list])) If `clip_norm > global_norm` then the entries in `t_list` remain as they are, otherwise they're all shrunk by the global ratio. Any of the entries of `t_list` that are of type `None` are ignored. This is the correct way to perform gradient clipping (for example, see [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063) ([pdf](http://arxiv.org/pdf/1211.5063.pdf))). However, it is slower than `clip_by_norm()` because all the parameters must be ready before the clipping operation can be performed. Args: t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None. clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio. use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global norm to use. If not provided, `global_norm()` is used to compute the norm. name: A name for the operation (optional). Returns: list_clipped: A list of `Tensors` of the same type as `list_t`. global_norm: A 0-D (scalar) `Tensor` representing the global norm. Raises: TypeError: If `t_list` is not a sequence. 
""" if not isinstance(t_list, collections.Sequence) or isinstance( t_list, six.string_types): raise TypeError('t_list should be a sequence') t_list = list(t_list) # Removed as use_norm should always be passed # if use_norm is None: # use_norm = global_norm(t_list, name) with tf.name_scope(name, 'clip_by_global_norm', t_list + [clip_norm]) as name: # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm scale = clip_norm * tf.minimum( 1.0 / use_norm, tf.ones([1], dtype=use_norm.dtype) / clip_norm) values = [ tf.cast( tf.convert_to_tensor( t.values if isinstance(t, tf.IndexedSlices) else t, name='t_%d' % i, ), dtype=tf.float32, ) if t is not None else t for i, t in enumerate(t_list) ] values_clipped = [] for i, v in enumerate(values): if v is None: values_clipped.append(None) else: with tf.colocate_with(v): values_clipped.append( tf.identity(v * scale, name='%s_%d' % (name, i))) list_clipped = [ tf.IndexedSlices(c_v, t.indices, t.dense_shape) if isinstance( t, tf.IndexedSlices) else c_v for (c_v, t) in zip(values_clipped, t_list) ] return list_clipped, use_norm
def _transform_2(image):
    """Resizes an image bicubically and returns it with a default crop box.

    `height` and `width` are read from the enclosing scope.

    Args:
      image: Image tensor passed to `tf.image.resize_bicubic`.

    Returns:
      A tuple `(resized_image, crop_default)` where `crop_default` is the
      constant `[0.0, 0.0, 1.0, 1.0]`.
    """
    with tf.colocate_with(image):
        crop_default = tf.constant([0.0, 0.0, 1.0, 1.0])
        # resize_bicubic works on a batch, so wrap and unwrap a single image.
        resized_batch = tf.image.resize_bicubic([image], [height, width])
        return resized_batch[0], crop_default
def assign_log_moving_mean_exp(log_mean_exp_var, log_value, decay, name=None):
    """Compute the log of the exponentially weighted moving mean of the exp.

    If `log_value` is a draw from a stationary random variable, this function
    approximates `log(E[exp(log_value)])`, i.e., a weighted log-sum-exp. More
    precisely, a `tf.Variable`, `log_mean_exp_var`, is updated by `log_value`
    using the following identity:

    ```none
    log_mean_exp_var =
    = log(decay exp(log_mean_exp_var) + (1 - decay) exp(log_value))
    = log(exp(log_mean_exp_var + log(decay))
          + exp(log_value + log1p(-decay)))
    = log_mean_exp_var
      + log(  exp(log_mean_exp_var - log_mean_exp_var + log(decay))
            + exp(log_value - log_mean_exp_var + log1p(-decay)))
    = log_mean_exp_var
      + log_sum_exp([log(decay),
                     log_value - log_mean_exp_var + log1p(-decay)]).
    ```

    In addition to numerical stability, this formulation is advantageous
    because `log_mean_exp_var` can be updated in a lock-free manner, i.e.,
    using `assign_add`. (Note: the updates are not thread-safe; it's just
    that the update to the tf.Variable is presumed efficient due to being
    lock-free.)

    Args:
      log_mean_exp_var: `float`-like `Variable` representing the log of the
        exponentially weighted moving mean of the exp. Same shape as
        `log_value`.
      log_value: `float`-like `Tensor` representing a new (streaming)
        observation. Same shape as `log_mean_exp_var`.
      decay: A `float`-like `Tensor`. The moving mean decay. Typically close
        to `1.`, e.g., `0.999`.
      name: Optional name of the returned operation.

    Returns:
      log_mean_exp_var: A reference to the input 'Variable' tensor with the
        `log_value`-updated log of the exponentially weighted moving mean of
        exp.

    Raises:
      TypeError: if `log_mean_exp_var` does not have float type `dtype`.
      TypeError: if `log_mean_exp_var`, `log_value`, `decay` have different
        `base_dtype`.
    """
    scope_values = [log_mean_exp_var, log_value, decay]
    with tf1.name_scope(name, "assign_log_moving_mean_exp", scope_values):
        # With x the variable, v the new value and w the decay:
        #   x = log(w exp(x) + (1-w) exp(v))
        #     = x + lse([log(w), v - x + log1p(-w)])
        # so the update can be applied as a single `assign_add`.
        with tf1.colocate_with(log_mean_exp_var):
            var_dtype = log_mean_exp_var.dtype.base_dtype
            if not var_dtype.is_floating:
                raise TypeError(
                    "log_mean_exp_var.base_dtype({}) does not have float type "
                    "`dtype`.".format(var_dtype.name))
            log_value = tf.convert_to_tensor(
                value=log_value, dtype=var_dtype, name="log_value")
            decay = tf.convert_to_tensor(
                value=decay, dtype=var_dtype, name="decay")
            # Leading axis of size one so the two terms can be stacked.
            delta = (log_value - log_mean_exp_var)[tf.newaxis, ...]
            log_decay_term = tf.math.log(decay) * tf.ones_like(delta)
            log_value_term = delta + tf.math.log1p(-decay)
            stacked = tf.concat([log_decay_term, log_value_term], axis=0)
            increment = tf.reduce_logsumexp(input_tensor=stacked, axis=0)
            return log_mean_exp_var.assign_add(increment)
def assign_moving_mean_variance(mean_var, variance_var, value, decay,
                                name=None):
    """Compute exponentially weighted moving {mean,variance} of a stream.

    The `value` updated exponentially weighted moving `mean_var` and
    `variance_var` are given by the following recurrence relations:

    ```python
    variance_var = decay * (variance_var + (1 - decay) * (value - mean_var)**2)
    mean_var = decay * mean_var + (1 - decay) * value
    ```

    Note: `variance_var` uses the lag-1 mean, i.e. the difference
    `value - mean_var` is taken before `mean_var` is updated.

    For derivation justification, see [Finch (2009; Eq. 143)][1].
    Parameterization: Finch's `alpha` is `1 - decay`.

    Args:
      mean_var: `float`-like `Variable` representing the exponentially
        weighted moving mean. Same shape as `variance_var` and `value`.
      variance_var: `float`-like `Variable` representing the exponentially
        weighted moving variance. Same shape as `mean_var` and `value`.
      value: `float`-like `Tensor`. Same shape as `mean_var` and
        `variance_var`.
      decay: A `float`-like `Tensor`. The moving mean decay. Typically close
        to `1.`, e.g., `0.999`.
      name: Optional name of the returned operation.

    Returns:
      mean_var: `Variable` representing the `value`-updated exponentially
        weighted moving mean.
      variance_var: `Variable` representing the `value`-updated exponentially
        weighted moving variance.

    Raises:
      TypeError: if `mean_var` does not have float type `dtype`.
      TypeError: if `mean_var`, `variance_var`, `value`, `decay` have
        different `base_dtype`.

    #### References

    [1]: Tony Finch. Incremental calculation of weighted mean and variance.
         _Technical Report_, 2009.
         http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
    """
    scope_values = [variance_var, mean_var, value, decay]
    with tf1.name_scope(name, "assign_moving_mean_variance", scope_values):
        with tf1.colocate_with(variance_var):
            with tf1.colocate_with(mean_var):
                mean_dtype = mean_var.dtype.base_dtype
                if not mean_dtype.is_floating:
                    raise TypeError(
                        "mean_var.base_dtype({}) does not have float type "
                        "`dtype`.".format(mean_dtype.name))
                variance_dtype = variance_var.dtype.base_dtype
                if mean_dtype != variance_dtype:
                    raise TypeError(
                        "mean_var.base_dtype({}) != variance_var.base_dtype({})"
                        .format(mean_dtype.name, variance_dtype.name))
                value = tf.convert_to_tensor(
                    value=value, dtype=mean_dtype, name="value")
                decay = tf.convert_to_tensor(
                    value=decay, dtype=mean_dtype, name="decay")
                delta = value - mean_var
                # Both assignments wait on `delta`.
                with tf.control_dependencies([delta]):
                    # mean_{t+1} = decay * mean_t + (1 - decay) * value,
                    # written as an increment:
                    #   mean += (1 - decay) * (value - mean_t)
                    mean_increment = (1. - decay) * delta
                    mean_var = mean_var.assign_add(mean_increment)
                    # variance_{t+1} = decay * (variance_t
                    #     + (1 - decay) * (value - mean_t)**2),
                    # written as a decrement:
                    #   variance -= (1 - decay)
                    #       * (variance_t - decay * (value - mean_t)**2)
                    complement = 1. - decay
                    scaled_square = decay * tf.square(delta)
                    variance_decrement = complement * (
                        variance_var - scaled_square)
                    variance_var = variance_var.assign_sub(variance_decrement)
                return mean_var, variance_var