def moving_average_update(variable, value, momentum):
    try:
        return moving_averages.assign_moving_average(
            variable, value, momentum, zero_debias=False)
    except TypeError:
        return moving_averages.assign_moving_average(
            variable, value, momentum)
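# Usage sketch (added, illustrative; assumes `tf` and `moving_averages` are
# imported as in the snippets below). With zero_debias=False the update rule is
# simply: var <- momentum * var + (1 - momentum) * value.
shadow = tf.Variable(0.0, trainable=False)
ema_update = moving_average_update(shadow, tf.constant(1.0), momentum=0.9)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(ema_update)
    print(sess.run(shadow))  # 0.1 == 0.9 * 0.0 + 0.1 * 1.0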
def __call__(self, input_layer, epsilon=1e-5, decay=0.9, name="batch_norm",
             in_dim=None, phase=Phase.train):
    shape = input_layer.shape
    shp = in_dim or shape[-1]
    with tf.variable_scope(name) as scope:
        self.mean = self.variable('mean', [shp],
                                  init=tf.constant_initializer(0.), train=False)
        self.variance = self.variable('variance', [shp],
                                      init=tf.constant_initializer(1.0), train=False)
        self.gamma = self.variable("gamma", [shp],
                                   init=tf.random_normal_initializer(1., 0.02))
        self.beta = self.variable("beta", [shp], init=tf.constant_initializer(0.))
        if phase == Phase.train:
            mean, variance = tf.nn.moments(input_layer.tensor, [0, 1, 2])
            mean.set_shape((shp,))
            variance.set_shape((shp,))
            update_moving_mean = moving_averages.assign_moving_average(
                self.mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                self.variance, variance, decay)
            with tf.control_dependencies([update_moving_mean,
                                          update_moving_variance]):
                normalized_x = tf.nn.batch_norm_with_global_normalization(
                    input_layer.tensor, mean, variance, self.beta, self.gamma,
                    epsilon, scale_after_normalization=True)
        else:
            normalized_x = tf.nn.batch_norm_with_global_normalization(
                input_layer.tensor, self.mean, self.variance, self.beta,
                self.gamma, epsilon, scale_after_normalization=True)
        return input_layer.with_tensor(normalized_x, parameters=self.vars)
def batchnorm(inputs, scope, epsilon=1e-05, momentum=0.99, is_training=True):
    inputs_shape = inputs.get_shape().as_list()  # shape of the inputs
    params_shape = inputs_shape[-1:]  # size of the last (channel) dimension
    axis = list(range(len(inputs_shape) - 1))
    with tf.variable_scope(scope):
        beta = create_variable("beta", params_shape,
                               initializer=tf.zeros_initializer())
        gamma = create_variable("gamma", params_shape,
                                initializer=tf.ones_initializer())
        # moving mean, not trainable, used for inference
        moving_mean = create_variable("moving_mean", params_shape,
                                      initializer=tf.zeros_initializer(),
                                      trainable=False)
        # moving variance, not trainable
        moving_variance = create_variable("moving_variance", params_shape,
                                          initializer=tf.ones_initializer(),
                                          trainable=False)
    if is_training:
        mean, variance = tf.nn.moments(inputs, axes=axis)  # batch mean and variance
        # exponential moving averages that blend in the previous value:
        # x_t = a * x_{t-1} + (1 - a) * x_now
        update_move_mean = moving_averages.assign_moving_average(
            moving_mean, mean, decay=momentum)
        update_move_variance = moving_averages.assign_moving_average(
            moving_variance, variance, decay=momentum)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_mean)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_variance)
    else:
        mean, variance = moving_mean, moving_variance
    return tf.nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon)
def bn(x, c):
    x_shape = x.get_shape()
    params_shape = x_shape[-1:]

    if c["use_bias"]:
        bias = _get_variable("bias", params_shape, initializer=tf.zeros_initializer)
        return x + bias

    axis = list(range(len(x_shape) - 1))

    beta = _get_variable("beta", params_shape, initializer=tf.zeros_initializer)
    gamma = _get_variable("gamma", params_shape, initializer=tf.ones_initializer)

    moving_mean = _get_variable("moving_mean", params_shape,
                                initializer=tf.zeros_initializer, trainable=False)
    moving_variance = _get_variable("moving_variance", params_shape,
                                    initializer=tf.ones_initializer, trainable=False)

    # These ops will only be performed when training.
    mean, variance = tf.nn.moments(x, axis)
    update_moving_mean = moving_averages.assign_moving_average(
        moving_mean, mean, BN_DECAY)
    update_moving_variance = moving_averages.assign_moving_average(
        moving_variance, variance, BN_DECAY)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance)

    mean, variance = control_flow_ops.cond(
        c["is_training"], lambda: (mean, variance),
        lambda: (moving_mean, moving_variance))

    x = tf.nn.batch_normalization(x, mean, variance, beta, gamma, BN_EPSILON)
    # x.set_shape(inputs.get_shape()) ??

    return x
def _delay_updates():
    """Internal function that delays updates of moving_vars if is_training."""
    update_moving_mean = moving_averages.assign_moving_average(
        moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
    update_moving_variance = moving_averages.assign_moving_average(
        moving_variance, variance, decay, zero_debias=False)
    return update_moving_mean, update_moving_variance
def batch_norm(x, decay=0.999, epsilon=1e-03, is_training=True, scope="scope"):
    x_shape = x.get_shape()
    num_inputs = x_shape[-1]
    reduce_dims = list(range(len(x_shape) - 1))
    with tf.variable_scope(scope):
        beta = create_var("beta", [num_inputs, ],
                          initializer=tf.zeros_initializer())
        gamma = create_var("gamma", [num_inputs, ],
                           initializer=tf.ones_initializer())
        # for inference
        moving_mean = create_var("moving_mean", [num_inputs, ],
                                 initializer=tf.zeros_initializer(),
                                 trainable=False)
        moving_variance = create_var("moving_variance", [num_inputs],
                                     initializer=tf.ones_initializer(),
                                     trainable=False)
    if is_training:
        mean, variance = tf.nn.moments(x, axes=reduce_dims)
        update_move_mean = moving_averages.assign_moving_average(
            moving_mean, mean, decay=decay)
        update_move_variance = moving_averages.assign_moving_average(
            moving_variance, variance, decay=decay)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_mean)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_variance)
    else:
        mean, variance = moving_mean, moving_variance
    return tf.nn.batch_normalization(x, mean, variance, beta, gamma, epsilon)
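# Usage sketch (added, illustrative): because the snippet above registers its
# EMA updates in tf.GraphKeys.UPDATE_OPS, the training step should depend on
# that collection so the moving statistics actually get refreshed. `loss` is an
# assumed, pre-built scalar tensor.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)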
def mean_var_with_update():
    mean, variance = tf.nn.moments(x, list(range(len(x.shape) - 1)), name='moments')
    with tf.control_dependencies([
            assign_moving_average(moving_mean, mean, decay),
            assign_moving_average(moving_var, variance, decay)]):
        return tf.identity(mean), tf.identity(variance)
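# Typical pairing (added, assumed): use batch statistics while training and the
# stored moving statistics otherwise. `is_training`, `beta`, `gamma`, and `eps`
# are assumed to be defined by the surrounding layer.
mean, variance = tf.cond(is_training,
                         mean_var_with_update,
                         lambda: (moving_mean, moving_var))
x_normed = tf.nn.batch_normalization(x, mean, variance, beta, gamma, eps)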
def train_phase():
    mean, variance = tf.nn.moments(inputs, axis)
    update_moving_mean = moving_averages.assign_moving_average(
        moving_mean, mean, decay)
    update_moving_variance = moving_averages.assign_moving_average(
        moving_variance, variance, decay)
    with tf.control_dependencies([update_moving_mean, update_moving_variance]):
        return tf.identity(mean), tf.identity(variance)
def _force_updates():
    """Internal function that forces updates of moving_vars if is_training."""
    update_moving_mean = moving_averages.assign_moving_average(
        moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
    update_moving_variance = moving_averages.assign_moving_average(
        moving_variance, variance, decay, zero_debias=False)
    with ops.control_dependencies([update_moving_mean, update_moving_variance]):
        return array_ops.identity(mean), array_ops.identity(variance)
def __init__(self, value, decay, weight, truediv=True, collections=None, name=None):
    """Compute the weighted moving average of `value`.

    Conceptually, the weighted moving average is:
      `moving_average(value * weight) / moving_average(weight)`,
    where a moving average updates by the rule
      `new_value = decay * old_value + (1 - decay) * update`.
    Internally, this Op keeps moving average variables of both `value * weight`
    and `weight`.

    Args:
      value: A numeric `Tensor`.
      decay: A float `Tensor` or float value. The moving average decay.
      weight: `Tensor` that keeps the current value of a weight. Shape should be
        able to multiply `value`.
      truediv: Boolean, if `True`, dividing by `moving_average(weight)` is
        floating point division. If `False`, use division implied by dtypes.
      collections: List of graph collections keys to add the internal variables
        `value * weight` and `weight` to. Defaults to `[GraphKeys.VARIABLES]`.
      name: Optional name of the returned operation. Defaults to
        "WeightedMovingAvg".

    Returns:
      An Operation that updates and returns the weighted moving average.
    """
    # Unlike assign_moving_average, the weighted moving average doesn't modify
    # user-visible variables. It is the ratio of two internal variables, which
    # are moving averages of the updates. Thus, the signature of this function
    # is quite different from assign_moving_average.
    if collections is None:
        collections = [ops.GraphKeys.VARIABLES]
    with variable_scope.variable_op_scope(
            [value, weight, decay], name, "WeightedMovingAvg") as scope:
        value_x_weight_var = variable_scope.get_variable(
            "value_x_weight",
            initializer=init_ops.zeros_initializer(value.get_shape(),
                                                   dtype=value.dtype),
            trainable=False,
            collections=collections)
        weight_var = variable_scope.get_variable(
            "weight",
            initializer=init_ops.zeros_initializer(weight.get_shape(),
                                                   dtype=weight.dtype),
            trainable=False,
            collections=collections)
        numerator = assign_moving_average(value_x_weight_var, value * weight, decay)
        denominator = assign_moving_average(weight_var, weight, decay)

        if truediv:
            div = math_ops.truediv
        else:
            div = math_ops.div

        self.average_with_update = div(numerator, denominator + 1e-8,
                                       name=scope.name)
        self.average = div(value_x_weight_var, weight_var)
def update_mean_var():
    mean, variance = tf.nn.moments(x=incoming, axes=axis)
    update_moving_mean = moving_averages.assign_moving_average(
        variable=moving_mean, value=mean, decay=self.decay, zero_debias=False)
    update_moving_variance = moving_averages.assign_moving_average(
        variable=moving_variance, value=variance, decay=self.decay,
        zero_debias=False)
    with tf.control_dependencies([update_moving_mean, update_moving_variance]):
        return tf.identity(mean), tf.identity(variance)
def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Batch normalization on `input_layer` without tf.layers."""
    shape = input_layer.shape
    num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
    beta = self.get_variable(
        'beta', [num_channels], tf.float32, tf.float32,
        initializer=tf.zeros_initializer())
    if use_scale:
        gamma = self.get_variable(
            'gamma', [num_channels], tf.float32, tf.float32,
            initializer=tf.ones_initializer())
    else:
        gamma = tf.constant(1.0, tf.float32, [num_channels])
    moving_mean = tf.get_variable(
        'moving_mean', [num_channels], tf.float32,
        initializer=tf.zeros_initializer(), trainable=False)
    moving_variance = tf.get_variable(
        'moving_variance', [num_channels], tf.float32,
        initializer=tf.ones_initializer(), trainable=False)
    if self.phase_train:
        bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
            input_layer, gamma, beta, epsilon=epsilon,
            data_format=self.data_format, is_training=True)
        mean_update = moving_averages.assign_moving_average(
            moving_mean, batch_mean, decay=decay, zero_debias=False)
        variance_update = moving_averages.assign_moving_average(
            moving_variance, batch_variance, decay=decay, zero_debias=False)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
    else:
        bn, _, _ = tf.nn.fused_batch_norm(
            input_layer, gamma, beta, mean=moving_mean,
            variance=moving_variance, epsilon=epsilon,
            data_format=self.data_format, is_training=False)
    return bn
def _update_mean_var():
    """Internal function that updates mean and variance during training."""
    axis = [0, 1, 2] if convnet else [0]
    mean, var = nn.moments(tensor_in, axis)
    update_moving_mean = moving_averages.assign_moving_average(
        moving_mean, mean, decay)
    update_moving_var = moving_averages.assign_moving_average(
        moving_var, var, decay)
    with ops.control_dependencies([update_moving_mean, update_moving_var]):
        return array_ops_.identity(mean), array_ops_.identity(var)
def _update_renorm_variable(var, weight, value):
    """Updates a moving average and weight, returns the unbiased value."""
    # Update the variables without zero debiasing. The debiasing will be
    # accomplished by dividing the exponential moving average by the weight.
    # For example, after a single update, the moving average would be
    # (1-decay) * value, and the weight will be 1-decay, with their ratio
    # giving value.
    new_var = moving_averages.assign_moving_average(
        var, value, decay, zero_debias=False)
    new_weight = moving_averages.assign_moving_average(
        weight, 1., decay, zero_debias=False)
    return new_var / new_weight
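# Worked example of the debiasing-by-weight trick above (added; plain Python
# arithmetic, not part of the snippet). Starting from var = 0 and weight = 0,
# a single update with decay = 0.99 and value = 5.0 gives:
decay, value = 0.99, 5.0
var = decay * 0.0 + (1 - decay) * value    # 0.05
weight = decay * 0.0 + (1 - decay) * 1.0   # 0.01
print(var / weight)                        # 5.0 -- the ratio recovers the value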
def _batch_norm(self, name, x):
    with tf.variable_scope(name):
        # number of input channels
        params_shape = [x.get_shape()[-1]]
        # offset
        beta = tf.get_variable('beta', params_shape, tf.float32,
                               initializer=tf.constant_initializer(0.0, tf.float32))
        # scale
        gamma = tf.get_variable('gamma', params_shape, tf.float32,
                                initializer=tf.constant_initializer(1.0, tf.float32))

        if self.mode == 'train':
            # per-channel batch mean and variance
            mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')
            # create the moving mean/variance used at test time
            moving_mean = tf.get_variable(
                'moving_mean', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32),
                trainable=False)
            moving_variance = tf.get_variable(
                'moving_variance', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32),
                trainable=False)
            # queue update ops for the moving statistics (exponential moving average):
            # moving_mean = moving_mean * decay + mean * (1 - decay)
            # moving_variance = moving_variance * decay + variance * (1 - decay)
            self._extra_train_ops.append(moving_averages.assign_moving_average(
                moving_mean, mean, 0.9))
            self._extra_train_ops.append(moving_averages.assign_moving_average(
                moving_variance, variance, 0.9))
        else:
            # use the mean/variance accumulated during training
            mean = tf.get_variable(
                'moving_mean', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32),
                trainable=False)
            variance = tf.get_variable(
                'moving_variance', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32),
                trainable=False)
            # add histogram summaries
            tf.summary.histogram(mean.op.name, mean)
            tf.summary.histogram(variance.op.name, variance)
        # BN layer: ((x - mean) / std) * gamma + beta
        y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
        y.set_shape(x.get_shape())
        return y
def testAssignMovingAverageNewNamingMultipleCalls(self):
    with variable_scope.variable_scope("scope1") as vs1:
        with variable_scope.variable_scope("scope2"):
            var = variables.Variable(1.0, name="Var")
            moving_averages.assign_moving_average(var, 0.0, 0.99)
            moving_averages.assign_moving_average(var, 0.0, 0.99)
    expected_names = ["scope1/scope2/Var:0",
                      "scope1/scope2/scope1/scope2/Var/biased:0",
                      "scope1/scope2/scope1/scope2/Var/local_step:0",
                      "scope1/scope2/scope1/scope2/Var/biased_1:0",
                      "scope1/scope2/scope1/scope2/Var/local_step_1:0"]
    actual_names = [v.name for v in vs1.global_variables()]
    self.assertSetEqual(set(expected_names), set(actual_names))
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
    # TODO is there a way to use zero_debias in multi-GPU?
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False, name='var_ema_op')
    add_model_variable(moving_mean)
    add_model_variable(moving_var)

    # seems faster than delayed update, but might behave otherwise in
    # distributed settings.
    with tf.control_dependencies([update_op1, update_op2]):
        return tf.identity(xn, name='output')
def _do_update():
    # Update the variables without zero debiasing. The debiasing will be
    # accomplished by dividing the exponential moving average by the weight.
    # For example, after a single update, the moving average would be
    # (1-decay) * value, and the weight will be 1-decay, with their ratio
    # giving the value.
    # Make sure the weight is not updated until before r and d computation.
    with ops.control_dependencies([value]):
        weight_value = array_ops.constant(1., dtype=weight.dtype)
    new_var = moving_averages.assign_moving_average(
        var, value, self.renorm_momentum, zero_debias=False)
    new_weight = moving_averages.assign_moving_average(
        weight, weight_value, self.renorm_momentum, zero_debias=False)
    return new_var / new_weight
def __init__(self, value, decay, truediv=True, collections=None,
             reduction_indices=None, name=None):
    self.value = value
    self.reduction_indices = reduction_indices or [0]
    eps = 1e-8
    if truediv:
        div = math_ops.truediv
    else:
        div = math_ops.div
    if collections is None:
        collections = [ops.GraphKeys.VARIABLES]

    value_shape = value.get_shape().as_list()
    shape = []
    for dim in range(len(value_shape)):
        if dim in self.reduction_indices:
            shape.append(1)
        else:
            shape.append(value_shape[dim])

    with variable_scope.variable_op_scope(
            [value, decay], name, "MomentTracker") as scope:

        mean_x_weight_var = variable_scope.get_variable(
            "mean_x_weight", trainable=False, collections=collections,
            initializer=init_ops.zeros_initializer(shape, dtype=value.dtype))

        variance_x_weight_var = variable_scope.get_variable(
            "variance_x_weight", trainable=False, collections=collections,
            initializer=init_ops.zeros_initializer(shape, dtype=value.dtype))

        weight_var = variable_scope.get_variable(
            "weight", trainable=False, collections=collections,
            initializer=init_ops.zeros_initializer([1], dtype=tf.float32))

        self.tracked_mean = div(mean_x_weight_var, weight_var + eps)
        self.tracked_variance = div(variance_x_weight_var, weight_var + eps)

        self.batch_mean, self.batch_variance = tf.nn.moments(
            self.value, axes=self.reduction_indices,
            shift=self.tracked_mean, keep_dims=True)

        mean_numerator = assign_moving_average(
            mean_x_weight_var, self.batch_mean, decay)
        variance_numerator = assign_moving_average(
            variance_x_weight_var, self.batch_variance, decay)
        denominator = assign_moving_average(weight_var, 1.0, decay)

        self.update_mean = div(mean_numerator, denominator + eps,
                               name=scope.name)
        self.update_variance = div(variance_numerator, denominator + eps,
                                   name=scope.name)
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay,
                  internal_update):
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False, name='var_ema_op')

    if internal_update:
        with tf.control_dependencies([update_op1, update_op2]):
            return tf.identity(xn, name='output')
    else:
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
        return tf.identity(xn, name='output')
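# Usage note (added, illustrative): with internal_update=True the EMA updates
# run as control dependencies of the returned tensor itself; with
# internal_update=False the caller must run tf.GraphKeys.UPDATE_OPS alongside
# its own train op. `train_step` is an assumed, pre-built optimizer op.
out = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var,
                    decay=0.9, internal_update=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
train_op = tf.group(train_step, *update_ops)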
def forward(self, inputs):
    mean, var = tf.nn.moments(inputs, self.axes)
    if self.is_train:
        # update moving_mean and moving_var
        self.moving_mean = moving_averages.assign_moving_average(
            self.moving_mean, mean, self.decay, zero_debias=False)
        self.moving_var = moving_averages.assign_moving_average(
            self.moving_var, var, self.decay, zero_debias=False)
        outputs = batch_normalization(inputs, mean, var, self.beta, self.gamma,
                                      self.epsilon, self.data_format)
    else:
        outputs = batch_normalization(inputs, self.moving_mean, self.moving_var,
                                      self.beta, self.gamma, self.epsilon,
                                      self.data_format)
    if self.act:
        outputs = self.act(outputs)
    return outputs
def replica_fn():
    var = variables.Variable([0.0, 0.0])
    val = constant_op.constant([1.0 + replica_id[0], 2.0 - replica_id[0]])
    replica_id[0] += 1
    decay = 0.25
    assign = moving_averages.assign_moving_average(var, val, decay)
    return var, assign.op
def testCrossDeviceWithoutZeroDebias(self, distribution):
    with distribution.scope(), self.cached_session() as sess:
        var = variables.Variable([10.0, 11.0])
        val = constant_op.constant([1.0, 2.0])
        decay = 0.25
        # NOTE(josh11b): We currently generate an error if val is a PerReplica
        # value.
        assign = moving_averages.assign_moving_average(
            var, val, decay, zero_debias=False)

        variables.global_variables_initializer().run()
        self.assertAllClose([10.0, 11.0], var.eval())
        sess.run(assign)
        average_val = [1.0, 2.0]
        val_weight = 1.0 - 0.25
        self.assertAllClose(
            [10.0 * 0.25 + average_val[0] * val_weight,
             11.0 * 0.25 + average_val[1] * val_weight], var.eval())

        # Also try assign.op.
        sess.run(assign.op)
        orig_weight = 0.25 * 0.25
        val_weight = 1.0 - orig_weight
        self.assertAllClose(
            [10.0 * orig_weight + average_val[0] * val_weight,
             11.0 * orig_weight + average_val[1] * val_weight], var.eval())
def weighted_resample(inputs, weights, overall_rate, scope=None,
                      mean_decay=0.999, warmup=10, seed=None):
    """Performs an approximate weighted resampling of `inputs`.

    This method chooses elements from `inputs` where each item's rate of
    selection is proportional to its value in `weights`, and the average
    rate of selection across all inputs (and many invocations!) is
    `overall_rate`.

    Args:
      inputs: A list of tensors whose first dimension is `batch_size`.
      weights: A `[batch_size]`-shaped tensor with each batch member's weight.
      overall_rate: Desired overall rate of resampling.
      scope: Scope to use for the op.
      mean_decay: How quickly to decay the running estimate of the mean weight.
      warmup: Until the resulting tensor has been evaluated `warmup` times, the
        resampling method uses the true mean over all calls as its weight
        estimate, rather than a decayed mean.
      seed: Random seed.

    Returns:
      A list of tensors exactly like `inputs`, but with an unknown (and
        possibly zero) first dimension.
      A tensor containing the effective resampling rate used for each output.
    """
    # Algorithm: Just compute rates as weights/mean_weight *
    # overall_rate. This way the average weight corresponds to the
    # overall rate, and a weight twice the average has twice the rate,
    # etc.
    with ops.name_scope(scope, 'weighted_resample', inputs) as opscope:
        # First: Maintain a running estimated mean weight, with decay
        # adjusted (by also maintaining an invocation count) during the
        # warmup period so that at the beginning, there aren't too many
        # zeros mixed in, throwing the average off.
        with variable_scope.variable_scope(scope, 'estimate_mean', inputs):
            count_so_far = variable_scope.get_local_variable(
                'resample_count', initializer=0)
            estimated_mean = variable_scope.get_local_variable(
                'estimated_mean', initializer=0.0)

            count = count_so_far.assign_add(1)
            real_decay = math_ops.minimum(
                math_ops.truediv((count - 1), math_ops.minimum(count, warmup)),
                mean_decay)

            batch_mean = math_ops.reduce_mean(weights)
            mean = moving_averages.assign_moving_average(
                estimated_mean, batch_mean, real_decay, zero_debias=False)

        # Then, normalize the weights into rates using the mean weight and
        # overall target rate:
        rates = weights * overall_rate / mean

        results = resample_at_rate([rates] + inputs, rates,
                                   scope=opscope, seed=seed, back_prop=False)

        return (results[1:], results[0])
def _batch_norm(self, name, x):
    """Batch normalization."""
    with tf.variable_scope(name):
        params_shape = [x.get_shape()[-1]]

        beta = tf.get_variable(
            'beta', params_shape, tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32))
        gamma = tf.get_variable(
            'gamma', params_shape, tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32))

        if self.mode == 'train':
            mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

            moving_mean = tf.get_variable(
                'moving_mean', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32),
                trainable=False)
            moving_variance = tf.get_variable(
                'moving_variance', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32),
                trainable=False)

            self._extra_train_ops.append(
                moving_averages.assign_moving_average(moving_mean, mean, 0.9))
            self._extra_train_ops.append(
                moving_averages.assign_moving_average(moving_variance, variance, 0.9))
        else:
            mean = tf.get_variable(
                'moving_mean', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32),
                trainable=False)
            variance = tf.get_variable(
                'moving_variance', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32),
                trainable=False)
            tf.summary.histogram(mean.op.name, mean)
            tf.summary.histogram(variance.op.name, variance)

        # epsilon used to be 1e-5. Maybe 0.001 solves NaN problem in deeper
        # net.
        y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
        y.set_shape(x.get_shape())
        return y
def testAssignMovingAverageNewNamingMultipleCallsWithReuse(self):
    with variable_scope.variable_scope("scope1") as vs1:
        var = variable_scope.get_variable("Var", shape=[])
        moving_averages.assign_moving_average(var, 0.0, 0.99)
        moving_averages.assign_moving_average(var, 0.0, 0.99)
    with variable_scope.variable_scope(vs1, reuse=True):
        var = variable_scope.get_variable("Var", shape=[])
        moving_averages.assign_moving_average(var, 0.0, 0.99)
        moving_averages.assign_moving_average(var, 0.0, 0.99)
def _fused_batch_norm(self, inputs, training):
    """Returns the output of fused batch norm."""
    beta = self.beta if self.center else self._beta_const
    gamma = self.gamma if self.scale else self._gamma_const

    def _fused_batch_norm_training():
        return nn.fused_batch_norm(
            inputs, gamma, beta, epsilon=self.epsilon,
            data_format=self._data_format)

    def _fused_batch_norm_inference():
        return nn.fused_batch_norm(
            inputs, gamma, beta, mean=self.moving_mean,
            variance=self.moving_variance, epsilon=self.epsilon,
            is_training=False, data_format=self._data_format)

    output, mean, variance = utils.smart_cond(
        training, _fused_batch_norm_training, _fused_batch_norm_inference)
    if not self._bessels_correction_test_only:
        # Remove Bessel's correction to be consistent with non-fused batch norm.
        # Note that the variance computed by fused batch norm is
        # with Bessel's correction.
        sample_size = math_ops.cast(
            array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
        factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
        variance *= factor

    training_value = utils.constant_value(training)
    if training_value is not False:
        decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
        mean_update = moving_averages.assign_moving_average(
            self.moving_mean, mean, decay, zero_debias=False)
        variance_update = moving_averages.assign_moving_average(
            self.moving_variance, variance, decay, zero_debias=False)
        self.add_update(mean_update, inputs=inputs)
        self.add_update(variance_update, inputs=inputs)

    return output
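# Worked note (added; plain Python arithmetic, not part of the snippet): fused
# batch norm reports the Bessel-corrected variance (n - 1 denominator), so the
# snippet multiplies by (n - 1) / n to recover the population variance that
# tf.nn.moments would report.
sample_size = 4.0
bessel_var = 10.0                                                # n - 1 denominator
population_var = bessel_var * (sample_size - 1.0) / sample_size  # 7.5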
def replica_fn():
    var = variables.Variable([10.0, 11.0])
    val = constant_op.constant([1.0 + replica_id[0], 2.0 - replica_id[0]])
    replica_id[0] += 1
    decay = 0.25
    assign = moving_averages.assign_moving_average(
        var, val, decay, zero_debias=False)
    return var, assign
def replica_fn():
    var = variables.Variable([10.0, 11.0])
    # Here we expect to check the case when the input value is a variable.
    val = variables.Variable([1., 2.])
    decay = 0.25
    assign = moving_averages.assign_moving_average(
        var, val, decay, zero_debias=False)
    return var, assign
def moving_average(name, value, decay):
    moving_average_variable = vs.get_variable(
        name, shape=value.get_shape(), dtype=value.dtype,
        initializer=init_ops.zeros_initializer(), trainable=False)
    return moving_averages.assign_moving_average(
        moving_average_variable, value, decay, zero_debias=False)
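# Minimal usage sketch (added; names are illustrative): track an EMA of a scalar
# loss and have the training step refresh it. `loss` and `train_step` are
# assumed to exist already; note the helper above relies on the `vs` and
# `init_ops` module aliases from its own file.
loss_ema = moving_average('loss_ema', loss, decay=0.99)
with tf.control_dependencies([loss_ema]):
    train_op = tf.group(train_step)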
def MovingAvgQuantize(inputs, per_channel=False, init_min=-6.0, init_max=6.0, ema_decay=0.999, vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES, name_prefix='MovingAvgQuantize', reuse=None, is_training=True, num_bits=8, narrow_range=False, symmetric=False): """Adds a layer that collects quantization ranges as EMAs of input ranges. MovingAvgQuantize creates variables called 'min' and 'max', representing the interval used for quantization and clamping. Args: inputs: a tensor containing values to be quantized. per_channel: (default False) a boolean specifying whether to use different quantization ranges per output channel. init_min: a float scalar, the initial value for variable min. init_max: a float scalar, the initial value for variable max. ema_decay: EMA decay parameter. vars_collection: (Optional) collection where to store variables for quantization interval ends. name_prefix: name_prefix for created nodes. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. is_training: Whether the op is applied to a training or eval graph. num_bits: Number of bits to use for quantization, must be between 2 and 8. narrow_range: Whether to use the narrow quantization range [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1]. symmetric: If true, use symmetric quantization limits instead of training the minimum and maximum of each quantization range separately. Returns: a tensor containing quantized values. """ with variable_scope.variable_scope(None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope: scope.set_partitioner(None) input_shape = inputs.get_shape() if per_channel: input_dim = len(input_shape) # Only support quantizing 1-, 2- and 4-dimensional tensors. assert input_dim in [1, 2, 4 ], ('Expected 1D, 2D or 4D input, was: %s in ' ' scope: %s' % (input_shape, name_prefix)) min_max_shape = [input_shape[-1]] else: min_max_shape = [] vars_collections = [vars_collection] if vars_collection else [] min_var = _ModelVariable( 'min', shape=min_max_shape, initializer=init_ops.constant_initializer(init_min), collections=vars_collections, trainable=False) max_var = _ModelVariable( 'max', shape=min_max_shape, initializer=init_ops.constant_initializer(init_max), collections=vars_collections, trainable=False) if not is_training: return _FakeQuantWithMinMaxVars(inputs, min_var, max_var, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range) if per_channel: if input_dim == 2: reduce_dims = [0] elif input_dim == 4: reduce_dims = [0, 1, 2] if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min(inputs, axis=reduce_dims, name='BatchMin') else: batch_min = inputs else: batch_min = math_ops.reduce_min(inputs, name='BatchMin') if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max(inputs, axis=reduce_dims, name='BatchMax') else: batch_max = inputs else: batch_max = math_ops.reduce_max(inputs, name='BatchMax') if symmetric: if narrow_range: min_max_ratio = -1 else: # In two's complement notation, the negative range is slightly larger # than the positive range. min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits) # TFLite requires that 0.0 is always in the [min; max] range. Because # batch_min <= batch_max, it follows that range_min <= 0 <= range_max. range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio) range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio) else: # TFLite requires that 0.0 is always in the [min; max] range. 
range_min = math_ops.minimum(batch_min, 0.0) range_max = math_ops.maximum(batch_max, 0.0) assign_min = moving_averages.assign_moving_average(min_var, range_min, ema_decay, name='AssignMinEma') assign_max = moving_averages.assign_moving_average(max_var, range_max, ema_decay, name='AssignMaxEma') return _FakeQuantWithMinMaxVars(inputs, assign_min, assign_max, per_channel=per_channel, num_bits=num_bits, narrow_range=narrow_range)
def add_moving_summary(*args, **kwargs): """ Enable moving average summary for some tensors. It's only effective in the main training tower, otherwise calling this function is a no-op. Args: args: tensors to summary decay (float): the decay rate. Defaults to 0.95. collection (str or None): the name of the collection to add EMA-maintaining ops. The default will work together with the default :class:`MovingAverageSummary` callback. Returns: [tf.Tensor]: list of tensors returned by assign_moving_average, which can be used to maintain the EMA. """ decay = kwargs.pop('decay', 0.95) coll = kwargs.pop('collection', MOVING_SUMMARY_OPS_KEY) assert len(kwargs) == 0, "Unknown arguments: " + str(kwargs) ctx = get_current_tower_context() # allow ctx to be none if ctx is not None and not ctx.is_main_training_tower: return if not isinstance(args[0], list): v = args else: log_deprecated( "Call add_moving_summary with positional args instead of a list!") v = args[0] for x in v: assert isinstance(x, tf.Tensor), x assert x.get_shape().ndims == 0, x.get_shape() G = tf.get_default_graph() # TODO variable not saved under distributed ema_ops = [] for c in v: name = re.sub('tower[0-9]+/', '', c.op.name) with G.colocate_with(c), tf.name_scope(None): # assign_moving_average creates variables with op names, therefore clear ns first. with _enter_vs_reuse_ns('EMA') as vs: ema_var = tf.get_variable( name, shape=c.shape, dtype=c.dtype, initializer=tf.constant_initializer(), trainable=False) ns = vs.original_name_scope with tf.name_scope(ns): # reuse VS&NS so that EMA_1 won't appear ema_op = moving_averages.assign_moving_average( ema_var, c, decay, zero_debias=True, name=name + '_EMA_apply') tf.summary.scalar(name + '-summary', ema_op) # write the EMA value as a summary ema_ops.append(ema_op) if coll is not None: for op in ema_ops: # TODO a new collection to summary every step? tf.add_to_collection(coll, op) return ema_ops
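# Usage sketch (added, illustrative; tensorpack-style): inside the tower
# function, scalar tensors such as `total_cost` and `accuracy` (assumed names)
# are summarized as EMAs.
add_moving_summary(total_cost, accuracy, decay=0.95)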
def official_batch_norm(inputs, channels, type=False, decay=0.999, center=True, scale=False, epsilon=0.001, activation_fn=None, updates_collections=ops.GraphKeys.UPDATE_OPS, is_training=True, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """ Args: inputs: a tensor of size `[batch_size, height, width, channels]` or `[batch_size, channels]`. type: False is non-convolution batch norm,True is convolution batch norm. decay: decay for the moving average. center: If True, subtract `beta`. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. epsilon: small float added to variance to avoid dividing by zero. activation_fn: Optional activation function. updates_collections: collections to collect the update ops for computation. is_training: whether or not the layer is in training mode. reuse: whether or not the layer and its variables should be reused. variables_collections: optional collections for the variables. outputs_collections: collections to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_op_scope`. Returns: a tensor representing the output of the operation. """ with variable_scope.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse) as sc: dtype = inputs.dtype.base_dtype axis = [0, 1, 2] if type else [0] params_shape = [channels] # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None # param_initializers = {} if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') # beta_initializer = param_initializers.get('beta',init_ops.zeros_initializer) beta = variables.model_variable( 'beta', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer, collections=beta_collections, trainable=trainable) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') # gamma_initializer = param_initializers.get('gamma',init_ops.ones_initializer()) gamma = variables.model_variable('gamma', shape=params_shape, dtype=dtype, initializer=tf.ones_initializer(), collections=gamma_collections, trainable=trainable) # Create moving_mean and moving_variance variables and add them to the # appropiate collections. moving_mean_collections = utils.get_variable_collections( variables_collections, 'moving_mean') # moving_mean_initializer = param_initializers.get('moving_mean', init_ops.zeros_initializer) moving_mean = variables.model_variable( 'moving_mean', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer, trainable=False, collections=moving_mean_collections) moving_variance_collections = utils.get_variable_collections( variables_collections, 'moving_variance') # moving_variance_initializer = param_initializers.get('moving_variance', init_ops.ones_initializer()) moving_variance = variables.model_variable( 'moving_variance', shape=params_shape, dtype=dtype, initializer=tf.ones_initializer(), trainable=False, collections=moving_variance_collections) if is_training: # Calculate the moments based on the individual batch. mean, variance = nn.moments(inputs, axis, shift=moving_mean) # Update the moving_mean and moving_variance moments. 
update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay) update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay) if updates_collections is None: # Make sure the updates are computed here. with ops.control_dependencies( [update_moving_mean, update_moving_variance]): outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) else: # Collect the updates to be computed later. ops.add_to_collections(updates_collections, update_moving_mean) ops.add_to_collections(updates_collections, update_moving_variance) outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) else: outputs = nn.batch_normalization(inputs, moving_mean, moving_variance, beta, gamma, epsilon) # TODO:shape # outputs.set_shape(inputs.get_shape()) if activation_fn: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def call(self, inputs, training=False): # First, compute the axes along which to reduce the mean / variance, # as well as the broadcast shape to be used for all parameters. input_shape = inputs.get_shape() ndim = len(input_shape) reduction_axes = list(range(len(input_shape))) del reduction_axes[self.axis] broadcast_shape = [1] * len(input_shape) broadcast_shape[self.axis] = input_shape[self.axis].value # Determines whether broadcasting is needed. needs_broadcasting = (sorted(reduction_axes) != range(ndim)[:-1]) # Determine a boolean value for `training`: could be True, False, or None. training_value = utils.constant_value(training) if needs_broadcasting: # In this case we must explictly broadcast all parameters. if self.center: broadcast_beta = array_ops.reshape(self.beta, broadcast_shape) else: broadcast_beta = None if self.scale: broadcast_gamma = array_ops.reshape(self.gamma, broadcast_shape) else: broadcast_gamma = None if training_value is not False: # Use a copy of moving_mean as a shift to compute more reliable moments. shift = math_ops.add(self.moving_mean, 0) if needs_broadcasting: shift = array_ops.reshape(shift, broadcast_shape) broadcast_mean, broadcast_variance = nn.moments(inputs, reduction_axes, shift=shift, keep_dims=True) mean = array_ops.reshape(broadcast_mean, [-1]) variance = array_ops.reshape(broadcast_variance, [-1]) else: mean, variance = nn.moments(inputs, reduction_axes, shift=shift) # Prepare updates if necessary. if not self.updates: mean_update = moving_averages.assign_moving_average( self.moving_mean, mean, self.momentum, zero_debias=False) variance_update = moving_averages.assign_moving_average( self.moving_variance, variance, self.momentum, zero_debias=False) # In the future this should be refactored into a self.add_update # methods in order to allow for instance-based BN layer sharing # across unrelated input streams (e.g. like in Keras). self.updates.append(mean_update) self.updates.append(variance_update) # Normalize batch. We do this inside separate functions for training # and inference so as to avoid evaluating both branches. def normalize_in_test(): if needs_broadcasting: broadcast_moving_mean = array_ops.reshape( self.moving_mean, broadcast_shape) broadcast_moving_variance = array_ops.reshape( self.moving_variance, broadcast_shape) return nn.batch_normalization(inputs, broadcast_moving_mean, broadcast_moving_variance, broadcast_beta, broadcast_gamma, self.epsilon) else: return nn.batch_normalization( inputs, self.moving_mean, self.moving_variance, self.beta if self.center else None, self.gamma if self.scale else None, self.epsilon) def normalize_in_training(): if needs_broadcasting: return nn.batch_normalization(inputs, broadcast_mean, broadcast_variance, broadcast_beta, broadcast_gamma, self.epsilon) else: return nn.batch_normalization( inputs, mean, variance, self.beta if self.center else None, self.gamma if self.scale else None, self.epsilon) return utils.smart_cond(training, normalize_in_training, normalize_in_test)
def moving_average_update(variable, value, momentum):
    return moving_averages.assign_moving_average(
        variable, value, momentum)
def update_fn(v, value):
    v.assign_add(value)
    moving_averages.assign_moving_average(var2, [2.0, 4.0], decay=0.25)
    moving_averages.assign_moving_average(
        var3, [2.0, 4.0], decay=0.25, zero_debias=False)
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001, moving_vars='moving_vars', activation_fn=None, is_training=True, data_format='NHWC', reuse=None, num_shards=None, distributed_group_size=1, scope=None): """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167. "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" Sergey Ioffe, Christian Szegedy Can be used as a normalizer function for conv2d and fully_connected. Note: When is_training is True the moving_mean and moving_variance need to be updated, by default the update_ops are placed in `tf.GraphKeys.UPDATE_OPS` so they need to be added as a dependency to the `train_op`, example: update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if update_ops: updates = tf.group(*update_ops) total_loss = control_flow_ops.with_dependencies([updates], total_loss) One can set updates_collections=None to force the updates in place, but that can have speed penalty, especially in distributed settings. Args: inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`. The normalization is over all but the last dimension if `data_format` is `NHWC` and the second dimension if `data_format` is `NCHW`. decay: Decay for the moving average. Reasonable values for `decay` are close to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower `decay` value (recommend trying `decay`=0.9) if model experiences reasonably good training performance but poor validation and/or test performance. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. epsilon: Small float added to variance to avoid dividing by zero. moving_vars: Name of collection created for moving variables. activation_fn: Activation function, default set to None to skip it and maintain a linear activation. is_training: Whether or not the layer is in training mode. In training mode it would accumulate the statistics of the moments into `moving_mean` and `moving_variance` using an exponential moving average with the given `decay`. When it is not in training mode then it would use the values of the `moving_mean` and the `moving_variance`. data_format: input data format. NHWC or NCHW reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. num_shards: Number of shards that participate in the global reduction. Default is set to None, that will skip the cross replica sum in and normalize across local examples only. distributed_group_size: Number of replicas to normalize across in the distributed batch normalization. scope: Optional scope for `variable_scope`. Returns: A `Tensor` representing the output of the operation. Raises: ValueError: If the rank of `inputs` is undefined. ValueError: If the rank of `inputs` is neither 2 or 4. ValueError: If rank or `C` dimension of `inputs` is undefined. """ trainable = True with tf.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse): inputs = tf.convert_to_tensor(inputs) original_shape = inputs.get_shape() original_rank = original_shape.ndims if original_rank is None: raise ValueError('Inputs %s has undefined rank' % inputs.name) elif original_rank not in [2, 4]: raise ValueError('Inputs %s has unsupported rank.' 
' Expected 2 or 4 but got %d' % (inputs.name, original_rank)) if original_rank == 2: channels = inputs.get_shape()[-1].value if channels is None: raise ValueError('`C` dimension must be known but is None') new_shape = [-1, 1, 1, channels] if data_format == 'NCHW': new_shape = [-1, channels, 1, 1] inputs = tf.reshape(inputs, new_shape) inputs_shape = inputs.get_shape() if data_format == 'NHWC': params_shape = inputs_shape[-1:] else: params_shape = inputs_shape[1:2] if not params_shape.is_fully_defined(): raise ValueError('Inputs %s has undefined `C` dimension %s.' % (inputs.name, params_shape)) # Allocate parameters for the beta and gamma of the normalization. trainable_beta = trainable and center collections = [tf.GraphKeys.MODEL_VARIABLES, tf.GraphKeys.GLOBAL_VARIABLES] beta = tf.contrib.framework.variable( 'beta', params_shape, collections=collections, initializer=tf.zeros_initializer(), trainable=trainable_beta) trainable_gamma = trainable and scale gamma = tf.contrib.framework.variable( 'gamma', params_shape, collections=collections, initializer=tf.ones_initializer(), trainable=trainable_gamma) # Create moving_mean and moving_variance variables and add them to the # appropiate collections. moving_collections = [moving_vars, tf.GraphKeys.MOVING_AVERAGE_VARIABLES, tf.GraphKeys.MODEL_VARIABLES, tf.GraphKeys.GLOBAL_VARIABLES] # Disable partition setting for moving_mean and moving_variance # as assign_moving_average op below doesn't support partitioned variable. scope = tf.get_variable_scope() partitioner = scope.partitioner scope.set_partitioner(None) moving_mean = tf.contrib.framework.variable( 'moving_mean', params_shape, initializer=tf.zeros_initializer(), trainable=False, collections=moving_collections) moving_variance = tf.contrib.framework.variable( 'moving_variance', params_shape, initializer=tf.ones_initializer(), trainable=False, collections=moving_collections) # Restore scope's partitioner setting. 
scope.set_partitioner(partitioner) # Add cross replica sum to do subset mean and variance calculation # First compute mean and variance if is_training: if distributed_group_size > 1: # Execute a distributed batch normalization if data_format == 'NCHW': axis = 1 else: axis = 3 input_shape = inputs.get_shape() inputs_dtype = inputs.dtype inputs = tf.cast(inputs, tf.float32) ndims = len(input_shape) reduction_axes = [i for i in range(ndims) if i != axis] counts, mean_ss, variance_ss, _ = tf.nn.sufficient_statistics( inputs, reduction_axes, keep_dims=False) mean_ss = cross_replica_average(mean_ss, num_shards, distributed_group_size) variance_ss = cross_replica_average(variance_ss, num_shards, distributed_group_size) mean, variance = tf.nn.normalize_moments( counts, mean_ss, variance_ss, shift=None) outputs = tf.nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) outputs = tf.cast(outputs, inputs_dtype) else: outputs, mean, variance = tf.nn.fused_batch_norm( inputs, gamma, beta, epsilon=epsilon, data_format=data_format) else: outputs, mean, variance = tf.nn.fused_batch_norm( inputs, gamma, beta, mean=moving_mean, variance=moving_variance, epsilon=epsilon, is_training=False, data_format=data_format) if is_training: update_moving_mean = moving_averages.assign_moving_average( moving_mean, tf.cast(mean, moving_mean.dtype), decay, zero_debias=False) update_moving_variance = moving_averages.assign_moving_average( moving_variance, tf.cast(variance, moving_variance.dtype), decay, zero_debias=False) tf.add_to_collection('update_ops', update_moving_mean) tf.add_to_collection('update_ops', update_moving_variance) outputs.set_shape(inputs_shape) if original_shape.ndims == 2: outputs = tf.reshape(outputs, original_shape) if activation_fn is not None: outputs = activation_fn(outputs) return outputs
def moving_average_update(variable, value, momentum):
    try:
        return moving_averages.assign_moving_average(
            variable, value, momentum, zero_debias=False)
    except TypeError:
        return moving_averages.assign_moving_average(variable, value, momentum)
def __init__( self, prev_layer, n_units=100, act=None, decay=0.9, epsilon=1e-5, is_train=False, bitW=8, bitA=8, gamma_init=tf.compat.v1.initializers.ones, beta_init=tf.compat.v1.initializers.zeros, use_gemm=False, W_init=tf.compat.v1.initializers.truncated_normal(stddev=0.05), W_init_args=None, name=None, #'quan_dense_with_bn', ): super(QuanDenseLayerWithBN, self).__init__(prev_layer=prev_layer, act=act, W_init_args=W_init_args, name=name) logging.info( "QuanDenseLayerWithBN %s: %d %s" % (self.name, n_units, self.act.__name__ if self.act is not None else 'No Activation')) if self.inputs.get_shape().ndims != 2: raise Exception( "The input dimension must be rank 2, please reshape or flatten it" ) if use_gemm: raise Exception( "TODO. The current version use tf.matmul for inferencing.") n_in = int(self.inputs.get_shape()[-1]) x = self.inputs self.inputs = quantize_active_overflow(self.inputs, bitA) self.n_units = n_units with tf.compat.v1.variable_scope(name): W = tf.compat.v1.get_variable(name='W', shape=(n_in, n_units), initializer=W_init, dtype=LayersConfig.tf_dtype, **self.W_init_args) mid_out = tf.matmul(x, W) para_bn_shape = mid_out.get_shape()[-1:] if gamma_init: scale_para = tf.compat.v1.get_variable( name='scale_para', shape=para_bn_shape, initializer=gamma_init, dtype=LayersConfig.tf_dtype, trainable=is_train) else: scale_para = None if beta_init: offset_para = tf.compat.v1.get_variable( name='offset_para', shape=para_bn_shape, initializer=beta_init, dtype=LayersConfig.tf_dtype, trainable=is_train) else: offset_para = None moving_mean = tf.compat.v1.get_variable( 'moving_mean', para_bn_shape, initializer=tf.compat.v1.initializers.constant(1.), dtype=LayersConfig.tf_dtype, trainable=False) moving_variance = tf.compat.v1.get_variable( 'moving_variance', para_bn_shape, initializer=tf.compat.v1.initializers.constant(1.), dtype=LayersConfig.tf_dtype, trainable=False, ) mean, variance = tf.nn.moments( x=mid_out, axes=list(range(len(mid_out.get_shape()) - 1))) update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay, zero_debias=False) # if zero_debias=True, has bias update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay, zero_debias=False) # if zero_debias=True, has bias def mean_var_with_update(): with tf.control_dependencies( [update_moving_mean, update_moving_variance]): return tf.identity(mean), tf.identity(variance) if is_train: mean, var = mean_var_with_update() else: mean, var = moving_mean, moving_variance w_fold = _w_fold(W, scale_para, var, epsilon) bias_fold = _bias_fold(offset_para, scale_para, mean, var, epsilon) W = quantize_weight_overflow(w_fold, bitW) # W = tl.act.sign(W) # dont update ... # W = tf.Variable(W) self.outputs = tf.matmul(self.inputs, W) # self.outputs = xnor_gemm(self.inputs, W) # TODO self.outputs = tf.nn.bias_add(self.outputs, bias_fold, name='bias_add') self.outputs = self._apply_activation(self.outputs) self._add_layers(self.outputs) self._add_params( [W, scale_para, offset_para, moving_mean, moving_variance])
def batch_norm2d(inputs, is_training=True, eps=1e-05, decay=0.9, affine=True, force_update=False, name=None): """ Do channel-wise batch normalization :param inputs: print(shape1, shape2) :param is_training: bool var indicating mode :param eps: for stabilize :param decay: momentum factor :param affine: whether scale & offset :param name: var_scope & operation name :return: batch_norm output """ with tf.variable_scope(name, default_name='BatchNorm2d'): params_shape = tensor_shape(inputs)[-1:] moving_mean = tf.get_variable('mean', params_shape, initializer=tf.zeros_initializer, trainable=False) moving_variance = tf.get_variable('variance', params_shape, initializer=tf.ones_initializer, trainable=False) # mean_var_with_update is deprecated ! # tf.nn.moments is computing the sample variance, # whereas tf.nn.fused_batch_norm is computing the unbiased variance estimator. # The difference between the two is a factor n/n-1 # def mean_var_with_update(): # # update moving_moments # axes = list(np.arange(len(inputs.get_shape()) - 1)) # mean, variance = tf.nn.moments(inputs, axes, name='moments') # with tf.control_dependencies([assign_moving_average(moving_mean, mean, decay, zero_debias=False), # assign_moving_average(moving_variance, variance, decay, zero_debias=False)]): # # https://stackoverflow.com/questions/34877523/in-tensorflow-what-is-tf-identity-used-for # return tf.identity(mean), tf.identity(variance) if affine: beta = tf.get_variable('beta', params_shape, initializer=tf.zeros_initializer, collections=BN_COLLECTIONS) gamma = tf.get_variable('gamma', params_shape, initializer=tf.ones_initializer, collections=BN_COLLECTIONS) else: gamma = tf.constant(value=np.ones(params_shape, dtype=np.float32)) beta = tf.constant(value=np.zeros(params_shape, dtype=np.float32)) def training_mode(): outputs, batch_mean, batch_var = tf.nn.fused_batch_norm( inputs, gamma, beta, epsilon=eps) return outputs, batch_mean, batch_var def inference_mode(): outputs, batch_mean, batch_var = tf.nn.fused_batch_norm( inputs, gamma, beta, moving_mean, moving_variance, epsilon=eps, is_training=False) return outputs, batch_mean, batch_var outputs, batch_mean, batch_var = tf.cond(tf.constant(is_training), training_mode, inference_mode) if is_training: tf.add_to_collection( tf.GraphKeys.UPDATE_OPS, assign_moving_average(moving_mean, batch_mean, decay, zero_debias=False)) tf.add_to_collection( tf.GraphKeys.UPDATE_OPS, assign_moving_average(moving_variance, batch_var, decay, zero_debias=False)) return outputs
def _batch_norm(self, name, x):
    """Batch normalization."""
    with tf.variable_scope(name):
        params_shape = [x.get_shape()[-1]]

        beta = tf.get_variable('beta', params_shape, tf.float32,
                               initializer=tf.constant_initializer(0.0, tf.float32))
        gamma = tf.get_variable('gamma', params_shape, tf.float32,
                                initializer=tf.constant_initializer(1.0, tf.float32))

        if self.mode == 'train':
            # is this valid only for one batch??
            mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

            moving_mean = tf.get_variable(
                'moving_mean', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32),
                trainable=False)
            moving_variance = tf.get_variable(
                'moving_variance', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32),
                trainable=False)

            self._extra_train_ops.append(
                moving_averages.assign_moving_average(moving_mean, mean, 0.9))
            self._extra_train_ops.append(
                moving_averages.assign_moving_average(moving_variance, variance, 0.9))
        # this is 'eval' mode??
        else:
            mean = tf.get_variable(
                'moving_mean', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32),
                trainable=False)
            variance = tf.get_variable(
                'moving_variance', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32),
                trainable=False)
            tf.summary.histogram(mean.op.name, mean)
            tf.summary.histogram(variance.op.name, variance)

        # epsilon used to be 1e-5. Maybe 0.001 solves NaN problem in deeper net.
        y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
        y.set_shape(x.get_shape())
        return y
def __init__( self, prev_layer, n_filter=32, filter_size=(3, 3), strides=(1, 1), padding='SAME', act=None, decay=0.9, epsilon=1e-5, is_train=False, gamma_init=tf.compat.v1.initializers.ones, beta_init=tf.compat.v1.initializers.zeros, bitW=8, bitA=8, use_gemm=False, W_init=tf.compat.v1.initializers.truncated_normal(stddev=0.02), W_init_args=None, use_cudnn_on_gpu=None, data_format=None, name='quan_cnn2d_bn', ): super(QuanConv2dWithBN, self).__init__(prev_layer=prev_layer, act=act, W_init_args=W_init_args, name=name) logging.info( "QuanConv2dWithBN %s: n_filter: %d filter_size: %s strides: %s pad: %s act: %s " % (self.name, n_filter, filter_size, str(strides), padding, self.act.__name__ if self.act is not None else 'No Activation')) x = self.inputs self.inputs = quantize_active_overflow(self.inputs, bitA) # Do not remove if use_gemm: raise Exception( "TODO. The current version use tf.matmul for inferencing.") if len(strides) != 2: raise ValueError("len(strides) should be 2.") try: pre_channel = int(prev_layer.outputs.get_shape()[-1]) except Exception: # if pre_channel is ?, it happens when using Spatial Transformer Net pre_channel = 1 logging.warning("[warnings] unknow input channels, set to 1") shape = (filter_size[0], filter_size[1], pre_channel, n_filter) strides = (1, strides[0], strides[1], 1) with tf.compat.v1.variable_scope(name): W = tf.compat.v1.get_variable(name='W_conv2d', shape=shape, initializer=W_init, dtype=LayersConfig.tf_dtype, **self.W_init_args) conv = tf.nn.conv2d(x, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format) para_bn_shape = conv.get_shape()[-1:] if gamma_init: scale_para = tf.compat.v1.get_variable( name='scale_para', shape=para_bn_shape, initializer=gamma_init, dtype=LayersConfig.tf_dtype, trainable=is_train) else: scale_para = None if beta_init: offset_para = tf.compat.v1.get_variable( name='offset_para', shape=para_bn_shape, initializer=beta_init, dtype=LayersConfig.tf_dtype, trainable=is_train) else: offset_para = None moving_mean = tf.compat.v1.get_variable( 'moving_mean', para_bn_shape, initializer=tf.compat.v1.initializers.constant(1.), dtype=LayersConfig.tf_dtype, trainable=False) moving_variance = tf.compat.v1.get_variable( 'moving_variance', para_bn_shape, initializer=tf.compat.v1.initializers.constant(1.), dtype=LayersConfig.tf_dtype, trainable=False, ) mean, variance = tf.nn.moments( x=conv, axes=list(range(len(conv.get_shape()) - 1))) update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay, zero_debias=False) # if zero_debias=True, has bias update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay, zero_debias=False) # if zero_debias=True, has bias def mean_var_with_update(): with tf.control_dependencies( [update_moving_mean, update_moving_variance]): return tf.identity(mean), tf.identity(variance) if is_train: mean, var = mean_var_with_update() else: mean, var = moving_mean, moving_variance w_fold = _w_fold(W, scale_para, var, epsilon) bias_fold = _bias_fold(offset_para, scale_para, mean, var, epsilon) W = quantize_weight_overflow(w_fold, bitW) conv_fold = tf.nn.conv2d(self.inputs, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format) self.outputs = tf.nn.bias_add(conv_fold, bias_fold, name='bn_bias_add') self.outputs = self._apply_activation(self.outputs) self._add_layers(self.outputs) self._add_params( [W, scale_para, offset_para, moving_mean, moving_variance])
def relu(self, inputs, init_x=None): """Construct a relu/relu_x layer on top of cnn.""" if ((not self.params.use_relu_x) or self.params.last_act_name in tf.get_variable_scope().name): return tf.nn.relu(inputs) if self.params.relu_x_per_channel: if self.params.data_format == 'NCHW': inputs = tf.transpose(inputs, [0, 2, 3, 1]) shape = [inputs.get_shape()[3]] reduce_dim = [0, 1, 2] else: shape = [] reduce_dim = None if init_x is None: init_x = self.params.init_relu_x with tf.variable_scope('relu_x'): act = inputs trainable_x = self.params.relu_x_update == 'gradient_descent' x = tf.get_variable('x', shape, tf.float32, initializer=tf.constant_initializer(init_x), trainable=trainable_x) if self.params.relu_x_update == 'moving_average': act = tf.maximum(tf.minimum(inputs, init_x), 0) batch_max = tf.reduce_max(act, axis=reduce_dim, name='BatchMax') x = moving_averages.assign_moving_average(x, tf.cast( batch_max, tf.float32), 0.999, zero_debias=False, name='MovingAvgX') x = tf.cast(x, self.dtype) if self.params.relu_x_per_channel: act = tf.maximum(tf.minimum(act, tf.reshape(x, [1, 1, 1, -1])), 0) else: act = tf.maximum(tf.minimum(act, x), 0) if self.params.quant_act: print('Quantizing activation %s' % act.name) if self.params.relu_x_per_channel: zeros = tf.constant(0, dtype=tf.float32, shape=shape) else: zeros = 0 act = self.delayed_quant( act, zeros, x, per_channel=self.params.relu_x_per_channel, num_bits=self.params.quant_act_bits, narrow_range=False, quant_delay=self.params.quant_act_delay) if self.params.relu_x_per_channel and self.params.data_format == 'NCHW': act = tf.transpose(act, [0, 3, 1, 2]) return act
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001, moving_vars='moving_vars', activation=None, is_training=True, trainable=True, restore=True, scope=None, reuse=None): """Adds a Batch Normalization layer. Args: inputs: a tensor of size [batch_size, height, width, channels] or [batch_size, channels]. decay: decay for the moving average. center: If True, subtract beta. If False, beta is not created and ignored. scale: If True, multiply by gamma. If False, gamma is not used. When the next layer is linear (also e.g. ReLU), this can be disabled since the scaling can be done by the next layer. epsilon: small float added to variance to avoid dividing by zero. moving_vars: collection to store the moving_mean and moving_variance. activation: activation function. is_training: whether or not the model is in training mode. trainable: whether or not the variables should be trainable or not. restore: whether or not the variables should be marked for restore. scope: Optional scope for variable_scope. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. Returns: a tensor representing the output of the operation. """ inputs_shape = inputs.get_shape() with tf.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse): axis = list(range(len(inputs_shape) - 1)) params_shape = inputs_shape[-1:] # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None if center: beta = variables.variable('beta', params_shape, initializer=tf.zeros_initializer(), trainable=trainable, restore=restore) if scale: gamma = variables.variable('gamma', params_shape, initializer=tf.ones_initializer(), trainable=trainable, restore=restore) # Create moving_mean and moving_variance add them to # GraphKeys.MOVING_AVERAGE_VARIABLES collections. moving_collections = [moving_vars, tf.GraphKeys.MOVING_AVERAGE_VARIABLES] moving_mean = variables.variable('moving_mean', params_shape, initializer=tf.zeros_initializer(), trainable=False, restore=restore, collections=moving_collections) moving_variance = variables.variable('moving_variance', params_shape, initializer=tf.ones_initializer(), trainable=False, restore=restore, collections=moving_collections) if is_training: # Calculate the moments based on the individual batch. mean, variance = tf.nn.moments(inputs, axis) update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay) tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean) update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay) tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance) else: # Just use the moving_mean and moving_variance. mean = moving_mean variance = moving_variance # Normalize the activations. outputs = tf.nn.batch_normalization( inputs, mean, variance, beta, gamma, epsilon) outputs.set_shape(inputs.get_shape()) if activation: outputs = activation(outputs) return outputs
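# Hedged usage sketch: batch_norm above only adds the moving-average updates to
# UPDATE_OPS_COLLECTION; the caller still has to run them. The usual pattern is
# to make the train op depend on the collected updates. UPDATE_OPS_COLLECTION
# is the same collection key used above; `total_loss` is a placeholder.
import tensorflow as tf

update_ops = tf.get_collection(UPDATE_OPS_COLLECTION)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss)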
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001, updates_collections=ops.GraphKeys.UPDATE_OPS, is_training=True, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None): """Code modification of tensorflow/contrib/layers/python/layers/layers.py """ with variable_scope.variable_op_scope([inputs], scope, 'BatchNorm', reuse=reuse) as sc: inputs = ops.convert_to_tensor(inputs) inputs_shape = inputs.get_shape() inputs_rank = inputs_shape.ndims if inputs_rank is None: raise ValueError('Inputs %s has undefined rank.' % inputs.name) dtype = inputs.dtype.base_dtype axis = list(range(inputs_rank - 1)) params_shape = inputs_shape[-1:] if not params_shape.is_fully_defined(): raise ValueError('Inputs %s has undefined last dimension %s.' % (inputs.name, params_shape)) # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') beta = variables.model_variable( 'beta', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer, collections=beta_collections, trainable=trainable) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') gamma = variables.model_variable( 'gamma', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer, collections=gamma_collections, trainable=trainable) # Create moving_mean and moving_variance variables and add them to the # appropiate collections. moving_mean_collections = utils.get_variable_collections( variables_collections, 'moving_mean') moving_mean = variables.model_variable( 'moving_mean', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer, trainable=False, collections=moving_mean_collections) moving_variance_collections = utils.get_variable_collections( variables_collections, 'moving_variance') moving_variance = variables.model_variable( 'moving_variance', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer, trainable=False, collections=moving_variance_collections) # Calculate the moments based on the individual batch. mean, variance = nn.moments(inputs, axis, shift=moving_mean) # Update the moving_mean and moving_variance moments. update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay) update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay) if updates_collections is None: # Make sure the updates are computed here. with ops.control_dependencies( [update_moving_mean, update_moving_variance]): outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) else: # Collect the updates to be computed later. ops.add_to_collections(updates_collections, update_moving_mean) ops.add_to_collections(updates_collections, update_moving_variance) outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) test_outputs = nn.batch_normalization(inputs, moving_mean, moving_variance, beta, gamma, epsilon) outputs = tf.cond(is_training, lambda: outputs, lambda: test_outputs) outputs.set_shape(inputs_shape) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def __init__( self, layer=None, decay=0.9, epsilon=2e-5, act=tf.identity, is_train=False, fix_gamma=True, beta_init=tf.zeros_initializer, gamma_init=tf.random_normal_initializer( mean=1.0, stddev=0.002), # tf.ones_initializer, # dtype = tf.float32, trainable=None, name='batchnorm_layer', ): #Layer.__init__(self, name=name) super(Layer, self).__init__() self.name = name self.inputs = layer.outputs print( " [TL] BatchNormLayer %s: decay:%f epsilon:%f act:%s is_train:%s" % (self.name, decay, epsilon, act.__name__, is_train)) x_shape = self.inputs.get_shape() params_shape = x_shape[-1:] from tensorflow.python.training import moving_averages from tensorflow.python.ops import control_flow_ops with tf.variable_scope(name) as vs: axis = list(range(len(x_shape) - 1)) ## 1. beta, gamma if tf.__version__ > '0.12.1' and beta_init == tf.zeros_initializer: beta_init = beta_init() beta = tf.get_variable('beta', shape=params_shape, initializer=beta_init, dtype=tf.float32, trainable=is_train) #, restore=restore) gamma = tf.get_variable( 'gamma', shape=params_shape, initializer=gamma_init, dtype=tf.float32, trainable=fix_gamma, ) #restore=restore) ## 2. if tf.__version__ > '0.12.1': moving_mean_init = tf.zeros_initializer() else: moving_mean_init = tf.zeros_initializer moving_mean = tf.get_variable( 'moving_mean', params_shape, initializer=moving_mean_init, dtype=tf.float32, trainable=False) # restore=restore) moving_variance = tf.get_variable( 'moving_variance', params_shape, initializer=tf.constant_initializer(1.), dtype=tf.float32, trainable=False, ) # restore=restore) ## 3. # These ops will only be preformed when training. mean, variance = tf.nn.moments(self.inputs, axis) try: # TF12 update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay, zero_debias=False) # if zero_debias=True, has bias update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay, zero_debias=False) # if zero_debias=True, has bias # print("TF12 moving") except Exception as e: # TF11 update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay) update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay) # print("TF11 moving") def mean_var_with_update(): with tf.control_dependencies( [update_moving_mean, update_moving_variance]): return tf.identity(mean), tf.identity(variance) if trainable: mean, var = mean_var_with_update() print(mean) print(var) self.outputs = act( tf.nn.batch_normalization(self.inputs, mean, var, beta, gamma, epsilon)) else: self.outputs = act( tf.nn.batch_normalization(self.inputs, moving_mean, moving_variance, beta, gamma, epsilon)) variables = [beta, gamma, moving_mean, moving_variance] self.all_layers = list(layer.all_layers) self.all_params = list(layer.all_params) self.all_drop = dict(layer.all_drop) self.all_layers.extend([self.outputs]) self.all_params.extend(variables)
def _do_update(var, value): return moving_averages.assign_moving_average(var, value, self.momentum, zero_debias=False)
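# Hedged sketch of the surrounding pattern for a helper like _do_update: the
# moving statistic is pushed toward the batch statistic only in training mode
# and read unchanged at inference. `training`, `moving_mean`, `batch_mean` and
# `momentum` are placeholders standing in for the enclosing layer's attributes.
import tensorflow as tf
from tensorflow.python.training import moving_averages

def _update(var, value, momentum):
    return moving_averages.assign_moving_average(var, value, momentum,
                                                 zero_debias=False)

mean_tensor = tf.cond(training,
                      lambda: _update(moving_mean, batch_mean, momentum),
                      lambda: tf.identity(moving_mean))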
def __init__( self, prev_layer, decay=0.9, epsilon=0.00001, act=None, is_train=False, beta_init=tf.zeros_initializer, gamma_init=tf.random_normal_initializer(mean=1.0, stddev=0.002), moving_mean_init=tf.zeros_initializer(), name='batchnorm_layer', ): super(BatchNormLayer, self).__init__(prev_layer=prev_layer, act=act, name=name) logging.info( "BatchNormLayer %s: decay: %f epsilon: %f act: %s is_train: %s" % (self.name, decay, epsilon, self.act.__name__ if self.act is not None else 'No Activation', is_train) ) x_shape = self.inputs.get_shape() params_shape = x_shape[-1:] with tf.variable_scope(name): axis = list(range(len(x_shape) - 1)) # 1. beta, gamma variables = [] if beta_init: if beta_init == tf.zeros_initializer: beta_init = beta_init() beta = tf.get_variable( 'beta', shape=params_shape, initializer=beta_init, dtype=LayersConfig.tf_dtype, trainable=is_train ) variables.append(beta) else: beta = None if gamma_init: gamma = tf.get_variable( 'gamma', shape=params_shape, initializer=gamma_init, dtype=LayersConfig.tf_dtype, trainable=is_train, ) variables.append(gamma) else: gamma = None # 2. moving_mean = tf.get_variable( 'moving_mean', params_shape, initializer=moving_mean_init, dtype=LayersConfig.tf_dtype, trainable=False ) moving_variance = tf.get_variable( 'moving_variance', params_shape, initializer=tf.constant_initializer(1.), dtype=LayersConfig.tf_dtype, trainable=False, ) # 3. # These ops will only be preformed when training. mean, variance = tf.nn.moments(self.inputs, axis) update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay, zero_debias=False ) # if zero_debias=True, has bias update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay, zero_debias=False ) # if zero_debias=True, has bias def mean_var_with_update(): with tf.control_dependencies([update_moving_mean, update_moving_variance]): return tf.identity(mean), tf.identity(variance) if is_train: mean, var = mean_var_with_update() else: mean, var = moving_mean, moving_variance self.outputs = self._apply_activation( tf.nn.batch_normalization(self.inputs, mean, var, beta, gamma, epsilon) ) variables.extend([moving_mean, moving_variance]) self._add_layers(self.outputs) self._add_params(variables)
def batch_norm_layer(self, x, scope, is_training, epsilon=0.001, decay=0.99, reuse=None): """ Performs batch normalization. Args: x: input tensor scope: scope name is_training: python boolean value epsilon: the variance epsilon - a small float added to the variance to avoid dividing by zero decay: the moving average decay Returns: The normalized output tensor of the batch normalization layer """ with tf.variable_scope(scope, reuse=reuse): shape = x.get_shape().as_list() # gamma: a trainable scale factor, one per channel gamma = tf.get_variable("gamma", shape[-1:], initializer=tf.constant_initializer(1.0), trainable=True) # beta: a trainable shift value, one per channel (matching gamma) beta = tf.get_variable("beta", shape[-1:], initializer=tf.constant_initializer(0.0), trainable=True) moving_avg = tf.get_variable( "moving_avg", shape[-1:], initializer=tf.constant_initializer(0.0), trainable=False) moving_var = tf.get_variable( "moving_var", shape[-1:], initializer=tf.constant_initializer(1.0), trainable=False) if is_training: # tf.nn.moments computes the mean and the variance of x over all but the channel axis avg, var = tf.nn.moments(x, list(range(len(shape) - 1))) avg = tf.cast(avg, tf.float32) var = tf.cast(var, tf.float32) update_moving_avg = moving_averages.assign_moving_average( moving_avg, avg, decay) update_moving_var = moving_averages.assign_moving_average( moving_var, var, decay) control_inputs = [update_moving_avg, update_moving_var] else: avg = moving_avg var = moving_var control_inputs = [] with tf.control_dependencies(control_inputs): output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon) return output
def weighted_resample(inputs, weights, overall_rate, scope=None, mean_decay=0.999, warmup=10, seed=None): """Performs an approximate weighted resampling of `inputs`. This method chooses elements from `inputs` where each item's rate of selection is proportional to its value in `weights`, and the average rate of selection across all inputs (and many invocations!) is `overall_rate`. Args: inputs: A list of tensors whose first dimension is `batch_size`. weights: A `[batch_size]`-shaped tensor with each batch member's weight. overall_rate: Desired overall rate of resampling. scope: Scope to use for the op. mean_decay: How quickly to decay the running estimate of the mean weight. warmup: Until the resulting tensor has been evaluated `warmup` times, the resampling method uses the true mean over all calls as its weight estimate, rather than a decayed mean. seed: Random seed. Returns: A list of tensors exactly like `inputs`, but with an unknown (and possibly zero) first dimension. A tensor containing the effective resampling rate used for each output. """ # Algorithm: Just compute rates as weights/mean_weight * # overall_rate. This way the average weight corresponds to the # overall rate, and a weight twice the average has twice the rate, # etc. with ops.name_scope(scope, 'weighted_resample', inputs) as opscope: # First: Maintain a running estimated mean weight, with decay # adjusted (by also maintaining an invocation count) during the # warmup period so that at the beginning, there aren't too many # zeros mixed in, throwing the average off. with variable_scope.variable_scope(scope, 'estimate_mean', inputs): count_so_far = variable_scope.get_local_variable('resample_count', initializer=0) estimated_mean = variable_scope.get_local_variable( 'estimated_mean', initializer=0.0) count = count_so_far.assign_add(1) real_decay = math_ops.minimum( math_ops.truediv((count - 1), math_ops.minimum(count, warmup)), mean_decay) batch_mean = math_ops.reduce_mean(weights) mean = moving_averages.assign_moving_average( estimated_mean, batch_mean, real_decay) # Then, normalize the weights into rates using the mean weight and # overall target rate: rates = weights * overall_rate / mean results = resample_at_rate([rates] + inputs, rates, scope=opscope, seed=seed, back_prop=False) return (results[1:], results[0])
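# Hedged usage sketch for weighted_resample: keep examples with probability
# proportional to their weight while retaining roughly half of the batch on
# average. `features`, `labels` and `example_weights` are placeholders.
resampled, used_rates = weighted_resample(
    [features, labels], example_weights, overall_rate=0.5)
resampled_features, resampled_labels = resampled  # first dimension is now dynamic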
def _build(self, inputs, is_training): """Connects the module to some inputs. Args: inputs: Tensor, final dimension must be equal to embedding_dim. All other leading dimensions will be flattened and treated as a large batch. is_training: boolean, whether this connection is to training data. When this is set to False, the internal moving average statistics will not be updated. Returns: dict containing the following keys and values: quantize: Tensor containing the quantized version of the input. loss: Tensor containing the loss to optimize. perplexity: Tensor containing the perplexity of the encodings. encodings: Tensor containing the discrete encodings, ie which element of the quantized space each input element was mapped to. encoding_indices: Tensor containing the discrete encoding indices, ie which element of the quantized space each input element was mapped to. """ # Ensure that the weights are read fresh for each timestep, which otherwise # would not be guaranteed in an RNN setup. Note that this relies on inputs # having a data dependency with the output of the previous timestep - if # this is not the case, there is no way to serialize the order of weight # updates within the module, so explicit external dependencies must be used. with tf.control_dependencies([inputs]): w = self._w.read_value() input_shape = tf.shape(inputs) with tf.control_dependencies([ tf.Assert(tf.equal(input_shape[-1], self._embedding_dim), [input_shape]) ]): flat_inputs = tf.reshape(inputs, [-1, self._embedding_dim]) distances = (tf.reduce_sum(flat_inputs**2, 1, keepdims=True) - 2 * tf.matmul(flat_inputs, w) + tf.reduce_sum(w**2, 0, keepdims=True)) encoding_indices = tf.argmax(-distances, 1) encodings = tf.one_hot(encoding_indices, self._num_embeddings) encoding_indices = tf.reshape(encoding_indices, tf.shape(inputs)[:-1]) quantized = self.quantize(encoding_indices) e_latent_loss = tf.reduce_mean( (tf.stop_gradient(quantized) - inputs)**2) if is_training: updated_ema_cluster_size = moving_averages.assign_moving_average( self._ema_cluster_size, tf.reduce_sum(encodings, 0), self._decay) dw = tf.matmul(flat_inputs, encodings, transpose_a=True) updated_ema_w = moving_averages.assign_moving_average( self._ema_w, dw, self._decay) n = tf.reduce_sum(updated_ema_cluster_size) updated_ema_cluster_size = ( (updated_ema_cluster_size + self._epsilon) / (n + self._num_embeddings * self._epsilon) * n) normalised_updated_ema_w = ( updated_ema_w / tf.reshape(updated_ema_cluster_size, [1, -1])) with tf.control_dependencies([e_latent_loss]): update_w = tf.assign(self._w, normalised_updated_ema_w) with tf.control_dependencies([update_w]): loss = self._commitment_cost * e_latent_loss else: loss = self._commitment_cost * e_latent_loss quantized = inputs + tf.stop_gradient(quantized - inputs) avg_probs = tf.reduce_mean(encodings, 0) perplexity = tf.exp(-tf.reduce_sum(avg_probs * tf.log(avg_probs + 1e-10))) return { 'quantize': quantized, 'loss': loss, 'perplexity': perplexity, 'encodings': encodings, 'encoding_indices': encoding_indices, }
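# Hedged numpy sketch of the EMA codebook update performed above (ignoring the
# zero-debiasing that assign_moving_average applies by default): cluster counts
# and per-code input sums are decayed, smoothed, and their ratio becomes the
# new embeddings. Shapes: flat_inputs [N, D], encodings [N, K], ema_w [D, K].
import numpy as np

def ema_codebook_update(flat_inputs, encodings, ema_cluster_size, ema_w,
                        decay=0.99, epsilon=1e-5):
    ema_cluster_size = decay * ema_cluster_size + (1 - decay) * encodings.sum(axis=0)
    dw = flat_inputs.T @ encodings                      # sum of inputs per code
    ema_w = decay * ema_w + (1 - decay) * dw
    n = ema_cluster_size.sum()
    smoothed = (ema_cluster_size + epsilon) / (n + encodings.shape[1] * epsilon) * n
    codebook = ema_w / smoothed[np.newaxis, :]          # [D, K]
    return codebook, ema_cluster_size, ema_w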
def layer_op(self, inputs, is_training, use_local_stats=False): input_shape = inputs.shape # operates on all dims except the last dim params_shape = input_shape[-1:] axes = list(range(input_shape.ndims - 1)) # create trainable variables and moving average variables beta = tf.get_variable('beta', shape=params_shape, initializer=self.initializers['beta'], regularizer=self.regularizers['beta'], dtype=tf.float32, trainable=True) gamma = tf.get_variable('gamma', shape=params_shape, initializer=self.initializers['gamma'], regularizer=self.regularizers['gamma'], dtype=tf.float32, trainable=True) collections = [tf.GraphKeys.GLOBAL_VARIABLES] moving_mean = tf.get_variable( 'moving_mean', shape=params_shape, initializer=self.initializers['moving_mean'], dtype=tf.float32, trainable=False, collections=collections) moving_variance = tf.get_variable( 'moving_variance', shape=params_shape, initializer=self.initializers['moving_variance'], dtype=tf.float32, trainable=False, collections=collections) # mean and var mean, variance = tf.nn.moments(inputs, axes) update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, self.moving_decay).op update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, self.moving_decay).op tf.add_to_collection(BN_COLLECTION, update_moving_mean) tf.add_to_collection(BN_COLLECTION, update_moving_variance) # call the normalisation function if is_training or use_local_stats: # with tf.control_dependencies( # [update_moving_mean, update_moving_variance]): outputs = tf.nn.batch_normalization(inputs, mean, variance, beta, gamma, self.eps, name='batch_norm') else: outputs = tf.nn.batch_normalization(inputs, moving_mean, moving_variance, beta, gamma, self.eps, name='batch_norm') outputs.set_shape(inputs.get_shape()) return outputs
def build_candidate(self, ensemble_spec, training, iteration_step, summary, is_previous_best=False, track_moving_average=True): """Builds and returns an AdaNet candidate. Args: ensemble_spec: `_EnsembleSpec` instance to track. training: A python boolean indicating whether the graph is in training mode or prediction mode. iteration_step: Integer `Tensor` representing the step since the beginning of the current iteration, as opposed to the global step. summary: A `Summary` for recording summaries for TensorBoard. is_previous_best: Bool identifying whether this ensemble came from a previous iteration. If `True`, `is_training` will be `False` since its weights are frozen. track_moving_average: Bool whether to track the moving average of the ensemble's adanet loss. Returns: A _Candidate instance. """ candidate_scope = "candidate_{}".format(ensemble_spec.name) with tf_compat.v1.variable_scope(candidate_scope): adanet_loss = ensemble_spec.adanet_loss if track_moving_average: adanet_loss = tf_compat.v1.get_variable("adanet_loss", initializer=0., trainable=False) if is_previous_best: # This candidate is frozen, so it is already done training. is_training = tf.constant(False, name="is_training") elif self._max_steps is not None: # Train this candidate for `max_steps` steps. # NOTE: During training, the iteration step gets incremented at the very # end of the computation graph, so we need to account for that here. is_training = tf.less(iteration_step + 1 if training else 0, self._max_steps, name="is_training") else: # Train this candidate forever. is_training = tf.constant(True, name="is_training") if training and track_moving_average: update_adanet_loss_op = moving_averages.assign_moving_average( adanet_loss, ensemble_spec.adanet_loss, decay=self._adanet_loss_decay) with tf.control_dependencies([update_adanet_loss_op]): adanet_loss = adanet_loss.read_value() with summary.current_scope(): summary.scalar( "adanet_loss/adanet/adanet_weighted_ensemble", adanet_loss) return _Candidate(ensemble_spec=ensemble_spec, adanet_loss=adanet_loss, is_training=is_training, is_previous_best=is_previous_best)
def mean_var_with_update(): axes = list(range(len(x.get_shape()) - 1)) batch_mean, batch_var = tf.nn.moments(x, axes=axes, name='moments') with tf.control_dependencies([assign_moving_average(pop_mean, batch_mean, decay), assign_moving_average(pop_var, batch_var, decay)]): return tf.identity(batch_mean), tf.identity(batch_var)
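# Hedged sketch of how a closure like mean_var_with_update is usually consumed:
# batch moments (with their EMA updates) in training, the frozen population
# statistics otherwise. `train_phase`, `pop_mean`, `pop_var`, `beta`, `gamma`
# and `x` are placeholders for the enclosing layer's tensors.
import tensorflow as tf

mean, var = tf.cond(train_phase, mean_var_with_update,
                    lambda: (pop_mean, pop_var))
normalized = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-5)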
def add_moving_summary(*args, **kwargs): """ Summarize the moving average for scalar tensors. This function is a no-op if not calling from main training tower. Args: args: scalar tensors to summarize decay (float): the decay rate. Defaults to 0.95. collection (str or None): the name of the collection to add EMA-maintaining ops. The default will work together with the default :class:`MovingAverageSummary` callback. summary_collections ([str]): the names of collections to add the summary op. Default is TF's default (`tf.GraphKeys.SUMMARIES`). Returns: [tf.Tensor]: list of tensors returned by assign_moving_average, which can be used to maintain the EMA. """ decay = kwargs.pop('decay', 0.95) coll = kwargs.pop('collection', MOVING_SUMMARY_OPS_KEY) summ_coll = kwargs.pop('summary_collections', None) assert len(kwargs) == 0, "Unknown arguments: " + str(kwargs) ctx = get_current_tower_context() # allow ctx to be none if ctx is not None and not ctx.is_main_training_tower: return [] if tf.get_variable_scope().reuse is True: logger.warn( "add_moving_summary() called under reuse=True scope, ignored.") return [] for x in args: assert isinstance(x, (tf.Tensor, tf.Variable)), x assert x.get_shape().ndims == 0, \ "add_moving_summary() only accepts scalar tensor! Got one with {}".format(x.get_shape()) # TODO variable not saved under distributed ema_ops = [] for c in args: name = re.sub('tower[0-9]+/', '', c.op.name) with tf.name_scope(None): if not c.dtype.is_floating: c = tf.cast(c, tf.float32) # assign_moving_average creates variables with op names, therefore clear ns first. with _enter_vs_reuse_ns('EMA') as vs: ema_var = tf.get_variable( name, shape=c.shape, dtype=c.dtype, initializer=tf.constant_initializer(), trainable=False) ns = vs.original_name_scope with tf.name_scope(ns): # reuse VS&NS so that EMA_1 won't appear ema_op = moving_averages.assign_moving_average( ema_var, c, decay, zero_debias=True, name=name + '_EMA_apply') ema_ops.append(ema_op) with tf.name_scope(None): tf.summary.scalar( name + '-summary', ema_op, collections=summ_coll) # write the EMA value as a summary if coll is not None: for op in ema_ops: tf.add_to_collection(coll, op) return ema_ops
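# Hedged usage sketch for add_moving_summary: call it inside the training graph
# with scalar tensors; the returned EMA-maintaining ops are normally run by the
# default MovingAverageSummary callback. `total_cost` and `accuracy` are
# placeholders for this sketch.
add_moving_summary(total_cost, accuracy, decay=0.95)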
def _quantizable_concat(self, inputs, axis, is_training, is_quantized=True, default_min=0, default_max=6, ema_decay=0.999, scope='quantized_concat'): """Concat replacement with quantization option. Allows concat inputs to share the same min max ranges, from experimental/gazelle/synthetic/model/tpu/utils.py. Args: inputs: list of tensors to concatenate. axis: dimension along which to concatenate. is_training: true if the graph is a training graph. is_quantized: flag to enable/disable quantization. default_min: default min value for fake quant op. default_max: default max value for fake quant op. ema_decay: the moving average decay for the quantization variables. scope: Optional scope for variable_scope. Returns: Tensor resulting from concatenation of input tensors """ if is_quantized: with tf.variable_scope(scope): min_var = self._quant_var('min', default_min) max_var = self._quant_var('max', default_max) if not is_training: # If we are building an eval graph just use the values in the # variables. quant_inputs = [ tf.fake_quant_with_min_max_vars(t, min_var, max_var) for t in inputs ] else: concat_tensors = tf.concat(inputs, axis=axis) tf.logging.info( 'concat_tensors: {}'.format(concat_tensors)) # TFLite requires that 0.0 is always in the [min; max] range. range_min = tf.minimum(tf.reduce_min(concat_tensors), 0.0, name='SafeQuantRangeMin') range_max = tf.maximum(tf.reduce_max(concat_tensors), 0.0, name='SafeQuantRangeMax') # Otherwise we need to keep track of the moving averages of the min # and of the elements of the input tensor max. min_val = moving_averages.assign_moving_average( min_var, range_min, ema_decay, name='AssignMinEma') max_val = moving_averages.assign_moving_average( max_var, range_max, ema_decay, name='AssignMaxEma') quant_inputs = [ tf.fake_quant_with_min_max_vars(t, min_val, max_val) for t in inputs ] outputs = tf.concat(quant_inputs, axis=axis) else: outputs = tf.concat(inputs, axis=axis) return outputs
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001, activation_fn=None, updates_collections=ops.GraphKeys.UPDATE_OPS, is_training=True, reuse=None, variables_collections=None, outputs_collections=None, scope=None): """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167. "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" Sergey Ioffe, Christian Szegedy Can be used as a normalizer function for conv2d and fully_connected. Args: inputs: a tensor of size `[batch_size, height, width, channels]` or `[batch_size, channels]`. decay: decay for the moving average. center: If True, subtract `beta`. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. epsilon: small float added to variance to avoid dividing by zero. activation_fn: Optional activation function. updates_collections: collections to collect the update ops for computation. If None, a control dependency would be added to make sure the updates are computed. is_training: whether or not the layer is in training mode. In training mode it would accumulate the statistics of the moments into `moving_mean` and `moving_variance` using an exponential moving average with the given `decay`. When it is not in training mode then it would use the values of the `moving_mean` and the `moving_variance`. reuse: whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: optional collections for the variables. outputs_collections: collections to add the outputs. scope: Optional scope for `variable_op_scope`. Returns: a tensor representing the output of the operation. """ with variable_scope.variable_op_scope([inputs], scope, 'BatchNorm', reuse=reuse) as sc: inputs_shape = inputs.get_shape() dtype = inputs.dtype.base_dtype axis = list(range(len(inputs_shape) - 1)) params_shape = inputs_shape[-1:] # Allocate parameters for the beta and gamma of the normalization. beta, gamma = None, None if center: beta_collections = utils.get_variable_collections( variables_collections, 'beta') beta = variables.model_variable( 'beta', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer, collections=beta_collections) if scale: gamma_collections = utils.get_variable_collections( variables_collections, 'gamma') gamma = variables.model_variable( 'gamma', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer, collections=gamma_collections) # Create moving_mean and moving_variance variables and add them to the # appropiate collections. moving_mean_collections = utils.get_variable_collections( variables_collections, 'moving_mean') moving_mean = variables.model_variable( 'moving_mean', shape=params_shape, dtype=dtype, initializer=init_ops.zeros_initializer, trainable=False, collections=moving_mean_collections) moving_variance_collections = utils.get_variable_collections( variables_collections, 'moving_variance') moving_variance = variables.model_variable( 'moving_variance', shape=params_shape, dtype=dtype, initializer=init_ops.ones_initializer, trainable=False, collections=moving_variance_collections) if is_training: # Calculate the moments based on the individual batch. mean, variance = nn.moments(inputs, axis, shift=moving_mean) # Update the moving_mean and moving_variance moments. 
update_moving_mean = moving_averages.assign_moving_average( moving_mean, mean, decay) update_moving_variance = moving_averages.assign_moving_average( moving_variance, variance, decay) if updates_collections is None: # Make sure the updates are computed here. with ops.control_dependencies( [update_moving_mean, update_moving_variance]): outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) else: # Collect the updates to be computed later. ops.add_to_collections(updates_collections, update_moving_mean) ops.add_to_collections(updates_collections, update_moving_variance) outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) else: outputs = nn.batch_normalization(inputs, moving_mean, moving_variance, beta, gamma, epsilon) outputs.set_shape(inputs.get_shape()) if activation_fn: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def myBatchNorm(self, x): epsilon = 0.001 decay = 0.9 with tf.variable_scope('batchNorm'): paramsShape = x.get_shape().dims[self.channelIndex] gamma = tf.get_variable('gamma', paramsShape, tf.float32, initializer=tf.constant_initializer( 1.0, tf.float32)) beta = tf.get_variable('beta', paramsShape, tf.float32, initializer=tf.constant_initializer( 0.0, tf.float32)) if self.isTraining: [y, mean, variance ] = tf.nn.fused_batch_norm(x, gamma, beta, data_format=self.dataFormat, epsilon=epsilon) movingMean = tf.get_variable( 'movingMean', paramsShape, tf.float32, initializer=tf.constant_initializer(0.0, tf.float32), trainable=False) movingVariance = tf.get_variable( 'movingVariance', paramsShape, tf.float32, initializer=tf.constant_initializer(1.0, tf.float32), trainable=False) # track the moving statistics in summaries; monitoring them helps diagnose numerical issues, much like weight decay does for the weights tf.summary.histogram('movingMean', movingMean) tf.summary.histogram('movingVariance', movingVariance) self.extraTrainOps.append( moving_averages.assign_moving_average( movingMean, mean, decay)) self.extraTrainOps.append( moving_averages.assign_moving_average( movingVariance, variance, decay)) else: mean = tf.get_variable('movingMean', paramsShape, tf.float32, initializer=tf.constant_initializer( 0.0, tf.float32), trainable=False) variance = tf.get_variable('movingVariance', paramsShape, tf.float32, initializer=tf.constant_initializer( 1.0, tf.float32), trainable=False) [y, _, _] = tf.nn.fused_batch_norm(x, gamma, beta, mean=mean, variance=variance, epsilon=epsilon, data_format=self.dataFormat, is_training=self.isTraining) return y
def discrete_bottleneck(self, x): """Discretization bottleneck for latent variables. Args: x: Input to the discretization bottleneck. Returns: Embedding to pass to the decoder, discrete latent, loss, and the embedding function. Raises: ValueError: If projection_tensors is None for reshape_method project, or ema_count or ema_means is None if we are using ema, or unknown args. """ x_reshaped = self.slice_hidden(x) x_means_hot = [] x_means = 0 loss = 0 x_means_hot, x_means, q_loss, e_loss = self.embedding_lookup( x_reshaped, self.means) if self.hparams.ema: tf.logging.info("Using EMA with beta = {}".format( self.hparams.beta)) updated_ema_count = \ moving_averages.assign_moving_average( self.ema_count, tf.reduce_sum( tf.reshape( x_means_hot, shape=[-1, self.hparams.num_blocks, self.hparams.block_v_size]), axis=0), self.hparams.decay, zero_debias=False) dw = tf.matmul(tf.transpose(x_means_hot, perm=[1, 2, 0]), tf.transpose(x_reshaped, perm=[1, 0, 2])) updated_ema_means = \ moving_averages.assign_moving_average( self.ema_means, dw, self.hparams.decay, zero_debias=False) n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True) updated_ema_count = ( (updated_ema_count + self.hparams.epsilon) / (n + 2**self.hparams.z_size * self.hparams.epsilon) * n) updated_ema_means = updated_ema_means / tf.expand_dims( updated_ema_count, axis=-1) with tf.control_dependencies([e_loss]): update_means = tf.assign(self.means, updated_ema_means) with tf.control_dependencies([update_means]): loss += self.hparams.beta * e_loss else: # Use a gradient based loss for learning the cluster centers loss += q_loss + self.hparams.beta * e_loss # Get the discrete latent representation x_means_idx = tf.argmax(x_means_hot, axis=-1) # Get the binary representation num_bits = int(self.hparams.z_size // self.hparams.num_blocks) x_means_bits = self.int_to_bit(x_means_idx, num_bits=num_bits, base=2) x_discrete = self.bit_to_int(tf.to_int32(x_means_bits), num_bits=self.hparams.z_size, base=2) # Reshape x_discrete shape_x = shape_list(x) shape_discrete = shape_x[:-1] x_discrete = tf.reshape(x_discrete, shape_discrete) x_means = tf.reshape(x_means, shape=shape_x) h1 = x + tf.stop_gradient(x_means - x) h2 = tf.layers.dense(tf.nn.relu(h1), self.hparams.filter_size, name="vch2") res = tf.layers.dense(tf.nn.relu(h2), self.hparams.hidden_size, name="vcfin") embed_fn = partial(self.embed) return { "dense": res, "discrete": x_discrete, "loss": loss, "embed": embed_fn }
def call(self, inputs, training=False): # First, compute the axes along which to reduce the mean / variance, # as well as the broadcast shape to be used for all parameters. input_shape = inputs.get_shape() ndim = len(input_shape) reduction_axes = list(range(len(input_shape))) del reduction_axes[self.axis] broadcast_shape = [1] * len(input_shape) broadcast_shape[self.axis] = input_shape[self.axis].value # Determines whether broadcasting is needed. needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1]) scale, offset = self.gamma, self.beta # Determine a boolean value for `training`: could be True, False, or None. training_value = utils.constant_value(training) if training_value is not False: # Some of the computations here are not necessary when training==False # but not a constant. However, this makes the code simpler. mean, variance = nn.moments(inputs, reduction_axes) mean = _smart_select(training, lambda: mean, lambda: self.moving_mean) variance = _smart_select(training, lambda: variance, lambda: self.moving_variance) if self.renorm: r, d, new_mean, new_variance = self._renorm_correction_and_moments( mean, variance, training) # When training, the normalized values (say, x) will be transformed as # x * gamma + beta without renorm, and (x * r + d) * gamma + beta # = x * (r * gamma) + (d * gamma + beta) with renorm. scale = array_ops.stop_gradient(r, name='renorm_r') offset = array_ops.stop_gradient(d, name='renorm_d') if self.gamma is not None: scale *= self.gamma offset *= self.gamma if self.beta is not None: offset += self.beta else: new_mean, new_variance = mean, variance # Update moving averages when training, and prevent updates otherwise. decay = _smart_select(training, lambda: self.momentum, lambda: 1.) mean_update = moving_averages.assign_moving_average( self.moving_mean, new_mean, decay, zero_debias=False) variance_update = moving_averages.assign_moving_average( self.moving_variance, new_variance, decay, zero_debias=False) if not self.updates: # In the future this should be refactored into a self.add_update # methods in order to allow for instance-based BN layer sharing # across unrelated input streams (e.g. like in Keras). self.updates.append(mean_update) self.updates.append(variance_update) else: mean, variance = self.moving_mean, self.moving_variance def _broadcast(v): if needs_broadcasting and v is not None: # In this case we must explictly broadcast all parameters. return array_ops.reshape(v, broadcast_shape) return v return nn.batch_normalization(inputs, _broadcast(mean), _broadcast(variance), _broadcast(offset), _broadcast(scale), self.epsilon)
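# Hedged numeric sketch of what assign_moving_average(var, value, decay,
# zero_debias=False) computes in the layers above: the stored statistic moves a
# fraction (1 - decay) of the way toward the new value on every update.
def ema_step(current, value, decay):
    return decay * current + (1.0 - decay) * value

moving_mean = 0.0
for batch_mean in (2.0, 2.0, 2.0):   # three identical batches
    moving_mean = ema_step(moving_mean, batch_mean, decay=0.9)
# moving_mean is now 0.542, still far from 2.0 -- which is why a high decay
# needs many updates (or zero-debiasing) before the estimate is trustworthy.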