def testRerouteTensor(self):
  a = constant_op.constant(1, name='a')
  b = constant_op.constant(2, name='b')
  c = constant_op.constant(3, name='c')
  d = constant_op.constant(4, name='d')
  add_ac = math_ops.add(a, c)
  add_ad = math_ops.add(a, d)

  # Ensure that before rerouting the inputs are what we think.
  self._CheckOpHasInputs(add_ac.op, [a, c])
  self._CheckOpHasInputs(add_ad.op, [a, d])

  # References to tensor a should be replaced with b for all ops in
  # can_modify. This means add_ac will be changed but add_ad will not.
  common.RerouteTensor(b, a, can_modify=[add_ac.op])
  self._CheckOpHasInputs(add_ac.op, [b, c])
  self._CheckOpHasInputs(add_ad.op, [a, d])
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
  """Finds fused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected,
  depthwise convolution.

  Args:
    graph: Graph to walk and modify.
    is_training: Bool, true if training.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.

  Raises:
    ValueError: When batch norm folding fails.
  """
  for match in _FindFusedBatchNorms(graph):
    scope, sep, _ = match.layer_op.name.rpartition('/')
    # Make sure new ops are added to `graph` and put on the same device as
    # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
    # named `scope`. Otherwise, TF creates a unique scope whose name starts
    # with `scope`.
    with graph.as_default(), graph.name_scope(scope + sep):
      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        correction_scale, correction_recip, correction_offset = None, None, None
        if is_training:
          correction_scale, correction_recip, correction_offset = (
              _ComputeBatchNormCorrections(
                  context='',
                  match=match,
                  freeze_batch_norm_delay=freeze_batch_norm_delay))

        # The shape of depthwise weights is different, so we need to reshape
        # the multiplier_tensor to ensure that the scaled_weight_tensor has
        # the expected shape.
        weights = match.weight_tensor
        if match.layer_op.type == 'DepthwiseConv2dNative':
          new_shape = [
              match.weight_tensor.get_shape().as_list()[2],
              match.weight_tensor.get_shape().as_list()[3]
          ]
          multiplier_tensor = array_ops.reshape(
              multiplier_tensor, new_shape, name='scale_reshape')
          if correction_scale is not None:
            correction_scale = array_ops.reshape(
                correction_scale, new_shape, name='correction_reshape')

        if correction_scale is not None:
          weights = math_ops.multiply(
              correction_scale, weights, name='correction_mult')

        scaled_weight_tensor = math_ops.multiply(
            weights, multiplier_tensor, name='mul_fold')

        new_layer_tensor = _CloneWithNewOperands(
            match.layer_op, match.input_tensor, scaled_weight_tensor,
            match.batch_to_space_op)

        if correction_recip is not None:
          new_layer_tensor = math_ops.multiply(
              correction_recip, new_layer_tensor, name='post_conv_mul')
          new_layer_tensor = math_ops.add(
              new_layer_tensor, correction_offset, name='correction_add')

        bias_add_tensor = math_ops.add(
            new_layer_tensor, bias_tensor, name='add_fold')

        nodes_modified_count = common.RerouteTensor(
            bias_add_tensor, match.output_tensor)
        if nodes_modified_count == 0:
          raise ValueError(
              'Folding batch norms failed, %s had no outputs.' %
              match.output_tensor.name)
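# Illustrative sketch (not called by the library): checks the fold identity
# used in _FoldFusedBatchNorms with plain numpy, treating the layer as a
# per-channel linear map so the algebra is easy to see. All names below are
# hypothetical and exist only for this example.
def _fold_identity_sketch():
  import numpy as np
  rng = np.random.RandomState(0)
  x, w = rng.randn(16), rng.randn()   # toy input and a single weight
  gamma, beta = 1.5, 0.3              # batch norm scale and shift
  mean, var, eps = 0.2, 0.8, 1e-3     # batch statistics and epsilon

  conv_out = x * w
  # Reference: batch norm applied after the layer.
  bn_out = gamma * (conv_out - mean) / np.sqrt(var + eps) + beta

  # Folded form: scale the weights and add a folded bias, as in
  # _FoldFusedBatchNorms above.
  multiplier = gamma / np.sqrt(var + eps)
  bias = beta - mean * multiplier
  folded_out = x * (w * multiplier) + bias

  np.testing.assert_allclose(bn_out, folded_out, rtol=1e-6)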
def _FoldUnfusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
  """Finds unfused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected,
  depthwise convolution.

  Args:
    graph: Graph to walk and modify.
    is_training: Bool, True if training.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.

  Raises:
    ValueError: When batch norm folding fails.
  """
  input_to_ops_map = input_to_ops.InputToOps(graph)

  for bn in common.BatchNormGroups(graph):
    has_scaling = _HasScaling(graph, input_to_ops_map, bn)

    if not _IsValidUnfusedBatchNorm(graph, bn):
      continue

    # The mangling code intimately depends on BatchNorm node's internals.
    original_op, folded_op = _CreateFoldedOp(
        graph,
        bn,
        has_scaling=has_scaling,
        freeze_batch_norm_delay=freeze_batch_norm_delay,
        is_training=is_training)

    activation = common.GetEndpointActivationOp(graph, bn)
    if activation:
      nodes_modified_count = common.RerouteTensor(
          folded_op.outputs[0], original_op.outputs[0], can_modify=[activation])
      if nodes_modified_count != 1:
        raise ValueError('Unexpected inputs to op: %s' % activation.name)
      continue

    # Treat consumer ops in bypass modules differently since they have Add
    # operations instead of Relu* above.
    # Make sure the correct scope is selected for the bypass add. The rule is:
    # if the batch norm is at scope 'str1/str2', the bypass add is at scope
    # 'str1'. If the batch norm is at scope 'str1' (no '/'), the bypass add is
    # at scope ''. If there is no batch norm, there is no bypass add. A small
    # sketch of this rule follows the function.
    add_bypass_ctx = ''
    if bn:
      try:
        add_bypass_ctx = re.search(r'^(.*)/([^/]+)', bn).group(1)
      except AttributeError:
        add_bypass_ctx = ''

    if add_bypass_ctx:
      add_bypass_ctx = add_bypass_ctx + '/'

    add_bypass = graph.get_operation_by_name(add_bypass_ctx + 'AddV2')
    nodes_modified_count = common.RerouteTensor(
        folded_op.outputs[0], original_op.outputs[0], can_modify=[add_bypass])
    if nodes_modified_count != 1:
      raise ValueError('Unexpected inputs to op: %s' % add_bypass.name)
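# Illustrative sketch (not called by the library) of the bypass-scope rule
# described in _FoldUnfusedBatchNorms: a batch norm at scope 'str1/str2' maps
# to a bypass add at 'str1'; a scope with no '/' maps to the empty scope.
# The helper name is hypothetical.
def _bypass_scope_sketch():
  import re

  def parent_scope(bn_scope):
    try:
      return re.search(r'^(.*)/([^/]+)', bn_scope).group(1)
    except AttributeError:
      # No '/' in the scope: re.search returns None and .group raises.
      return ''

  assert parent_scope('str1/str2') == 'str1'
  assert parent_scope('str1') == ''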
def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay):
  """Computes batch norm correction params.

  Before batch normalization is frozen:
  We use batch statistics for batch norm.
    correction_scale = sigma_b/sigma_mv
    correction_recip = 1/correction_scale
    correction_offset = 0

  After batch normalization is frozen:
    correction_scale = sigma_b/sigma_mv
    correction_recip = 1
    correction_offset = gamma*(mu_b/sigma_b - mu_mv/sigma_mv).

  Batch norm is frozen if global_step > bn_freeze_delay.
  The corrections ensure that:
  a) The weights are quantized after scaling by gamma/sigma_mv. This enables
     smoother training as the scaling on the weights changes slowly, rather
     than jumping across mini-batches.
  b) Changing the values of the corrections allows one to switch from using
     batch statistics to using the moving mean and variance, without requiring
     changes to batch_norm.

  Args:
    context: The scope under which we look for batch norm params.
    match: Object containing required batch norm tensors for correction
      computation.
    freeze_batch_norm_delay: Delay in steps at which computation switches
      from regular batch norm to frozen mean and variance.

  Returns:
    A tuple of correction_scale, correction_recip, correction_offset.
  """
  g = ops.get_default_graph()
  prefix = '' if not context else context
  with g.name_scope(prefix + 'batch_norm_correction'):
    recip_sigma_mv = math_ops.rsqrt(
        match.moving_variance_tensor + match.batch_epsilon)
    recip_sigma = math_ops.rsqrt(match.variance_tensor + match.batch_epsilon)
    correction_scale = math_ops.divide(
        recip_sigma_mv, recip_sigma, name='scale_compute')
    correction_scale = array_ops.identity(
        correction_scale, name='correction_scale')
    correction_recip = math_ops.reciprocal(
        correction_scale, name='reciprocal_compute')
    correction_offset = math_ops.multiply(
        match.gamma_tensor,
        match.mean_tensor * recip_sigma -
        match.moving_mean_tensor * recip_sigma_mv,
        name='offset_compute')

    if freeze_batch_norm_delay is not None:
      use_mv_avg = math_ops.greater_equal(
          common.CreateOrGetQuantizationStep(),
          freeze_batch_norm_delay,
          name='use_moving_average')
    else:
      use_mv_avg = False

    bn_decay_zero = 0.0
    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())

    bn_decay_mean_out = utils.smart_cond(
        use_mv_avg,
        lambda: bn_decay_zero,
        lambda: match.bn_decay_mean_tensor,
        name='freeze_moving_mean')
    common.RerouteTensor(
        bn_decay_mean_out,
        match.bn_decay_mean_tensor,
        can_modify=bn_decay_mean_consumers)

    bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
    bn_decay_var_out = utils.smart_cond(
        use_mv_avg,
        lambda: bn_decay_zero,
        lambda: match.bn_decay_var_tensor,
        name='freeze_moving_var')
    common.RerouteTensor(
        bn_decay_var_out,
        match.bn_decay_var_tensor,
        can_modify=bn_decay_var_consumers)

    correction_recip = utils.smart_cond(
        use_mv_avg,
        lambda: array_ops.ones(correction_scale.shape),
        lambda: correction_recip,
        name='correction_recip')
    correction_offset = utils.smart_cond(
        use_mv_avg,
        lambda: correction_offset,
        lambda: array_ops.zeros(correction_offset.shape),
        name='correction_offset')
  return correction_scale, correction_recip, correction_offset
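# Illustrative sketch (not called by the library): numerically checks that the
# corrections computed above make the folded layer match batch-statistics
# batch norm before the freeze and moving-average batch norm after it. All
# names are hypothetical; sigma denotes sqrt(variance + epsilon).
def _correction_identity_sketch():
  import numpy as np
  rng = np.random.RandomState(0)
  x, w = rng.randn(16), rng.randn()
  gamma, beta, eps = 1.5, 0.3, 1e-3
  mu_b, var_b = 0.2, 0.8      # batch statistics
  mu_mv, var_mv = 0.1, 1.1    # moving averages
  sigma_b = np.sqrt(var_b + eps)
  sigma_mv = np.sqrt(var_mv + eps)

  conv = x * w
  # Folded weights end up scaled by gamma / sigma_mv (a slowly moving factor,
  # which is what gets quantized); the folded bias uses batch statistics.
  correction_scale = sigma_b / sigma_mv
  multiplier = gamma / sigma_b
  folded_conv = x * (w * correction_scale * multiplier)
  bias = beta - mu_b * multiplier

  # Before the freeze: recip = 1/correction_scale, offset = 0. The result
  # equals batch norm with batch statistics.
  before = folded_conv * (1.0 / correction_scale) + 0.0 + bias
  np.testing.assert_allclose(
      before, gamma * (conv - mu_b) / sigma_b + beta, rtol=1e-6)

  # After the freeze: recip = 1, offset = gamma*(mu_b/sigma_b - mu_mv/sigma_mv).
  # The result equals batch norm with moving mean and variance.
  offset = gamma * (mu_b / sigma_b - mu_mv / sigma_mv)
  after = folded_conv * 1.0 + offset + bias
  np.testing.assert_allclose(
      after, gamma * (conv - mu_mv) / sigma_mv + beta, rtol=1e-6)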
def _InsertQuantOp(context,
                   name,
                   producer,
                   consumers,
                   is_training,
                   moving_avg=True,
                   init_min=-6.0,
                   init_max=6.0,
                   bits=8,
                   symmetric=False,
                   ema_decay=0.999,
                   quant_delay=None,
                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
                   narrow_range=False,
                   producer_scope=None,
                   consumer_scope=None):
  """Inserts a quant op between a producer op and (multiple) consumer ops.

  Args:
    context: Context where producer and consumer operations are nested.
    name: Name for the new quantization op within the context.
    producer: Producer operation of the pairs where quantization will be
      inserted.
    consumers: Consumer operations of the pairs.
    is_training: Whether quantizing training graph or eval graph.
    moving_avg: Specifies whether to use exponential moving average or just
      the last value seen.
    init_min: Starting minimum value for the new quantization op.
    init_max: Starting maximum value for the new quantization op.
    bits: Number of bits to use for quantization, must be between 2 and 8.
    symmetric: (Optional) If true, use symmetric quantization limits instead of
      training the minimum and maximum of each quantization range separately.
    ema_decay: (Optional) Float, EMA decay parameter. EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization. This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
    producer_scope: The restriction of producer scope. If not None, the new op
      will be inserted only when the producer is in this scope.
    consumer_scope: The restriction of consumer scope. If not None, the new op
      will be inserted only when all the consumers are in this scope.

  Raises:
    ValueError: When producer operation is not directly connected to the
      consumer operation.
  """
  if producer_scope and not producer.name.startswith(producer_scope):
    logging.info(
        '_InsertQuantOp ignores context="%s" name="%s" '
        'because producer "%s" is not in scope "%s"',
        context, name, producer.name, producer_scope)
    return

  if consumer_scope:
    consumers_in_scope = []
    for consumer in consumers:
      if consumer.name.startswith(consumer_scope):
        consumers_in_scope.append(consumer)
      else:
        logging.info(
            '_InsertQuantOp context="%s" name="%s" ignores '
            'consumer "%s" because it is not in scope "%s"',
            context, name, consumer.name, consumer_scope)
        return
    consumers = consumers_in_scope

  name_prefix = _AddContextToName(context, name)
  # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
  # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
  # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
  # breaks things later.
  name_scope = ops.get_name_scope()
  if name_scope:
    name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')

  inputs = producer.outputs[0]
  # Prevent ops from being quantized multiple times. Bypass ops can sometimes
  # overlap between multiple matches, so we need to ensure that we don't
  # add duplicate FakeQuant operations.
  if _FollowedByFakeQuant(inputs):
    return

  if moving_avg:
    quant = (
        quant_ops.MovingAvgQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            ema_decay=ema_decay,
            is_training=is_training,
            num_bits=bits,
            symmetric=symmetric,
            narrow_range=narrow_range,
            vars_collection=vars_collection,
            name_prefix=name_prefix))
  else:
    quant = (
        quant_ops.LastValueQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            is_training=is_training,
            num_bits=bits,
            symmetric=symmetric,
            narrow_range=narrow_range,
            vars_collection=vars_collection,
            name_prefix=name_prefix))

  if quant_delay and quant_delay > 0:
    activate_quant = math_ops.greater_equal(
        common.CreateOrGetQuantizationStep(),
        quant_delay,
        name=name_prefix + '/activate_quant')
    quant = control_flow_ops.cond(
        activate_quant,
        lambda: quant,
        lambda: inputs,
        name=name_prefix + '/delayed_quant')

  if consumers:
    tensors_modified_count = common.RerouteTensor(
        quant, inputs, can_modify=consumers)
    # Some operations can have multiple output tensors going to the same
    # consumer. Since consumers is a set, we need to ensure that
    # tensors_modified_count is greater than or equal to the length of the set
    # of consumers.
    if tensors_modified_count < len(consumers):
      raise ValueError('No inputs quantized for ops: [%s]' % ', '.join(
          [consumer.name for consumer in consumers]))
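# Illustrative sketch (not part of the library) of how _InsertQuantOp might be
# invoked: insert a last-value FakeQuant between a weight-producing op and the
# conv op that consumes it. `weight_op` and `conv_op` are hypothetical
# placeholders for ops found by a layer matcher; the parameter choices are
# examples, not library defaults.
def _insert_weight_quant_sketch(weight_op, conv_op, is_training):
  _InsertQuantOp(
      context='',            # no enclosing scope in this toy example
      name='weights_quant',
      producer=weight_op,    # op whose outputs[0] is the weight tensor
      consumers=[conv_op],   # ops whose matching input gets rerouted
      is_training=is_training,
      moving_avg=False,      # weights use the last value seen, not an EMA
      narrow_range=True,     # symmetric integer range for weights
      bits=8,
      quant_delay=None)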