def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
                           bottleneck_tensor_size, quantize_layer):
  with tf.name_scope('input'):
    bottleneck_input = tf.placeholder_with_default(
        bottleneck_tensor,
        shape=[None, bottleneck_tensor_size],
        name='BottleneckInputPlaceholder')

    ground_truth_input = tf.placeholder(
        tf.int64, [None], name='GroundTruthInput')

  layer_name = 'final_training_ops'
  with tf.name_scope(layer_name):
    quantized_layer_weights = None
    quantized_layer_biases = None
    with tf.name_scope('weights'):
      initial_value = tf.truncated_normal(
          [bottleneck_tensor_size, class_count], stddev=0.001)
      layer_weights = tf.Variable(initial_value, name='final_weights')
      if quantize_layer:
        quantized_layer_weights = quant_ops.MovingAvgQuantize(
            layer_weights, is_training=True)
        attachTensorBoardSummaries(quantized_layer_weights)

      attachTensorBoardSummaries(layer_weights)

    with tf.name_scope('biases'):
      layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
      if quantize_layer:
        quantized_layer_biases = quant_ops.MovingAvgQuantize(
            layer_biases, is_training=True)
        attachTensorBoardSummaries(quantized_layer_biases)

      attachTensorBoardSummaries(layer_biases)

    with tf.name_scope('Wx_plus_b'):
      if quantize_layer:
        logits = tf.matmul(bottleneck_input,
                           quantized_layer_weights) + quantized_layer_biases
        logits = quant_ops.MovingAvgQuantize(
            logits,
            init_min=-32.0,
            init_max=32.0,
            is_training=True,
            num_bits=8,
            narrow_range=False,
            ema_decay=0.5)
        tf.summary.histogram('pre_activations', logits)
      else:
        logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
        tf.summary.histogram('pre_activations', logits)

  final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
  tf.summary.histogram('activations', final_tensor)

  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
        labels=ground_truth_input, logits=logits)

  tf.summary.scalar('cross_entropy', cross_entropy_mean)

  with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
    train_step = optimizer.minimize(cross_entropy_mean)

  return (train_step, cross_entropy_mean, bottleneck_input,
          ground_truth_input, final_tensor)
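# Hedged usage sketch for add_final_training_ops above. The placeholder stands
# in for a real bottleneck tensor from a loaded feature extractor; the 5-class
# head and 2048-wide bottleneck are illustrative assumptions, and LEARNING_RATE
# and attachTensorBoardSummaries must exist as in the snippet's module.
def _example_build_retrain_head():
    bottleneck = tf.placeholder(tf.float32, [None, 2048], name='bottleneck')
    return add_final_training_ops(
        class_count=5,
        final_tensor_name='final_result',
        bottleneck_tensor=bottleneck,
        bottleneck_tensor_size=2048,
        quantize_layer=True)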
def insert_quant_op(graph, node_name, is_train):
  """Insert quantization operations into the specified activation node.

  Args:
  * graph: TensorFlow graph
  * node_name: activation node's name
  * is_train: insert training-related operations or not
  """

  # locate the node & activation operation
  for op in graph.get_operations():
    if node_name in [node.name for node in op.outputs]:
      tf.logging.info('op: {} / inputs: {} / outputs: {}'.format(
          op.name, [node.name for node in op.inputs],
          [node.name for node in op.outputs]))
      node = op.outputs[0]
      activation_op = op
      break

  # re-route the graph to insert quantization operations
  input_to_ops_map = input_to_ops.InputToOps(graph)
  consumer_ops = input_to_ops_map.ConsumerOperations(activation_op)
  node_quant = quant_ops.MovingAvgQuantize(
      node, is_training=is_train, num_bits=FLAGS.uqtf_activation_bits)
  nb_update_inputs = common.RerouteTensor(node_quant, node, consumer_ops)
  tf.logging.info('nb_update_inputs = %d' % nb_update_inputs)
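# Hypothetical call for insert_quant_op above; the node name is an illustrative
# assumption and FLAGS.uqtf_activation_bits must be defined by the surrounding
# program. Note the name carries a ":0" suffix, since the snippet matches it
# against tensor names in op.outputs.
def _example_insert_quant_op():
    graph = tf.get_default_graph()
    insert_quant_op(graph, 'model/conv1/Relu:0', is_train=True)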
def inference_quant(self, input_tensor_name_list, input_data_list,
                    trace_tensor_name_list):
  #tf.import_graph_def(self._graph_def, name='')
  with tf.Session() as sess:
    # insert quant ops
    quant_bits = 8
    is_training = True

    target_weight_tensor_name = "CifarNet/conv1/weights:0"
    target_weight_tensor = sess.graph.get_tensor_by_name(
        target_weight_tensor_name)
    # ':' is not a legal character in scope names, so strip the ':0' suffix
    # before using the tensor name as a name_prefix.
    target_weight_tensor_quant = quant_ops.LastValueQuantize(
        target_weight_tensor,
        is_training=is_training,
        narrow_range=True,
        num_bits=quant_bits,
        name_prefix=target_weight_tensor_name.split(':')[0] + "Weights")

    target_input_tensor_name = "Placeholder:0"
    target_input_tensor = sess.graph.get_tensor_by_name(
        target_input_tensor_name)
    target_input_tensor_quant = quant_ops.MovingAvgQuantize(
        target_input_tensor,
        is_training=is_training,
        narrow_range=True,
        num_bits=quant_bits,
        name_prefix="Input")

    init = tf.global_variables_initializer()
    sess.run(init)

    trace_tensor_list = []
    for tensor_name in trace_tensor_name_list:
      trace_tensor_list.append(sess.graph.get_tensor_by_name(tensor_name))

    input_tensor_list = []
    for tensor_name in input_tensor_name_list:
      input_tensor_list.append(sess.graph.get_tensor_by_name(tensor_name))

    feed_dict = {}
    for input_tensor, input_data in zip(input_tensor_list, input_data_list):
      feed_dict[input_tensor] = input_data

    trace_tensor_list.append(target_weight_tensor)
    trace_tensor_list.append(target_weight_tensor_quant)
    trace_tensor_list.append(target_input_tensor)
    trace_tensor_list.append(target_input_tensor_quant)

    outputs = sess.run(trace_tensor_list, feed_dict=feed_dict)
    if len(outputs) != len(trace_tensor_list):
      print("inference error")
      assert 0

    for tensor, data in zip(trace_tensor_list, outputs):
      print("%s\n%r" % (tensor, data))

    return outputs
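# Small illustration of the narrow_range=True choice made above, using the
# terminology from the _InsertQuantOp docstrings later in this section: narrow
# range drops the lowest code so the integer grid is symmetric around zero,
# while the wide range keeps it.
def _example_quant_ranges(bits=8):
    wide = (0, 2**bits - 1)    # [0, 255] for 8 bits
    narrow = (1, 2**bits - 1)  # [1, 255] for 8 bits
    return wide, narrow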
def testVariablesNotPartitioned_MovingAvg(self):
  # Variables added should not use a default partitioner since they are
  # scalar. There would be a tensorflow error thrown if the partitioner was
  # respected by the rewrite.
  with ops.Graph().as_default():
    with variable_scope.variable_scope(
        'part', partitioner=partitioned_variables.fixed_size_partitioner(2)):
      x = array_ops.placeholder(dtypes.float32, shape=[2])
      _ = quant_ops.MovingAvgQuantize(
          x,
          init_min=0.0,
          init_max=0.0,
          is_training=True,
          vars_collection=_MIN_MAX_VARS)
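# Why the test above passes (hedged sketch): MovingAvgQuantize tracks the
# quantization range with scalar min/max variables, and fixed_size_partitioner
# cannot split a scalar, so the rewrite must create them with the partitioner
# disabled. Listing the created variables makes this visible; the exact
# variable names are an assumption about the implementation.
def _example_list_quant_vars():
    from tensorflow.python.ops import variables
    with ops.Graph().as_default():
        x = array_ops.placeholder(dtypes.float32, shape=[2])
        _ = quant_ops.MovingAvgQuantize(x, is_training=True)
        # expect scalar range variables, e.g. 'MovingAvgQuantize/min'
        return [v.op.name for v in variables.global_variables()]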
def weights(input_size, hidden_sizes, name='mnist', reuse=None, num_bits=8):
  with tf.variable_scope(name, reuse=reuse):
    sizes = hidden_sizes + [NUM_CLASSES]
    w0 = tf.get_variable(
        'w0', (input_size, sizes[0]),
        initializer=tf.contrib.layers.xavier_initializer())
    ws = [w0] + [
        tf.get_variable('w' + str(i + 1), [sizes[i], sizes[i + 1]],
                        initializer=tf.contrib.layers.xavier_initializer())
        for i in range(len(hidden_sizes))
    ]
    weight_decay = tf.multiply(
        sum([tf.nn.l2_loss(w) for w in ws]), WEIGHT_DECAY, name='weight_loss')
    tf.add_to_collection('losses', weight_decay)
    ws = [quant_ops.MovingAvgQuantize(w, num_bits=num_bits) for w in ws]
    return ws
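# Hypothetical call for weights() above; the 784-wide input and the two hidden
# sizes are illustrative MNIST-style values, and NUM_CLASSES / WEIGHT_DECAY
# must be defined by the surrounding module.
def _example_build_quantized_weights():
    ws = weights(input_size=784, hidden_sizes=[256, 128], num_bits=8)
    return ws  # one fake-quantized weight tensor per layer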
def testMovingAvgQuantizeTrainingAssign(self):
  g = ops.Graph()
  with session.Session(graph=g) as sess:
    x = array_ops.placeholder(dtypes.float32, shape=[2])
    y = quant_ops.MovingAvgQuantize(
        x,
        init_min=0.0,
        init_max=0.0,
        is_training=True,
        vars_collection=_MIN_MAX_VARS)

    # Run the step.
    sess.run(variables.global_variables_initializer())
    # Do two runs to avoid zero debias.
    sess.run(y, feed_dict={x: [-1.0, 1.0]})
    sess.run(y, feed_dict={x: [0.0, 0.0]})

    # Now check that the min_max_vars were, in fact, updated.
    min_value, max_value = self._GetMinMaxValues(sess)
    self.assertGreater(min_value, -1.0)
    self.assertLess(min_value, 0.0)
    self.assertGreater(max_value, 0.0)
    self.assertLess(max_value, 1.0)
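# Why the assertions above expect values strictly between the two batches'
# extremes: the min/max variables hold exponential moving averages, not raw
# minima/maxima. A simplified sketch of the update rule (the real op also
# applies zero-debiasing, which is why the test runs two batches):
def _example_ema_update(old_value, batch_value, decay=0.999):
    # new EMA = decay * old EMA + (1 - decay) * newly observed batch statistic
    return decay * old_value + (1.0 - decay) * batch_value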
def _InsertQuantOp(context,
                   name,
                   producer,
                   consumers,
                   is_training,
                   moving_avg=True,
                   init_min=-6.0,
                   init_max=6.0,
                   bits=8,
                   ema_decay=0.999,
                   quant_delay=None,
                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
                   narrow_range=False):
  """Inserts a quant op between a producer op and (multiple) consumer ops.

  Args:
    context: Context where producer and consumer operations are nested.
    name: Name for the new quantization op within the context.
    producer: Producer operation of the pairs where quantization will be
      inserted.
    consumers: Consumer operations of the pairs.
    is_training: Whether quantizing training graph or eval graph.
    moving_avg: Specifies whether to use exponential moving average or just
      the last value seen.
    init_min: Starting minimum value for the new quantization op.
    init_max: Starting maximum value for the new quantization op.
    bits: Number of bits to use for quantization, must be between 2 and 8.
    ema_decay: (Optional) Float, EMA decay parameter. EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization. This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
  Raises:
    ValueError: When producer operation is not directly connected to the
      consumer operation.
  """
  name_prefix = _AddContextToName(context, name)
  # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
  # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
  # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
  # breaks things later.
  name_prefix = common.DropStringPrefix(name_prefix,
                                        ops.get_name_scope() + '/')

  inputs = producer.outputs[0]
  if moving_avg:
    quant = (
        quant_ops.MovingAvgQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            ema_decay=ema_decay,
            is_training=is_training,
            num_bits=bits,
            narrow_range=narrow_range,
            vars_collection=vars_collection,
            name_prefix=name_prefix))
  else:
    quant = (
        quant_ops.LastValueQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            is_training=is_training,
            num_bits=bits,
            narrow_range=narrow_range,
            vars_collection=vars_collection,
            name_prefix=name_prefix))

  if quant_delay and quant_delay > 0:
    activate_quant = math_ops.greater_equal(
        common.CreateOrGetQuantizationStep(),
        quant_delay,
        name=name_prefix + '/activate_quant')
    quant = control_flow_ops.cond(
        activate_quant,
        lambda: quant,
        lambda: inputs,
        name=name_prefix + '/delayed_quant')

  nodes_modified_count = graph_editor.reroute_ts(
      [quant], [inputs], can_modify=consumers)
  if nodes_modified_count != len(consumers):
    raise ValueError('Some inputs not quantized for ops: [%s]' %
                     ', '.join([consumer.name for consumer in consumers]))
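# Hedged usage sketch for _InsertQuantOp above: quantize a Conv2D's output for
# all of its consumers. The op name and quant_delay value are illustrative,
# and input_to_ops is assumed to be the same helper module used by the
# insert_quant_op snippet earlier in this section.
def _example_insert_quant_after_conv(graph):
    conv_op = graph.get_operation_by_name('net/conv1/Conv2D')
    consumers = input_to_ops.InputToOps(graph).ConsumerOperations(conv_op)
    _InsertQuantOp('net/conv1', 'act_quant', conv_op, consumers,
                   is_training=True, moving_avg=True, quant_delay=2000)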
def _InsertQuantOp(context,
                   name,
                   producer,
                   consumers,
                   is_training,
                   moving_avg=True,
                   init_min=-6.0,
                   init_max=6.0,
                   bits=8,
                   ema_decay=0.999,
                   quant_delay=None,
                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
                   narrow_range=False,
                   producer_scope=None,
                   consumer_scope=None):
  """Inserts a quant op between a producer op and (multiple) consumer ops.

  Args:
    context: Context where producer and consumer operations are nested.
    name: Name for the new quantization op within the context.
    producer: Producer operation of the pairs where quantization will be
      inserted.
    consumers: Consumer operations of the pairs.
    is_training: Whether quantizing training graph or eval graph.
    moving_avg: Specifies whether to use exponential moving average or just
      the last value seen.
    init_min: Starting minimum value for the new quantization op.
    init_max: Starting maximum value for the new quantization op.
    bits: Number of bits to use for quantization, must be between 2 and 8.
    ema_decay: (Optional) Float, EMA decay parameter. EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization. This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
    producer_scope: The restriction of producer scope. If not None, the new
      op will be inserted only when the producer is in this scope.
    consumer_scope: The restriction of consumer scope. If not None, the new
      op will be inserted only when all the consumers are in this scope.
  Raises:
    ValueError: When producer operation is not directly connected to the
      consumer operation.
  """
  if producer_scope and not producer.name.startswith(producer_scope):
    logging.info(
        '_InsertQuantOp ignores context="%s" name="%s" '
        'because producer "%s" is not in scope "%s"',
        context, name, producer.name, producer_scope)
    return

  if consumer_scope:
    consumers_in_scope = []
    for consumer in consumers:
      if consumer.name.startswith(consumer_scope):
        consumers_in_scope.append(consumer)
      else:
        logging.info(
            '_InsertQuantOp context="%s" name="%s" ignores '
            'consumer "%s" because it is not in scope "%s"',
            context, name, consumer.name, consumer_scope)
        return
    consumers = consumers_in_scope

  name_prefix = _AddContextToName(context, name)
  # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
  # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
  # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
  # breaks things later.
  name_scope = ops.get_name_scope()
  if name_scope:
    name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')

  inputs = producer.outputs[0]
  # Prevent ops from being quantized multiple times. Bypass ops can sometimes
  # overlap between multiple matches, so we need to ensure that we don't
  # add duplicate FakeQuant operations.
  fake_quant_ops = set(['FakeQuantWithMinMaxVars', 'FakeQuantWithMinMaxArgs'])
  if fake_quant_ops.intersection(set([c.type for c in inputs.consumers()])):
    return

  if moving_avg:
    quant = (
        quant_ops.MovingAvgQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            ema_decay=ema_decay,
            is_training=is_training,
            num_bits=bits,
            narrow_range=narrow_range,
            vars_collection=vars_collection,
            name_prefix=name_prefix))
  else:
    quant = (
        quant_ops.LastValueQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            is_training=is_training,
            num_bits=bits,
            narrow_range=narrow_range,
            vars_collection=vars_collection,
            name_prefix=name_prefix))

  if quant_delay and quant_delay > 0:
    activate_quant = math_ops.greater_equal(
        common.CreateOrGetQuantizationStep(),
        quant_delay,
        name=name_prefix + '/activate_quant')
    quant = control_flow_ops.cond(
        activate_quant,
        lambda: quant,
        lambda: inputs,
        name=name_prefix + '/delayed_quant')

  if consumers:
    tensors_modified_count = graph_editor.reroute_ts(
        [quant], [inputs], can_modify=consumers)
    # Some operations can have multiple output tensors going to the same
    # consumer. Since consumers is a set, we need to ensure that
    # tensors_modified_count is greater than or equal to the length of the
    # set of consumers.
    if tensors_modified_count < len(consumers):
      raise ValueError('No inputs quantized for ops: [%s]' %
                       ', '.join([consumer.name for consumer in consumers]))
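# Hedged sketch of the scope filtering this variant adds over the previous
# one: both checks are plain name-prefix tests, so a producer outside
# producer_scope skips the insertion entirely. Names are illustrative.
def _example_scope_filter():
    producer_scope = 'feature_extractor/'
    for producer_name in ('feature_extractor/conv1/Conv2D',
                          'classifier/logits/MatMul'):
        in_scope = producer_name.startswith(producer_scope)
        print(producer_name, '-> quantized' if in_scope else '-> skipped')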
def _InsertQuantOp(self,
                   context,
                   producer,
                   consumers,
                   name,
                   moving_avg=True,
                   init_min=-6.0,
                   init_max=6.0,
                   delay_requested=True,
                   bits=8,
                   narrow_range=False):
  """Inserts a quant op between a producer op and (multiple) consumer ops.

  Args:
    context: Context where producer and consumer operations are nested.
    producer: Producer operation of the pairs where quantization will be
      inserted.
    consumers: Consumer operations of the pairs.
    name: Name for the new quantization op within the context.
    moving_avg: Specifies whether to use exponential moving average or just
      the last value seen.
    init_min: Starting minimum value for the new quantization op.
    init_max: Starting maximum value for the new quantization op.
    delay_requested: If true, implement quantization delay where needed.
      False value explicitly disables delay quantization everywhere.
    bits: Number of bits to use for quantization, must be between 2 and 8.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
  Raises:
    ValueError: When producer operation is not directly connected to the
      consumer operation.
  """
  scope = context + '/' + name
  inputs = producer.outputs[0]
  if moving_avg:
    quant = (
        quant_ops.MovingAvgQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            ema_decay=self.ema_decay,
            is_training=self.is_training,
            num_bits=bits,
            narrow_range=narrow_range,
            updates_collection=_UPDATE_QUANT_OPS,
            vars_collection=self.vars_collection,
            scope=scope))
  else:
    quant = (
        quant_ops.LastValueQuantize(
            inputs,
            init_min=init_min,
            init_max=init_max,
            is_training=self.is_training,
            num_bits=bits,
            narrow_range=narrow_range,
            updates_collection=_UPDATE_QUANT_OPS,
            vars_collection=self.vars_collection,
            scope=scope))

  if delay_requested and self.quant_delay and self.quant_delay > 0:
    activate_quant = math_ops.greater_equal(
        training_util.get_or_create_global_step(),
        self.quant_delay,
        name=scope + '/activate_quant')
    quant = control_flow_ops.cond(
        activate_quant,
        lambda: quant,
        lambda: inputs,
        name=scope + '/delayed_quant')

  nodes_modified_count = graph_editor.reroute_ts(
      [quant], [inputs], can_modify=consumers)
  if nodes_modified_count != len(consumers):
    raise ValueError('Some inputs not quantized for ops: [%s]' %
                     ', '.join([consumer.name for consumer in consumers]))
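# Hedged usage note for the method variant above: it registers the range
# updates in the _UPDATE_QUANT_OPS collection rather than applying them
# inline, so a training loop is expected to run them alongside the train op.
# A minimal sketch, assuming the collection name from the snippet:
def _example_group_quant_updates(train_step):
    quant_updates = tf.get_collection(_UPDATE_QUANT_OPS)
    return tf.group(train_step, *quant_updates)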
def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
                           bottleneck_tensor_size, quantize_layer):
    """ Adds a new softmax and fully-connected layer for training. """
    with tf.name_scope('input'):
        bottleneck_input = tf.placeholder_with_default(
            bottleneck_tensor,
            shape=[None, bottleneck_tensor_size],
            name='BottleneckInputPlaceholder')
        ground_truth_input = tf.placeholder(tf.int64, [None],
                                            name='GroundTruthInput')
    # end with

    # Organizing the following ops as `final_training_ops` so they're easier
    # to see in TensorBoard
    layer_name = 'final_training_ops'
    with tf.name_scope(layer_name):
        quantized_layer_weights = None
        quantized_layer_biases = None
        with tf.name_scope('weights'):
            initial_value = tf.truncated_normal(
                [bottleneck_tensor_size, class_count], stddev=0.001)
            layer_weights = tf.Variable(initial_value, name='final_weights')
            if quantize_layer:
                quantized_layer_weights = quant_ops.MovingAvgQuantize(
                    layer_weights, is_training=True)
                attachTensorBoardSummaries(quantized_layer_weights)
            # end if

            # this comment is necessary to suppress an unnecessary PyCharm warning
            # noinspection PyTypeChecker
            attachTensorBoardSummaries(layer_weights)
        # end with

        with tf.name_scope('biases'):
            layer_biases = tf.Variable(tf.zeros([class_count]),
                                       name='final_biases')
            if quantize_layer:
                quantized_layer_biases = quant_ops.MovingAvgQuantize(
                    layer_biases, is_training=True)
                attachTensorBoardSummaries(quantized_layer_biases)
            # end if

            # this comment is necessary to suppress an unnecessary PyCharm warning
            # noinspection PyTypeChecker
            attachTensorBoardSummaries(layer_biases)
        # end with

        with tf.name_scope('Wx_plus_b'):
            if quantize_layer:
                logits = tf.matmul(
                    bottleneck_input,
                    quantized_layer_weights) + quantized_layer_biases
                logits = quant_ops.MovingAvgQuantize(
                    logits, init_min=-32.0, init_max=32.0, is_training=True,
                    num_bits=8, narrow_range=False, ema_decay=0.5)
                tf.summary.histogram('pre_activations', logits)
            else:
                logits = tf.matmul(bottleneck_input,
                                   layer_weights) + layer_biases
                tf.summary.histogram('pre_activations', logits)
            # end if
        # end with
    # end with

    final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
    tf.summary.histogram('activations', final_tensor)

    with tf.name_scope('cross_entropy'):
        cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
            labels=ground_truth_input, logits=logits)
    # end with

    tf.summary.scalar('cross_entropy', cross_entropy_mean)

    with tf.name_scope('train'):
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        train_step = optimizer.minimize(cross_entropy_mean)
    # end with

    return (train_step, cross_entropy_mean, bottleneck_input,
            ground_truth_input, final_tensor)
def add_final_training_ops(self, class_count, final_tensor_name,
                           bottleneck_tensor, bottleneck_tensor_size,
                           quantize_layer):
  """ Adds a new softmax and fully-connected layer for training.

  We need to retrain the top layer to identify our new classes, so this
  function adds the right operations to the graph, along with some variables
  to hold the weights, and then sets up all the gradients for the backward
  pass.

  The set up for the softmax and fully-connected layers is based on:
  https://www.tensorflow.org/versions/master/tutorials/mnist/beginners/index.html

  Args:
    class_count: Integer of how many categories of things we're trying to
      recognize.
    final_tensor_name: Name string for the new final node that produces
      results.
    bottleneck_tensor: The output of the main CNN graph.
    bottleneck_tensor_size: How many entries in the bottleneck vector.
    quantize_layer: Boolean, specifying whether the newly added layer should
      be quantized.

  Returns:
    The tensors for the training and cross entropy results, and tensors for
    the bottleneck input and ground truth input.
  """
  with tf.name_scope('input'):
    bottleneck_input = tf.placeholder_with_default(
        bottleneck_tensor,
        shape=[None, bottleneck_tensor_size],
        name='BottleneckInputPlaceholder')
    ground_truth_input = tf.placeholder(
        tf.int64, [None], name='GroundTruthInput')

  global_step = tf.Variable(0, trainable=False, name='global_step')

  # Organizing the following ops as `final_training_ops` so they're easier
  # to see in TensorBoard
  layer_name = 'final_training_ops'
  with tf.name_scope(layer_name):
    with tf.name_scope('weights'):
      initial_value = tf.truncated_normal(
          [bottleneck_tensor_size, class_count], stddev=0.001)
      layer_weights = tf.Variable(initial_value, name='final_weights')
      if quantize_layer:
        quantized_layer_weights = quant_ops.MovingAvgQuantize(
            layer_weights, is_training=True)
        self.variable_summaries(quantized_layer_weights)

      self.variable_summaries(layer_weights)

    with tf.name_scope('biases'):
      layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
      if quantize_layer:
        quantized_layer_biases = quant_ops.MovingAvgQuantize(
            layer_biases, is_training=True)
        self.variable_summaries(quantized_layer_biases)

      self.variable_summaries(layer_biases)

    with tf.name_scope('Wx_plus_b'):
      if quantize_layer:
        logits = tf.matmul(bottleneck_input,
                           quantized_layer_weights) + quantized_layer_biases
        logits = quant_ops.MovingAvgQuantize(
            logits,
            init_min=-32.0,
            init_max=32.0,
            is_training=True,
            num_bits=8,
            narrow_range=False,
            ema_decay=0.5)
        tf.summary.histogram('pre_activations', logits)
      else:
        logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
        tf.summary.histogram('pre_activations', logits)

  final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
  tf.summary.histogram('activations', final_tensor)

  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
        labels=ground_truth_input, logits=logits)

  tf.summary.scalar('cross_entropy', cross_entropy_mean)

  with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    train_step = optimizer.minimize(cross_entropy_mean,
                                    global_step=global_step)

  return (train_step, cross_entropy_mean, bottleneck_input,
          ground_truth_input, final_tensor, global_step)
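# Hedged sketch of driving the tensors returned above in a session. The
# `retrainer` object and the `bottlenecks` / `labels` arrays are illustrative
# assumptions standing in for the snippet's surrounding training script; the
# 5-class, 2048-wide head matches the earlier examples.
def _example_run_training_step(retrainer, bottleneck_tensor, bottlenecks, labels):
    (train_step, cross_entropy_mean, bottleneck_input, ground_truth_input,
     final_tensor, global_step) = retrainer.add_final_training_ops(
         class_count=5, final_tensor_name='final_result',
         bottleneck_tensor=bottleneck_tensor, bottleneck_tensor_size=2048,
         quantize_layer=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _, loss, step = sess.run(
            [train_step, cross_entropy_mean, global_step],
            feed_dict={bottleneck_input: bottlenecks,
                       ground_truth_input: labels})
        return loss, step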
def _InsertQuantOp(context,
                   name,
                   producer,
                   consumers,
                   is_training,
                   moving_avg=True,
                   init_min=-6.0,
                   init_max=6.0,
                   bits=8,
                   symmetric=False,
                   ema_decay=0.999,
                   quant_delay=None,
                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
                   narrow_range=False,
                   producer_scope=None,
                   consumer_scope=None):
  """Inserts a quant op between a producer op and (multiple) consumer ops.

  Args:
    context: Context where producer and consumer operations are nested.
    name: Name for the new quantization op within the context.
    producer: Producer operation of the pairs where quantization will be
      inserted.
    consumers: Consumer operations of the pairs.
    is_training: Whether quantizing training graph or eval graph.
    moving_avg: Specifies whether to use exponential moving average or just
      the last value seen.
    init_min: Starting minimum value for the new quantization op.
    init_max: Starting maximum value for the new quantization op.
    bits: Number of bits to use for quantization, must be between 2 and 8.
    symmetric: (Optional) If true, use symmetric quantization limits instead
      of training the minimum and maximum of each quantization range
      separately.
    ema_decay: (Optional) Float, EMA decay parameter. EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization. This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
    producer_scope: The restriction of producer scope. If not None, the new
      op will be inserted only when the producer is in this scope.
    consumer_scope: The restriction of consumer scope. If not None, the new
      op will be inserted only when all the consumers are in this scope.
  Raises:
    ValueError: When producer operation is not directly connected to the
      consumer operation.
  """
  if producer_scope and not producer.name.startswith(producer_scope):
    logging.info(
        '_InsertQuantOp ignores context="%s" name="%s" '
        'because producer "%s" is not in scope "%s"',
        context, name, producer.name, producer_scope)
    return

  if consumer_scope:
    consumers_in_scope = []
    for consumer in consumers:
      if consumer.name.startswith(consumer_scope):
        consumers_in_scope.append(consumer)
      else:
        logging.info(
            '_InsertQuantOp context="%s" name="%s" ignores '
            'consumer "%s" because it is not in scope "%s"',
            context, name, consumer.name, consumer_scope)
        return
    consumers = consumers_in_scope

  name_prefix = _AddContextToName(context, name)
  # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
  # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
  # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
  # breaks things later.
  name_scope = ops.get_name_scope()
  if name_scope:
    name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')

  inputs = producer.outputs[0]
  # Prevent ops from being quantized multiple times. Bypass ops can sometimes
  # overlap between multiple matches, so we need to ensure that we don't
  # add duplicate FakeQuant operations.
  fake_quant_op = _GetFollowingFakeQuantOp(inputs)

  # If we find that we are attempting to insert a fake quant op following
  # a fake quant, we skip inserting a fake quant op
  if fake_quant_op is None:
    if moving_avg:
      quant = (
          quant_ops.MovingAvgQuantize(
              inputs,
              init_min=init_min,
              init_max=init_max,
              ema_decay=ema_decay,
              is_training=is_training,
              num_bits=bits,
              symmetric=symmetric,
              narrow_range=narrow_range,
              vars_collection=vars_collection,
              name_prefix=name_prefix))
    else:
      quant = (
          quant_ops.LastValueQuantize(
              inputs,
              init_min=init_min,
              init_max=init_max,
              is_training=is_training,
              num_bits=bits,
              symmetric=symmetric,
              narrow_range=narrow_range,
              vars_collection=vars_collection,
              name_prefix=name_prefix))

    if quant_delay and quant_delay > 0:
      activate_quant = math_ops.greater_equal(
          common.CreateOrGetQuantizationStep(),
          quant_delay,
          name=name_prefix + '/activate_quant')
      quant = control_flow_ops.cond(
          activate_quant,
          lambda: quant,
          lambda: inputs,
          name=name_prefix + '/delayed_quant')
  else:
    # If a fake quant op is present already, make sure that
    # any downstream use of the tensor reroutes to the appropriate quantized
    # tensor. If there is no quant_delay, this is simply the output of the
    # fake quant op. If there is a quant delay, we reroute to the output
    # of the delayed quant operation, which inserts quantization only after
    # a specified quant_delay
    quant = fake_quant_op.outputs[0]
    if quant_delay and quant_delay > 0:
      name_prefix = '/'.join(quant.name.split('/')[:-1])
      quant = quant.graph.get_tensor_by_name(
          name_prefix + '/delayed_quant/Merge:0')
    pruned_consumer_set = set()
    for consumer in consumers:
      fake_quant_dest_op = _GetFollowingFakeQuantOp(consumer.outputs[0])
      if (fake_quant_dest_op is None or
          fake_quant_dest_op.name != fake_quant_op.name):
        pruned_consumer_set.add(consumer)
    consumers = pruned_consumer_set

    # If we have
    # input->pass_through->fake_quant
    # there is nothing to reroute.
    #
    # If we have
    # input-> pass_through->fake_quant
    #                    |-> consumer
    # Then we reroute such that:
    # input-> pass_through->fake_quant
    #                                |-> consumer
  if consumers:
    tensors_modified_count = common.RerouteTensor(
        quant, inputs, can_modify=consumers)
    # Some operations can have multiple output tensors going to the same
    # consumer. Since consumers is a set, we need to ensure that
    # tensors_modified_count is greater than or equal to the length of the
    # set of consumers.
    if tensors_modified_count < len(consumers):
      raise ValueError('No inputs quantized for ops: [%s]' %
                       ', '.join([consumer.name for consumer in consumers]))
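# Illustration of the delayed-quant lookup in the else-branch above: when a
# FakeQuant already exists and quantization is delayed, the code walks from
# the FakeQuant output's name to the sibling cond Merge tensor. The names
# below are illustrative; this is pure string manipulation, mirroring the
# snippet's name_prefix handling.
def _example_delayed_quant_name():
    quant_name = 'net/conv1/act_quant/FakeQuantWithMinMaxVars:0'
    prefix = '/'.join(quant_name.split('/')[:-1])  # 'net/conv1/act_quant'
    return prefix + '/delayed_quant/Merge:0'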