def LastValueLogThQuantize(inputs,
                           log_th_var,
                           min_var,
                           max_var,
                           bit_width,
                           is_training,
                           mode,
                           name_scope="LastValueLogThQuantize"):
    """Last value power of 2 quantize op with log threshold.

    Args:
      inputs: Input values.
      log_th_var: Variable holding the log threshold passed to
        `fake_quantize_with_log_th`.
      min_var: Variable that receives the min of `inputs` from `get_min_max`.
      max_var: Variable that receives the max of `inputs` from `get_min_max`.
      bit_width: Bit width forwarded to `fake_quantize_with_log_th`.
      is_training: Whether in training phase.
      mode: Quantization mode string; 'ANALYSE' and 'QCB' are handled
        specially here.
      name_scope: Name scope under which the ops are created.

    Returns:
      `inputs` passed through `tf.identity` when `mode == 'ANALYSE'`,
      otherwise the result of `fake_quantize_with_log_th`.
    """
    with tf.name_scope(name_scope):
        # ANALYSE, training and calibration ('QCB') all record the current
        # min/max of the inputs; plain evaluation does not.
        if mode == 'ANALYSE' or is_training or mode == 'QCB':
            observed_min, observed_max = get_min_max(inputs)
            # The assign results are not wired into the returned tensor.
            tf_compat.assign(min_var, observed_min, name='assign_min')
            tf_compat.assign(max_var, observed_max, name='assign_max')
            if mode == 'ANALYSE':
                # ANALYSE only observes the range; values pass through.
                return tf.identity(inputs, name='identity')
        return fake_quantize_with_log_th(inputs, log_th_var, bit_width)
def LastValueQuantPosQuantize(inputs,
                              quant_pos_var,
                              min_var,
                              max_var,
                              bit_width,
                              method,
                              is_training,
                              mode,
                              round_mode,
                              name_scope="LastValueQuantPosQuantize"):
    """Last value power of 2 quantize op with quantize position.

    Args:
      inputs: Input values.
      quant_pos_var: Variable holding the quantize position.
      min_var: Variable that receives the min of `inputs` from `get_min_max`.
      max_var: Variable that receives the max of `inputs` from `get_min_max`.
      bit_width: Bit width forwarded to the fake-quantize kernels.
      method: Forwarded to `get_quantize_pos`.
      is_training: Whether in training phase.
      mode: Quantization mode string; 'ANALYSE' and 'QCB' are handled
        specially here.
      round_mode: 0 selects `fake_quantize_with_quantize_pos_std`, 1 selects
        `fake_quantize_with_quantize_pos_dpu`.
      name_scope: Name scope under which the ops are created.

    Returns:
      `inputs` passed through `tf.identity` when `mode == 'ANALYSE'`,
      otherwise the fake-quantized inputs.

    Raises:
      ValueError: If quantization is applied and `round_mode` is not 0 or 1.
    """

    def _fake_quantize(position):
        # Dispatch on the rounding mode; shared by training and evaluation.
        if round_mode == 0:
            return fake_quantize_with_quantize_pos_std(inputs, position,
                                                       bit_width)
        if round_mode == 1:
            return fake_quantize_with_quantize_pos_dpu(inputs, position,
                                                       bit_width)
        raise ValueError('Invalid round mode: {}'.format(round_mode))

    with tf.name_scope(name_scope):
        if mode == 'ANALYSE' or is_training or mode == 'QCB':
            # Record the min/max of the current inputs.
            observed_min, observed_max = get_min_max(inputs)
            min_update = tf_compat.assign(min_var, observed_min,
                                          name='assign_min')
            max_update = tf_compat.assign(max_var, observed_max,
                                          name='assign_max')
            if mode == 'ANALYSE':
                # ANALYSE only observes the range; values pass through.
                return tf.identity(inputs, name='identity')

            # Training and calibration: refresh the quantize position from the
            # freshly assigned range and quantize with the assigned value.
            new_position = get_quantize_pos(inputs, min_update, max_update,
                                            bit_width, method)
            position_update = tf_compat.assign(quant_pos_var, new_position,
                                               name="assign_quantize_pos")
            return _fake_quantize(position_update)

        # Evaluation: quantize with the stored position.
        return _fake_quantize(quant_pos_var)
def update():
    """Assign a new threshold and mask to every pruning variable triple.

    Returns:
      The `tf.group` of all threshold and mask assignments.
    """
    pending = []
    for weight, mask, threshold in self._pruning_vars:
        fresh_threshold, fresh_mask = self._maybe_update_block_mask(weight)
        # Threshold assignment first, then mask, for each weight.
        pending.extend((
            tf_compat.assign(threshold, fresh_threshold),
            tf_compat.assign(mask, fresh_mask),
        ))
    return tf.group(pending)
def LastValueMinMaxQuantize(inputs,
                            min_var,
                            max_var,
                            bit_width,
                            is_training,
                            mode,
                            name_scope="LastValueMinMaxQuantize"):
    """Last value float scale quantize op.

    Args:
      inputs: Input values.
      min_var: Variable that receives the min of `inputs` from `get_min_max`.
      max_var: Variable that receives the max of `inputs` from `get_min_max`.
      bit_width: Bit width forwarded to `fake_quantize_with_min_max`.
      is_training: Whether in training phase.
      mode: Quantization mode string; 'ANALYSE' and 'QCB' are handled
        specially here.
      name_scope: Name scope under which the ops are created.

    Returns:
      `inputs` passed through `tf.identity` when `mode == 'ANALYSE'`,
      otherwise the result of `fake_quantize_with_min_max`.
    """
    with tf.name_scope(name_scope):
        if mode == 'ANALYSE' or is_training or mode == 'QCB':
            # Record the min/max of the current inputs.
            observed_min, observed_max = get_min_max(inputs)
            min_update = tf_compat.assign(min_var, observed_min,
                                          name='assign_min')
            max_update = tf_compat.assign(max_var, observed_max,
                                          name='assign_max')
            if mode == 'ANALYSE':
                # ANALYSE only observes the range; values pass through.
                return tf.identity(inputs, name='identity')
            # Training and calibration quantize with the just-assigned range.
            return fake_quantize_with_min_max(inputs, min_update, max_update,
                                              bit_width)

        # Evaluation quantizes with the stored range.
        return fake_quantize_with_min_max(inputs, min_var, max_var, bit_width)
def _weight_assign_objs(self):
    """Gather the assign objs for assigning weights <= weights * mask.

    The objs are ops for graph execution and tensors for eager execution.

    Returns:
      A list of objs for weight assignment. Inside a replica context the list
      holds at most one entry (the result of `merge_call`); otherwise it holds
      one assignment per pruning variable.
    """

    def update_fn(distribution, values_and_vars):
        # TODO(yunluli): Need this ReduceOp because the weight is created by
        # the layer wrapped, so we don't have control of its aggregation
        # policy. May be able to optimize this when distribution strategy
        # supports easier update to mirrored variables in replica context.
        reduced_values = distribution.extended.batch_reduce_to(
            tf.distribute.ReduceOp.MEAN, values_and_vars)
        target_vars = [var for _, var in values_and_vars]

        def update_var(variable, reduced_value):
            return tf_compat.assign(variable, reduced_value)

        return tf.group([
            distribution.extended.update(var, update_var, args=(value,))
            for value, var in zip(reduced_values, target_vars)
        ])

    replica_context = tf.distribute.get_replica_context()
    if not replica_context:
        # No replica context: assign the masked weight directly.
        return [
            tf_compat.assign(weight, tf.math.multiply(weight, mask))
            for weight, mask, _ in self._pruning_vars
        ]

    # Replica context: pair each masked weight with its variable and apply
    # the update through merge_call.
    masked_pairs = [(tf.math.multiply(weight, mask), weight)
                    for weight, mask, _ in self._pruning_vars]
    if not masked_pairs:
        return []
    return [replica_context.merge_call(update_fn, args=(masked_pairs,))]
def increment_step():
    """Increment `self.pruning_step` by one.

    Returns:
      A no-op named 'update' that has the increment as a control dependency.
    """
    bump_step = tf_compat.assign(self.pruning_step, self.pruning_step + 1)
    with tf.control_dependencies([bump_step]):
        return tf.no_op('update')
def increment_step():
    """Increment `self.pruning_step` by one and return the assignment."""
    next_step = self.pruning_step + 1
    return tf_compat.assign(self.pruning_step, next_step)
def update(var, value):
    """Assign `value` to `var` and return the result of the assignment."""
    assigned = tf_compat.assign(var, value)
    return assigned
def update_var(variable, reduced_value):
    """Assign `reduced_value` to `variable` and return the assignment."""
    assigned = tf_compat.assign(variable, reduced_value)
    return assigned
def TQTQuantize(inputs,
                log_th_var,
                min_var,
                max_var,
                bit_width,
                method,
                round_mode,
                mode,
                is_training,
                symmetry,
                per_channel,
                channel_axis,
                narrow_range=False,
                name_scope="TQTQuantize"):
    """Power-of-2 quantize op with log threshold.

    Args:
      inputs: Input values.
      log_th_var: Variable of log threshold.
      min_var: Variable of minimum value of inputs.
      max_var: Variable of maximum value of inputs.
      bit_width: Int, bit width of quantized values.
      method: Int Enum, method of how to get the initial log threshold,
        0 for non_overflow.
      round_mode: Int, the mode of rounding function, 0 for HALF_TO_EVEN,
        1 for HALF_UP, 2 for HALF_AWAY_FROM_ZERO.
      mode: String, the mode of quantization, available modes are
        ['ANALYSE', 'QCB', 'QCBEV', 'QAT'].
      is_training: Bool, whether in training phase.
      symmetry: Bool, whether to apply symmetry quantization.
      per_channel: Bool, whether to apply per_channel quantization.
      channel_axis: The axis of the channel, used with per_channel enabled.
        The last dimension is regarded as channel axis and other dimensions
        will be reduced by default.
      narrow_range: Bool, whether to use the narrow quantization range
        [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
      name_scope: Name scope under which the ops are created.

    Returns:
      Quantized inputs, or `inputs` passed through `tf.identity` when
      `mode == 'ANALYSE'`.
    """
    with tf.name_scope(name_scope):
        reduce_dims = (
            convert_channel_axis_to_reduce_dims(len(inputs.get_shape()),
                                                channel_axis)
            if per_channel else None)

        # Range options shared by the kernel and the min/max computation.
        range_options = dict(
            symmetry=symmetry,
            per_channel=per_channel,
            narrow_range=narrow_range,
            reduce_dims=reduce_dims)
        kernel = TQTFakeQuantize(
            bit_width=bit_width,
            method=method,
            round_mode=round_mode,
            **range_options)

        if mode == 'ANALYSE' or is_training or mode == 'QCB':
            # ANALYSE, training and calibration all record the current range.
            observed_min, observed_max = get_min_max(inputs, bit_width,
                                                     **range_options)
            min_update = tf_compat.assign(min_var, observed_min,
                                          name='assign_min')
            max_update = tf_compat.assign(max_var, observed_max,
                                          name='assign_max')

            if mode == 'ANALYSE':
                # ANALYSE only observes the range; values pass through.
                return tf.identity(inputs, name='identity')

            if mode != 'QCB':
                # Training outside calibration uses the stored log threshold.
                return kernel.call(inputs, log_th_var, min_update, max_update)

            # Calibration derives the log threshold from the new range.
            new_log_th = kernel.get_log_th(inputs, min_update, max_update)
            log_th_update = tf_compat.assign(
                log_th_var, new_log_th, name="assign_log_th")
            return kernel.call(inputs, log_th_update, min_update, max_update)

        # Evaluation branch: everything comes from the stored variables.
        return kernel.call(inputs, log_th_var, min_var, max_var)
def FSQuantize(
    inputs,
    min_var,
    max_var,
    calib_hist,
    calib_bin_edges,
    bit_width,
    method,
    round_mode,
    mode,
    is_training,
    symmetry,
    per_channel,
    channel_axis,
    use_framework_quant=True,
    narrow_range=False,
    name_scope="FSQuantize",
):
    """Float scale quantize op.

    Args:
      inputs: Input values.
      min_var: Variable of minimum value of inputs.
      max_var: Variable of maximum value of inputs.
      calib_hist: Variable of histogram of inputs.
      calib_bin_edges: Variable of linspace of inputs.
      bit_width: Int, bit width of quantized values.
      method: Quantize method. It is converted with `QuantizeMethod(method)`
        in the training/calibration branch, where NON_OVERFLOW, MIN_MSE,
        MIN_KL and PERCENTILE are handled.
      round_mode: Int, the mode of rounding function, 0 for HALF_TO_EVEN,
        1 for HALF_UP, 2 for HALF_AWAY_FROM_ZERO.
      mode: String, the mode of quantization, available modes are
        ['ANALYSE', 'QCB', 'QCBEV', 'QAT'].
      is_training: Bool, whether in training phase.
      symmetry: Bool, whether to apply symmetry quantization.
      per_channel: Bool, whether to apply per_channel quantization.
      channel_axis: The axis of the channel, used with per_channel enabled.
        The last dimension is regarded as channel axis and other dimensions
        will be reduced by default.
      use_framework_quant: Bool, whether to use the tensorflow fake_quantize
        operations. If not, the custom quantize kernel will be used.
      narrow_range: Bool, whether to use the narrow quantization range
        [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
      name_scope: Name scope under which the ops are created.

    Returns:
      Quantized inputs. `inputs` is passed through `tf.identity` instead when
      `mode == 'ANALYSE'`, when the method is MIN_KL or PERCENTILE during
      training/calibration, or when the method is not recognised.
    """
    with tf.name_scope(name_scope):
        reduce_dims = (
            convert_channel_axis_to_reduce_dims(len(inputs.get_shape()),
                                                channel_axis)
            if per_channel else None)

        # Range options shared by the kernel and the min/max computation.
        range_options = dict(
            symmetry=symmetry,
            per_channel=per_channel,
            narrow_range=narrow_range,
            reduce_dims=reduce_dims)
        kernel = FSFakeQuantize(
            bit_width=bit_width,
            round_mode=round_mode,
            use_framework_quant=use_framework_quant,
            **range_options)

        # ANALYSE branch: record the range of this batch, pass values through.
        if mode == 'ANALYSE':
            observed_min, observed_max = get_min_max(inputs, bit_width, method,
                                                     **range_options)
            tf_compat.assign(min_var, observed_min, name='assign_min')
            tf_compat.assign(max_var, observed_max, name='assign_max')
            return tf.identity(inputs, name='identity')

        # Evaluation branch: quantize with the stored range.
        if not (is_training or mode == 'QCB'):
            return kernel.call(inputs, min_var, max_var)

        # Training and calibration branch.
        method = QuantizeMethod(method)

        if method in (QuantizeMethod.NON_OVERFLOW, QuantizeMethod.MIN_MSE):
            observed_min, observed_max = get_min_max(inputs, bit_width, method,
                                                     **range_options)
            # Widen the stored range so it covers every batch seen so far.
            widened_min = tf.math.minimum(min_var, observed_min)
            widened_max = tf.math.maximum(max_var, observed_max)
            min_update = tf_compat.assign(min_var, widened_min,
                                          name='assign_min')
            max_update = tf_compat.assign(max_var, widened_max,
                                          name='assign_max')
            return kernel.call(inputs, min_update, max_update)

        if method in (QuantizeMethod.MIN_KL, QuantizeMethod.PERCENTILE):
            # Histogram-based methods only collect statistics here; the
            # inputs are returned unquantized.
            new_hist, new_bin_edges = calibrator_numpy.numpy_collect(
                inputs, calib_hist, calib_bin_edges)
            tf_compat.assign(calib_hist, new_hist, name='calib_hist')
            tf_compat.assign(
                calib_bin_edges, new_bin_edges, name='calib_bin_edges')
            return tf.identity(inputs, name='identity')

        logger.error('Invalid method: {}'.format(method))
        return tf.identity(inputs, name='identity')
def AllValuesQuantize(inputs,
                      min_var,
                      max_var,
                      name_prefix='AllValuesQuantize',
                      is_training=True,
                      num_bits=8,
                      narrow_range=False,
                      symmetric=False):
    """Adds a layer that collects quantization ranges as min/max of tensor values.

    During training the range stored in `min_var`/`max_var` is widened to
    cover the current batch (and 0.0) before the inputs are fake-quantized
    with it. Outside training the stored range is used unchanged.

    Args:
      inputs: a tensor containing values to be quantized.
      min_var: Variable which stores the min value of tensor.
      max_var: Variable which stores the max value of tensor.
      name_prefix: name_prefix for created nodes.
      is_training: Whether the op is applied to a training or eval graph.
      num_bits: Number of bits to use for quantization, must be between 2
        and 8.
      narrow_range: Whether to use the narrow quantization range
        [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
      symmetric: If true, use symmetric quantization limits instead of
        training the minimum and maximum of each quantization range
        separately.

    Returns:
      a tensor containing quantized values.
    """
    quant_options = dict(
        per_channel=False, num_bits=num_bits, narrow_range=narrow_range)

    with tf.name_scope(name_prefix):
        if not is_training:
            return _FakeQuantWithMinMaxVars(inputs, min_var, max_var,
                                            **quant_options)

        batch_min = tf.math.reduce_min(inputs, name='BatchMin')
        batch_max = tf.math.reduce_max(inputs, name='BatchMax')

        if symmetric:
            # In two's complement notation, the negative range is slightly
            # larger than the positive range, unless narrow_range is used.
            min_max_ratio = (-1 if narrow_range else
                             -((1 << num_bits) - 2) / (1 << num_bits))

            # TFLite requires that 0.0 is always in the [min; max] range.
            # Because batch_min <= batch_max, it follows that
            # range_min <= 0 <= range_max.
            batch_min = tf.math.minimum(batch_min, batch_max / min_max_ratio)
            batch_max = tf.math.maximum(batch_max, batch_min * min_max_ratio)

        # Widen the stored range with this batch, then force it to include
        # 0.0 because TFLite requires that 0.0 is always in [min; max].
        widened_min = tf.math.minimum(min_var, batch_min)
        range_min = tf.math.minimum(widened_min, 0.0)
        widened_max = tf.math.maximum(max_var, batch_max)
        range_max = tf.math.maximum(widened_max, 0.0)

        min_update = tf_compat.assign(min_var, range_min,
                                      name='AssignMinAllValue')
        max_update = tf_compat.assign(max_var, range_max,
                                      name='AssignMaxAllValue')

        return _FakeQuantWithMinMaxVars(inputs, min_update, max_update,
                                        **quant_options)
def LastValueQuantize(inputs,
                      min_var,
                      max_var,
                      per_channel=False,
                      name_prefix='LastValueQuant',
                      is_training=True,
                      num_bits=8,
                      narrow_range=False,
                      symmetric=False):
    """Adds a layer that collects quantization ranges as last input ranges.

    During training the range of the current batch is written to
    `min_var`/`max_var` and the inputs are fake-quantized with it. Outside
    training the stored range is used unchanged.

    Args:
      inputs: a tensor containing values to be quantized.
      min_var: Variable which stores the min value of the range.
      max_var: Variable which stores the max value of the range.
      per_channel: (Optional) a boolean specifying whether to use different
        quantization ranges per output channel. The last dimension of
        `inputs` is treated as the channel dimension.
      name_prefix: name_prefix for created nodes.
      is_training: Whether the op is applied to a training or eval graph.
      num_bits: Number of bits to use for quantization, must be between 2
        and 8.
      narrow_range: Whether to use the narrow quantization range
        [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
      symmetric: If true, use symmetric quantization limits instead of
        training the minimum and maximum of each quantization range
        separately.

    Returns:
      a tensor containing quantized values.
    """
    with tf.name_scope(name_prefix):
        input_dim = len(inputs.get_shape())

        if not is_training:
            return _FakeQuantWithMinMaxVars(inputs,
                                            min_var,
                                            max_var,
                                            per_channel=per_channel,
                                            num_bits=num_bits,
                                            narrow_range=narrow_range)

        if per_channel and input_dim >= 2:
            # Reduce over every axis except the last (channel) one. This
            # yields [0] for rank 2 and [0, 1, 2] for rank 4, and also covers
            # other ranks, for which no reduction axes used to be defined.
            reduce_dims = list(range(input_dim - 1))
            batch_min = tf.math.reduce_min(inputs,
                                           axis=reduce_dims,
                                           name='BatchMin')
            batch_max = tf.math.reduce_max(inputs,
                                           axis=reduce_dims,
                                           name='BatchMax')
        elif per_channel:
            # Rank below 2: nothing to reduce, the inputs are used directly
            # as the per-channel min and max.
            batch_min = inputs
            batch_max = inputs
        else:
            batch_min = tf.math.reduce_min(inputs, name='BatchMin')
            batch_max = tf.math.reduce_max(inputs, name='BatchMax')

        if symmetric:
            if narrow_range:
                min_max_ratio = -1
            else:
                # In two's complement notation, the negative range is slightly
                # larger than the positive range.
                min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits)

            # TFLite requires that 0.0 is always in the [min; max] range.
            # Because batch_min <= batch_max, it follows that
            # range_min <= 0 <= range_max.
            range_min = tf.math.minimum(batch_min, batch_max / min_max_ratio)
            range_max = tf.math.maximum(batch_max, batch_min * min_max_ratio)
        else:
            # TFLite requires that 0.0 is always in the [min; max] range.
            range_min = tf.math.minimum(batch_min, 0.0)
            range_max = tf.math.maximum(batch_max, 0.0)

        assign_min = tf_compat.assign(min_var, range_min, name='AssignMinLast')
        assign_max = tf_compat.assign(max_var, range_max, name='AssignMaxLast')

        return _FakeQuantWithMinMaxVars(inputs,
                                        assign_min,
                                        assign_max,
                                        per_channel=per_channel,
                                        num_bits=num_bits,
                                        narrow_range=narrow_range)