def do_quantize(self, blob, name, node=None, tensor_type='input'):
  # forward quant graph but not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    return blob

  blob_save = blob
  if isinstance(blob.values, torch.Tensor):
    blob = blob.values

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if blob.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}, device of model and data should match device of quantizer"
        .format(quant_device.type))

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    quant_data = nndct_quant.QuantizeData(name, blob.cpu().detach().numpy())

  # quantize the tensor
  bnfp = self.get_bnfp(name, True, tensor_type)
  #print('---- quant %s with 1/step = %g' % (name, bnfp[1]))
  # hardware cut method
  mth = 4 if self.lstm else 2
  if tensor_type == 'param':
    mth = 3

  res = py_nndct.nn.NndctFixNeuron(blob,
                                   blob,
                                   maxamp=[bnfp[0], bnfp[1]],
                                   method=mth)

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(
        blob.cpu().detach().numpy(), 8)
    global_snr_inv += 1 / sqnr
    print(f"quant_efficiency={quant_efficiency}, global_snr_inv={global_snr_inv} {quant_data._name}\n")

  # update param to nndct graph
  if tensor_type == 'param':
    self.update_param_to_nndct(node, name, res.cpu().detach().numpy())

  # NndctFixNeuron quantizes blob in place, so hand back the original container
  blob = blob_save
  res = blob_save
  return res
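
# A minimal, self-contained sketch of the power-of-two fake quantization that
# NndctFixNeuron applies in place: with bit width bw and fix position fp, the
# tensor is rounded to multiples of 2^-fp and clamped to the signed bw-bit
# range. Illustration only, assuming plain round-to-nearest; the real kernel
# implements the hardware rounding modes selected by `method` (2/3/4 above).
import torch

def fake_fix_neuron(x: torch.Tensor, bit_width: int, fix_pos: int) -> torch.Tensor:
  """Quantize x to bit_width-bit fixed point with fix_pos fractional bits."""
  step = 2.0 ** (-fix_pos)  # quantization step = 1 / 2^fix_pos
  qmin, qmax = -2 ** (bit_width - 1), 2 ** (bit_width - 1) - 1
  q = torch.clamp(torch.round(x / step), qmin, qmax)
  return q * step  # back to float: "fake" quantization

# Example: 8 bits with 6 fractional bits represents roughly [-2, 2).
x = torch.tensor([-2.5, -1.0, 0.013, 1.9999])
print(fake_fix_neuron(x, bit_width=8, fix_pos=6))  # tensor([-2.0000, -1.0000, 0.0156, 1.9844])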
def do_scan(self, res, name, node=None, tensor_type='input'):
  # keep quantization steps after fast finetune
  if self.keep_fp:
    return self.do_quantize(res, name, node, tensor_type)

  # forward quant graph but not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    if self.inplace:
      return res
    else:
      return res.clone().detach()

  res_save = None
  if isinstance(res.values, torch.Tensor):
    res_save = res
    res = res.values.data

  if res.dtype != torch.float32 and res.dtype != torch.double:
    NndctScreenLogger().warning_once(
        f'The tensor type of {node.name} is {str(res.dtype)}. Only support float32/double quantization.')
    return res_save if res_save is not None else res

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if res.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}, device of model and data should match device of quantizer"
        .format(quant_device.type))

  # get fixed position
  bnfp = self.get_quant_config(name, False, tensor_type)

  # hardware cut method
  mth = 4 if self.lstm else 2
  if NndctOption.nndct_use_torch_quantizer.value is True:
    mth = -1
  elif tensor_type == 'param':
    mth = 3

  scope = 5 if NndctOption.nndct_diffs_mode.value == "mse" else 1
  # set fix pos scanning scope to 1 for some type of tensors
  if node.op.type in [NNDCT_OP.INPUT, NNDCT_OP.QUANT_STUB]:
    scope = 1
  if self.lstm and tensor_type == 'input':
    scope = 1

  res = res.detach().clone()
  Tbuffer = torch.empty_like(res).to(quant_device)
  Tfixpos = torch.tensor([1], dtype=torch.get_default_dtype()).to(quant_device)

  # activation always calculates fix pos
  # calculate fix pos if it is None
  # always calculate fix pos in finetune mode
  if tensor_type != 'param' or bnfp[1] is None or self.quant_mode == 3:
    py_nndct.nn.NndctDiffsFixPos(Tinput=res,
                                 Tbuffer=Tbuffer,
                                 Tfixpos=Tfixpos,
                                 bit_width=bnfp[0],
                                 range=scope,
                                 method=mth)
    bnfp[1] = int(Tfixpos.item())
    # limit max fix pos to 12 if bit width <= 8, otherwise limit to 15
    if bnfp[0] <= 8 or self.lstm:
      max_fp = NndctOption.nndct_max_fix_position.value
      bnfp[1] = min(max_fp, bnfp[1])
    else:
      bnfp[1] = min(15, bnfp[1])
    # record fix pos of activation
    if tensor_type != 'param':
      self.config_history[tensor_type][name].append(bnfp[1])
      if NndctOption.nndct_stat.value > 1:
        print(f'---- fp history: {stats.mode(np.array(self.config_history[tensor_type][name]))}')
      data = np.array(self.config_history[tensor_type][name])
      bnfp[1] = stats.mode(data)[0][0]
      bnfp[1] = bnfp[1].astype(np.int32).tolist()
    self.set_quant_config(name, bnfp, tensor_type)
    if NndctOption.nndct_stat.value > 1:
      print('---- quant %s tensor: %s with bw = %d and fp = %g' %
            (tensor_type, name, bnfp[0], bnfp[1]))

  # get 2^bit_width and 2^fracpos
  bnfp = self.get_quant_config(name, True, tensor_type)

  if NndctOption.nndct_stat.value > 2:
    quant_data = nndct_quant.QuantizeData(name, res.cpu().detach().numpy())

  # do quantization for parameter or activation
  res = fake_quantize_per_tensor(res, bnfp[1], 0, -bnfp[0], bnfp[0] - 1, mth, self.inplace)

  if NndctOption.nndct_stat.value > 2:
    #quant_data.all_close(res.cpu().detach().numpy())
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(
        res.cpu().detach().numpy(), math.log2(bnfp[0]))
    global_snr_inv += 1 / sqnr
    if quant_efficiency < 3.0:
      print(f"quant_efficiency={quant_efficiency}, {quant_data._name}\n")
      print('Statistic [Min, Max, Mean, Std]:')
      print('[{}, {}, {}, {}]'.format(res.min(), res.max(), res.mean(), res.std()))
      print('histogram: {}'.format(res.histc(bins=10).cpu().detach().numpy()))
      t = res
      if tensor_type != 'param':
        t = res.transpose(0, 1)
      print('Channel number:{}'.format(t.shape[0]))
      print('Channel-wise statistic [Min, Max, Mean, Std]:')
      for c in range(t.shape[0]):
        print('[{}, {}, {}, {}]'.format(t[c].min(), t[c].max(), t[c].mean(), t[c].std()))
        print('histogram: {}'.format(t[c].histc(bins=10).cpu().detach().numpy()))

  if res_save is not None:
    res_save.values.data = res
    res = res_save

  return res
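
# A rough sketch of what the NndctDiffsFixPos kernel computes: starting from
# the largest fix position whose range still covers the tensor's max
# magnitude, scan `scope` candidate positions and keep the one with the
# smallest quantization error. This is my reading of the diffs search, not
# the kernel's exact algorithm; rounding, search order and tie-breaking in
# the CPU/CUDA implementation may differ.
import math
import torch

def search_fix_pos(x: torch.Tensor, bit_width: int = 8, scope: int = 5) -> int:
  amax = x.abs().max().item()
  if amax == 0.0:
    return 0
  # Largest fix_pos such that 2^(bit_width - 1 - fix_pos) still covers amax.
  base = bit_width - 1 - math.ceil(math.log2(amax))
  best_fp, best_err = base, float('inf')
  for fp in range(base, base + scope):  # larger fp: finer steps, more clipping
    step = 2.0 ** (-fp)
    xq = torch.clamp(torch.round(x / step),
                     -2 ** (bit_width - 1),
                     2 ** (bit_width - 1) - 1) * step
    err = (x - xq).pow(2).sum().item()  # squared error, as in "mse" diffs mode
    if err < best_err:
      best_fp, best_err = fp, err
  return best_fp

print(search_fix_pos(torch.randn(1024)))  # typically 5 or 6 for unit-scale data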
def do_quantize(self, blob, name, node=None, tensor_type='input'):
  # forward quant graph but not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    if self.inplace:
      return blob
    else:
      return blob.clone().detach()

  blob_save = None
  if isinstance(blob.values, torch.Tensor):
    blob_save = blob
    blob = blob.values.data

  if blob.dtype != torch.float32 and blob.dtype != torch.double:
    NndctScreenLogger().warning_once(
        f'The tensor type of {node.name} is {str(blob.dtype)}. Only support float32/double quantization.')
    return blob_save if blob_save is not None else blob

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if blob.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}, device of model and data should match device of quantizer"
        .format(quant_device.type))

  if NndctOption.nndct_stat.value > 2:
    quant_data = nndct_quant.QuantizeData(name, blob.cpu().detach().numpy())

  # quantize the tensor
  bnfp = self.get_quant_config(name, True, tensor_type)
  if NndctOption.nndct_stat.value > 1:
    print('---- quant %s tensor: %s with 1/step = %g' % (tensor_type, name, bnfp[1]))

  # hardware cut method
  mth = 4 if self.lstm else 2
  if NndctOption.nndct_use_torch_quantizer.value is True:
    mth = -1
  elif tensor_type == 'param':
    mth = 3

  res = fake_quantize_per_tensor(blob, bnfp[1], 0, -bnfp[0], bnfp[0] - 1, mth, self.inplace)

  if NndctOption.nndct_stat.value > 2:
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(
        res.cpu().detach().numpy(), 8)
    global_snr_inv += 1 / sqnr
    if quant_efficiency < 3.0:
      print(f"quant_efficiency={quant_efficiency}, global_snr_inv={global_snr_inv} {quant_data._name}\n")
      print('Network input channel-wise statistic [Min, Max, Mean, Std]:')
      print('[{}, {}, {}, {}]'.format(res.min(), res.max(), res.mean(), res.std()))
      print('histogram: {}'.format(res.histc(bins=10).cpu().detach().numpy()))
      t = res
      if tensor_type != 'param':
        t = res.transpose(0, 1)
      print('Channel number:{}'.format(t.shape[0]))
      print('Channel-wise statistic [Min, Max, Mean, Std]:')
      for c in range(t.shape[0]):
        print('[{}, {}, {}, {}]'.format(t[c].min(), t[c].max(), t[c].mean(), t[c].std()))
        print('histogram: {}'.format(t[c].histc(bins=10).cpu().detach().numpy()))

  # update param to nndct graph
  if tensor_type == 'param' and not self.exporting:
    self.update_param_to_nndct(node, name, res.cpu().detach().numpy())

  if blob_save is not None:
    blob_save.values.data = res
    res = blob_save

  return res
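
# When NndctOption.nndct_use_torch_quantizer is set (method -1), the same
# fake quantization can be expressed with PyTorch's built-in op. A minimal
# sketch of that correspondence, assuming bnfp holds [2^(bw-1), 2^fix_pos]
# consistent with the clamp bounds [-bnfp[0], bnfp[0] - 1] used above; note
# that torch expects the step size, i.e. the reciprocal of bnfp[1].
import torch

bw, fix_pos = 8, 6
bnfp = [2 ** (bw - 1), 2.0 ** fix_pos]  # [128, 64.0]
x = torch.randn(16)

y = torch.fake_quantize_per_tensor_affine(
    x,
    scale=1.0 / bnfp[1],    # step = 2^-fix_pos
    zero_point=0,
    quant_min=-bnfp[0],     # -2^(bw-1)
    quant_max=bnfp[0] - 1)  # 2^(bw-1) - 1
print(y)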
def do_scan(self, res, name, node=None, tensor_type='input'):
  # forward quant graph but not quantize parameter and activation
  if NndctOption.nndct_quant_off.value:
    return res

  res_save = res
  if isinstance(res.values, torch.Tensor):
    res = res.values

  quant_device = GLOBAL_MAP.get_ele(NNDCT_KEYS.QUANT_DEVICE)
  if res.device.type != quant_device.type:
    raise TypeError(
        "Device of quantizer is {}, device of model and data should match device of quantizer"
        .format(quant_device.type))

  # hardware cut method
  mth = 4 if self.lstm else 2
  if tensor_type == 'param':
    mth = 3

  scan_range = 5
  # set fix pos scanning range to 1 for some type of tensors
  if ((node.op.type in [NNDCT_OP.INPUT, NNDCT_OP.QUANT_STUB]) or
      (self.lstm and tensor_type == 'input')):
    scan_range = 1

  # get fixed position
  bnfp = self.get_bnfp(name, False, tensor_type)

  #if (res.device == torch.device("cpu")):
  if quant_device.type == "cpu":
    Tbuffer = torch.empty_like(res).to(torch.device("cpu"))
    Tfixpos = torch.tensor([1], dtype=torch.get_default_dtype()).to(torch.device("cpu"))
  else:
    Tbuffer = torch.empty_like(res).cuda()
    Tfixpos = torch.tensor([1], dtype=torch.get_default_dtype()).cuda()

  # activation always calculates fix pos
  # calculate fix pos if it is None
  # always calculate fix pos in finetune mode
  if tensor_type != 'param' or bnfp[1] is None or self.quant_mode == 3:
    py_nndct.nn.NndctDiffsFixPos(Tinput=res,
                                 Tbuffer=Tbuffer,
                                 Tfixpos=Tfixpos,
                                 bit_width=bnfp[0],
                                 range=scan_range,
                                 method=mth)
    bnfp[1] = int(Tfixpos.item())
    # record fix pos of activation
    if tensor_type != 'param':
      self.fp_history[tensor_type][name].append(bnfp[1])
      data = np.array(self.fp_history[tensor_type][name])
      bnfp[1] = stats.mode(data)[0][0]
      bnfp[1] = bnfp[1].astype(np.int32).tolist()
    self.set_bnfp(name, bnfp, tensor_type)

  #print('---- quant %s with bw = %d and fp = %g' % (name, bnfp[0], bnfp[1]))
  # get 2^bit_width and 2^fracpos
  bnfp = self.get_bnfp(name, True, tensor_type)

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    #if tensor_type == "param":
    quant_data = nndct_quant.QuantizeData(name, res.cpu().detach().numpy())

  #print('---- quant %s with bw = %d and 1/step = %g' % (name, bnfp[0], bnfp[1]))
  # do quantization for parameter or activation
  res = py_nndct.nn.NndctFixNeuron(res,
                                   res,
                                   maxamp=[bnfp[0], bnfp[1]],
                                   method=mth)

  if (NndctOption.nndct_quant_opt.value and
      NndctOption.nndct_logging_level.value > 0):
    #if tensor_type == "param":
    global global_snr_inv
    quant_efficiency, sqnr = quant_data.quant_efficiency(
        res.cpu().detach().numpy(), 8)
    global_snr_inv += 1 / sqnr
    print(f"quant_efficiency={quant_efficiency}, {quant_data._name}\n")
    #print(f"quant_efficiency={quant_efficiency}, global_snr_inv={global_snr_inv} {quant_data._name}\n")

  res = res_save
  return res
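
# Note on the `stats.mode(data)[0][0]` pattern above: it relies on the
# pre-1.11 SciPy behavior where stats.mode returned arrays (keepdims=True by
# default); on newer SciPy the double indexing fails. A small version-robust
# sketch of the same "most frequent fix position" reduction using NumPy only:
import numpy as np

def mode_fix_pos(history):
  """Return the most frequent fix position recorded across calibration batches."""
  values, counts = np.unique(np.asarray(history, dtype=np.int32), return_counts=True)
  return int(values[np.argmax(counts)])  # ties resolve to the smallest value

print(mode_fix_pos([6, 7, 6, 6, 5]))  # -> 6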