def __init__(self, config):
    super().__init__()
    if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (config.hidden_size, config.num_attention_heads))
    self.output_attentions = config.output_attentions

    self.num_attention_heads = config.num_attention_heads
    self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
    self.all_head_size = self.num_attention_heads * self.attention_head_size

    # Quantized implementations of torch.nn.Linear modules
    self.query = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
    self.key = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
    self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)

    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    # Additional quantizers that will be needed to quantize the inputs to the
    # torch.matmul() operation in the forward method. Since it's a simple
    # operation and no quantized version of it exists, the inputs to this
    # operation can be manually quantized to realize a quantized mat-mul.
    self.matmul_q_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
    self.matmul_k_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
    self.matmul_v_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
    self.matmul_a_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
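# A minimal sketch (not from the original snippet) of how the four matmul input
# quantizers above would be used in forward(): each operand of torch.matmul()
# is passed through its TensorQuantizer first, so the mat-mul runs on
# fake-quantized tensors. `transpose_for_scores` and the overall layout follow
# the usual BERT self-attention convention and are assumptions here; assumes
# `import math` and `import torch`.
def forward(self, hidden_states):
    query_layer = self.transpose_for_scores(self.query(hidden_states))
    key_layer = self.transpose_for_scores(self.key(hidden_states))
    value_layer = self.transpose_for_scores(self.value(hidden_states))

    # Quantize both operands of Q @ K^T before the first matmul.
    attention_scores = torch.matmul(
        self.matmul_q_input_quantizer(query_layer),
        self.matmul_k_input_quantizer(key_layer.transpose(-1, -2)))
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
    attention_probs = self.dropout(nn.Softmax(dim=-1)(attention_scores))

    # Quantize the attention probabilities and values before the second matmul.
    context_layer = torch.matmul(
        self.matmul_a_input_quantizer(attention_probs),
        self.matmul_v_input_quantizer(value_layer))
    return context_layer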
def __init__(
    self,
    channels: int,
    reduction_ratio: int,
    context_window: int = -1,
    interpolation_mode: str = 'nearest',
    activation: Optional[Callable] = None,
    quantize: bool = False,
):
    """
    Squeeze-and-Excitation sub-module.

    Args:
        channels: Input number of channels.
        reduction_ratio: Reduction ratio for "squeeze" layer.
        context_window: Integer number of timesteps that the context
            should be computed over, using stride 1 average pooling.
            If value < 1, then global context is computed.
        interpolation_mode: Interpolation mode of timestep dimension.
            Used only if context window is > 1.
            The modes available for resizing are: `nearest`, `linear` (3D-only),
            `bilinear`, `area`.
        activation: Intermediate activation function used. Must be a
            callable activation function.
        quantize: Whether to build the "squeeze"/"excite" projections from
            `quant_nn.QuantLinear` instead of `nn.Linear`. Requires
            pytorch-quantization to be installed.
    """
    super(SqueezeExcite, self).__init__()
    self.interpolation_mode = interpolation_mode
    self._quantize = quantize
    self.pool = None  # prepare a placeholder which will be updated

    if activation is None:
        activation = nn.ReLU(inplace=True)

    if PYTORCH_QUANTIZATION_AVAILABLE and quantize:
        self.fc = nn.Sequential(
            quant_nn.QuantLinear(channels, channels // reduction_ratio, bias=False),
            activation,
            quant_nn.QuantLinear(channels // reduction_ratio, channels, bias=False),
        )
    elif not PYTORCH_QUANTIZATION_AVAILABLE and quantize:
        raise ImportError(
            "pytorch-quantization is not installed. Install from "
            "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization."
        )
    else:
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction_ratio, bias=False),
            activation,
            nn.Linear(channels // reduction_ratio, channels, bias=False),
        )
    self.gap = nn.AdaptiveAvgPool1d(1)

    # Set default context window
    self.change_context_window(context_window=context_window)

    # Set default max sequence length
    self.set_max_len(16)
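# Usage sketch (assumed, not from the original source): with quantize=True the
# two projection layers come out as quant_nn.QuantLinear, so the weights and
# input activations of the squeeze/excite projections are fake-quantized. The
# forward signature is not shown above, so only construction is demonstrated.
se = SqueezeExcite(channels=256, reduction_ratio=8, quantize=True)
print(type(se.fc[0]))  # quant_nn.QuantLinear when pytorch-quantization is available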
def __init__(self):
    super(Net, self).__init__()
    # Swap each float module for its counterpart in the Quantize-series API.
    # self.conv1 = torch.nn.Conv2d(1, 32, (5, 5), padding=(2, 2), bias=True)
    self.conv1 = qnn.QuantConv2d(1, 32, (5, 5), padding=(2, 2), bias=True)
    # self.conv2 = torch.nn.Conv2d(32, 64, (5, 5), padding=(2, 2), bias=True)
    self.conv2 = qnn.QuantConv2d(32, 64, (5, 5), padding=(2, 2), bias=True)
    # self.fc1 = torch.nn.Linear(64 * 7 * 7, 1024, bias=True)
    self.fc1 = qnn.QuantLinear(64 * 7 * 7, 1024, bias=True)
    # self.fc2 = torch.nn.Linear(1024, 10, bias=True)
    self.fc2 = qnn.QuantLinear(1024, 10, bias=True)
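# A plausible forward() for this LeNet-style MNIST net (a sketch; the original
# forward is not shown). The pooling and flatten sizes assume 28x28 inputs,
# which is what makes fc1's 64 * 7 * 7 input dimension work out. Assumes
# `import torch.nn.functional as F`.
def forward(self, x):
    x = F.max_pool2d(F.relu(self.conv1(x)), 2)  # 28x28 -> 14x14
    x = F.max_pool2d(F.relu(self.conv2(x)), 2)  # 14x14 -> 7x7
    x = x.view(x.size(0), -1)                   # flatten to 64 * 7 * 7
    x = F.relu(self.fc1(x))
    return self.fc2(x)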
def test_initialize_deactivate(self):
    no_replace_list = ["Linear"]
    custom_quant_modules = [(torch.nn, "Linear", quant_nn.QuantLinear)]

    quant_modules.initialize(no_replace_list, custom_quant_modules)

    assert type(quant_nn.QuantLinear(16, 256, 3)) == type(torch.nn.Linear(16, 256, 3))
    assert type(quant_nn.QuantConv2d(16, 256, 3)) == type(torch.nn.Conv2d(16, 256, 3))

    quant_modules.deactivate()
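# Typical user-facing pattern this test exercises (a sketch): initialize()
# monkey-patches the registered torch.nn classes with their quant_nn
# counterparts, so any model built afterwards picks up quantized layers with no
# code changes, and deactivate() restores the float originals.
import torch
from pytorch_quantization import quant_modules, quant_nn

quant_modules.initialize()            # replace the whole default module list
layer = torch.nn.Linear(16, 256)      # actually constructs a quant_nn.QuantLinear now
assert type(layer) is quant_nn.QuantLinear
quant_modules.deactivate()            # torch.nn.Linear is a float module again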
def test_simple_default_args(self):
    replacement_helper = QuantModuleReplacementHelper()
    replacement_helper.prepare_state()
    replacement_helper.apply_quant_modules()

    # With default arguments, every supported module, including Linear, is
    # replaced with its quantized version, so both constructors yield the
    # same (quantized) type.
    assert type(quant_nn.QuantLinear(16, 256, 3)) == type(torch.nn.Linear(16, 256, 3))
    assert type(quant_nn.QuantConv2d(16, 256, 3)) == type(torch.nn.Conv2d(16, 256, 3))

    replacement_helper.restore_float_modules()
def test_with_no_replace_list(self):
    no_replace_list = ["Linear"]
    custom_quant_modules = None

    replacement_helper = QuantModuleReplacementHelper()
    replacement_helper.prepare_state(no_replace_list, custom_quant_modules)
    replacement_helper.apply_quant_modules()

    # Linear module should not be replaced with its quantized version
    assert type(quant_nn.QuantLinear(16, 256, 3)) != type(torch.nn.Linear(16, 256, 3))
    assert type(quant_nn.QuantConv2d(16, 256, 3)) == type(torch.nn.Conv2d(16, 256, 3))

    replacement_helper.restore_float_modules()
def test_with_custom_quant_modules(self):
    no_replace_list = ["Linear"]
    custom_quant_modules = [(torch.nn, "Linear", quant_nn.QuantLinear)]

    replacement_helper = QuantModuleReplacementHelper()
    replacement_helper.prepare_state(no_replace_list, custom_quant_modules)
    replacement_helper.apply_quant_modules()

    # Although the no_replace_list indicates that the Linear module should not
    # be replaced with its quantized version, custom_quant_modules still
    # contains the Linear module's mapping, so it is replaced anyway.
    assert type(quant_nn.QuantLinear(16, 256, 3)) == type(torch.nn.Linear(16, 256, 3))
    assert type(quant_nn.QuantConv2d(16, 256, 3)) == type(torch.nn.Conv2d(16, 256, 3))

    replacement_helper.restore_float_modules()
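# The precedence this test pins down, shown with the public API (a sketch,
# mirroring the positional call used in test_initialize_deactivate above): a
# module named in both lists is still replaced, because the custom mapping in
# custom_quant_modules takes priority over the no-replace list.
no_replace_list = ["Linear"]
custom_quant_modules = [(torch.nn, "Linear", quant_nn.QuantLinear)]

quant_modules.initialize(no_replace_list, custom_quant_modules)
assert type(torch.nn.Linear(16, 256)) is quant_nn.QuantLinear  # replaced despite the exclusion
quant_modules.deactivate()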
def __init__(
    self,
    block: Type[Union[BasicBlock, Bottleneck]],
    layers: List[int],
    quantize: bool = False,
    num_classes: int = 1000,
    zero_init_residual: bool = False,
    groups: int = 1,
    width_per_group: int = 64,
    replace_stride_with_dilation: Optional[List[bool]] = None,
    norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
    super(ResNet, self).__init__()
    self._quantize = quantize
    if norm_layer is None:
        norm_layer = nn.BatchNorm2d
    self._norm_layer = norm_layer

    self.inplanes = 64
    self.dilation = 1
    if replace_stride_with_dilation is None:
        # each element in the tuple indicates if we should replace
        # the 2x2 stride with a dilated convolution instead
        replace_stride_with_dilation = [False, False, False]
    if len(replace_stride_with_dilation) != 3:
        raise ValueError("replace_stride_with_dilation should be None "
                         "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
    self.groups = groups
    self.base_width = width_per_group
    if quantize:
        self.conv1 = quant_nn.QuantConv2d(3, self.inplanes, kernel_size=7, stride=2,
                                          padding=3, bias=False)
    else:
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2,
                               padding=3, bias=False)
    self.bn1 = norm_layer(self.inplanes)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0], quantize=quantize)
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                   dilate=replace_stride_with_dilation[0], quantize=quantize)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                   dilate=replace_stride_with_dilation[1], quantize=quantize)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                   dilate=replace_stride_with_dilation[2], quantize=quantize)
    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    if quantize:
        self.fc = quant_nn.QuantLinear(512 * block.expansion, num_classes)
    else:
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

    # Zero-initialize the last BN in each residual branch,
    # so that the residual branch starts with zeros, and each residual block behaves like an identity.
    # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
    if zero_init_residual:
        for m in self.modules():
            if isinstance(m, Bottleneck):
                nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
            elif isinstance(m, BasicBlock):
                nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]
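# Example construction (a sketch; the block type and [3, 4, 6, 3] layer counts
# follow the standard torchvision ResNet-50 recipe): the same class yields a
# float or a quantized network depending on the `quantize` flag.
model_fp32 = ResNet(Bottleneck, [3, 4, 6, 3], quantize=False)
model_int8 = ResNet(Bottleneck, [3, 4, 6, 3], quantize=True)  # QuantConv2d/QuantLinear inside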