Example #1
 def __init__(self, num_classes: int = 1000) -> None:
     super(QuantizationAlexNet, self).__init__()
     self.features = nn.Sequential(
         nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         nn.Conv2d(64, 192, kernel_size=5, padding=2),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         nn.Conv2d(192, 384, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(384, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.Conv2d(256, 256, kernel_size=3, padding=1),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
     )
     self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
     self.classifier = nn.Sequential(
         nn.Dropout(),
         nn.Linear(256 * 6 * 6, 4096),
         nn.ReLU(inplace=True),
         nn.Dropout(),
         nn.Linear(4096, 4096),
         nn.ReLU(inplace=True),
         nn.Linear(4096, num_classes),
     )
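The nn.AdaptiveAvgPool2d((6, 6)) between the feature extractor and the classifier pins the spatial output to 6x6 regardless of the input resolution, so the 256 * 6 * 6 linear layer always receives the expected number of features. A minimal, self-contained sketch of that step (plain PyTorch assumed; the flatten happens in the class's forward(), which is not shown here):

import torch
import torch.nn as nn

feat = torch.randn(1, 256, 8, 8)        # e.g. the conv stack's output for a 288x288 input; 224x224 gives 6x6
pooled = nn.AdaptiveAvgPool2d((6, 6))(feat)
flat = torch.flatten(pooled, 1)         # (1, 256 * 6 * 6) = (1, 9216), ready for the classifier
print(flat.shape)                       # torch.Size([1, 9216])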
Example #2
 def __init__(self):
     super(Discriminator, self).__init__()
     self.net = nn.Sequential(
         nn.Conv2d(3, 64, kernel_size=3, padding=1),
         nn.LeakyReLU(0.2),
         nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
         nn.BatchNorm2d(64),
         nn.LeakyReLU(0.2),
         nn.Conv2d(64, 128, kernel_size=3, padding=1),
         nn.BatchNorm2d(128),
         nn.LeakyReLU(0.2),
         nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1),
         nn.BatchNorm2d(128),
         nn.LeakyReLU(0.2),
         nn.Conv2d(128, 256, kernel_size=3, padding=1),
         nn.BatchNorm2d(256),
         nn.LeakyReLU(0.2),
         nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1),
         nn.BatchNorm2d(256),
         nn.LeakyReLU(0.2),
         nn.Conv2d(256, 512, kernel_size=3, padding=1),
         nn.BatchNorm2d(512),
         nn.LeakyReLU(0.2),
         nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1),
         nn.BatchNorm2d(512),
         nn.LeakyReLU(0.2),
         nn.AdaptiveAvgPool2d(1),
         nn.Conv2d(512, 1024, kernel_size=1),
         nn.LeakyReLU(0.2),
         nn.Conv2d(1024, 1, kernel_size=1),
     )
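The head here avoids any nn.Linear: nn.AdaptiveAvgPool2d(1) collapses the spatial dimensions and the two 1x1 convolutions act as a per-image scorer, so the discriminator accepts inputs of any size. A minimal sketch of just that head (assumed wiring; the forward() and any final sigmoid are not shown above):

import torch
import torch.nn as nn

head = nn.Sequential(
    nn.AdaptiveAvgPool2d(1),              # (N, 512, H, W) -> (N, 512, 1, 1)
    nn.Conv2d(512, 1024, kernel_size=1),
    nn.LeakyReLU(0.2),
    nn.Conv2d(1024, 1, kernel_size=1),    # (N, 1, 1, 1): one realness score per image
)
print(head(torch.randn(2, 512, 6, 6)).view(2).shape)  # torch.Size([2])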
Example #3
 def __init__(self, input_channels: int, squeeze_factor: int = 4):
     super().__init__()
     squeeze_channels = _make_divisible(input_channels // squeeze_factor, 8)
     self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1)
     self.relu = nn.ReLU(inplace=True)
     self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1)
     self.adaptive_avg_pool2d = nn.AdaptiveAvgPool2d(1)
     self.hardsigmoid = nn.Hardsigmoid(inplace=True)
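Only __init__ is shown; in the usual forward() these layers implement squeeze-and-excitation gating. A self-contained sketch of that composition (assumed, with the same layer shapes as above):

import torch
import torch.nn as nn
import torch.nn.functional as F

channels, squeeze_channels = 32, 8
fc1 = nn.Conv2d(channels, squeeze_channels, 1)
fc2 = nn.Conv2d(squeeze_channels, channels, 1)

x = torch.randn(2, channels, 14, 14)
scale = F.adaptive_avg_pool2d(x, 1)               # squeeze: (2, 32, 1, 1)
scale = F.hardsigmoid(fc2(F.relu(fc1(scale))))    # excite: per-channel weights in [0, 1]
out = x * scale                                   # reweight the input channel-wise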
Example #4
    def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2):
        super(GhostNet, self).__init__()
        # setting of inverted residual blocks
        self.cfgs = cfgs
        self.dropout = dropout

        # building first layer
        output_channel = _make_divisible(16 * width, 4)
        self.conv_stem = nn.Conv2d(3, output_channel, 3, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(output_channel)
        self.act1 = nn.ReLU(inplace=True)
        input_channel = output_channel

        # building inverted residual blocks
        stages = []
        block = GhostBottleneck
        for cfg in self.cfgs:
            layers = []
            for k, exp_size, c, se_ratio, s in cfg:
                output_channel = _make_divisible(c * width, 4)
                hidden_channel = _make_divisible(exp_size * width, 4)
                layers.append(
                    block(
                        input_channel,
                        hidden_channel,
                        output_channel,
                        k,
                        s,
                        se_ratio=se_ratio,
                    ))
                input_channel = output_channel
            stages.append(nn.Sequential(*layers))

        output_channel = _make_divisible(exp_size * width, 4)
        stages.append(
            nn.Sequential(ConvBnAct(input_channel, output_channel, 1)))
        input_channel = output_channel

        self.blocks = nn.Sequential(*stages)

        # building last several layers
        output_channel = 1280
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.conv_head = nn.Conv2d(input_channel,
                                   output_channel,
                                   1,
                                   1,
                                   0,
                                   bias=True)
        self.act2 = nn.ReLU(inplace=True)
        self.classifier = nn.Linear(output_channel, num_classes)
        self.dropout = nn.Dropout(p=self.dropout)
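The snippet relies on a _make_divisible helper that is not shown. A common implementation (the variant used across the MobileNet/GhostNet family; assumed here) rounds a channel count to the nearest multiple of divisor while never dropping more than 10% of the original value:

def _make_divisible(v, divisor, min_value=None):
    # Round v to the nearest multiple of divisor, staying >= min_value and
    # within 10% of the original value.
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

print(_make_divisible(16 * 1.3, 4))  # 20: 20.8 rounded to the nearest multiple of 4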
Example #5
 def __init__(
     self,
     in_channels: int,
     num_classes: int,
     conv_block: Optional[Callable[..., nn.Module]] = None,
 ) -> None:
     super(InceptionAux, self).__init__()
     if conv_block is None:
         conv_block = BasicConv2d
     self.conv0 = conv_block(in_channels, 128, kernel_size=1)
     self.conv1 = conv_block(128, 768, kernel_size=5)
     self.conv1.stddev = 0.01  # type: ignore[assignment]
     self.fc = nn.Linear(768, num_classes)
     self.fc.stddev = 0.001  # type: ignore[assignment]
     self.avg_pool = nn.AvgPool2d(kernel_size=5, stride=3)
     self.adaptive_avp_pool = nn.AdaptiveAvgPool2d((1, 1))
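A quick shape walk-through, since the layer sizes only line up for the standard 17x17x768 auxiliary input of Inception v3. The sketch below substitutes plain nn.Conv2d for BasicConv2d (assumed to be a conv + BN + ReLU wrapper) purely to check the spatial math:

import torch
import torch.nn as nn

x = torch.randn(1, 768, 17, 17)                  # the Mixed_6e output feeding the aux head
x = nn.AvgPool2d(kernel_size=5, stride=3)(x)     # -> (1, 768, 5, 5)
x = nn.Conv2d(768, 128, kernel_size=1)(x)        # conv0 -> (1, 128, 5, 5)
x = nn.Conv2d(128, 768, kernel_size=5)(x)        # conv1 -> (1, 768, 1, 1)
x = nn.AdaptiveAvgPool2d((1, 1))(x)              # -> (1, 768, 1, 1)
print(nn.Linear(768, 1000)(torch.flatten(x, 1)).shape)  # torch.Size([1, 1000])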
Example #6
 def __init__(self,
              in_chs,
              se_ratio=0.25,
              reduced_base_chs=None,
              act_layer=nn.ReLU,
              gate_fn=hard_sigmoid,
              divisor=4,
              **_):
     super(SqueezeExcite, self).__init__()
     self.gate_fn = gate_fn
     reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio,
                                   divisor)
     self.avg_pool = nn.AdaptiveAvgPool2d(1)
     self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
     self.act1 = act_layer(inplace=True)
     self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
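gate_fn defaults to a hard_sigmoid that is not defined in the snippet. A common definition (e.g. the timm-style relu6(x + 3) / 6 approximation; assumed here) is:

import torch.nn.functional as F

def hard_sigmoid(x, inplace: bool = False):
    # Piecewise-linear sigmoid approximation: 0 below -3, 1 above +3, linear in between.
    if inplace:
        return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0)
    return F.relu6(x + 3.0) / 6.0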
Example #7
    def __init__(
        self,
        num_blocks,
        num_classes=1000,
        width_multiplier=None,
        override_groups_map=None,
        deploy=False,
        use_se=False,
    ):
        super(RepVGG, self).__init__()

        assert len(width_multiplier) == 4

        self.deploy = deploy
        self.override_groups_map = override_groups_map or dict()
        self.use_se = use_se

        assert 0 not in self.override_groups_map

        self.in_planes = min(64, int(64 * width_multiplier[0]))

        self.stage0 = RepVGGBlock(
            in_channels=3,
            out_channels=self.in_planes,
            kernel_size=3,
            stride=2,
            padding=1,
            deploy=self.deploy,
            use_se=self.use_se,
        )
        self.cur_layer_idx = 1
        self.stage1 = self._make_stage(int(64 * width_multiplier[0]),
                                       num_blocks[0],
                                       stride=2)
        self.stage2 = self._make_stage(int(128 * width_multiplier[1]),
                                       num_blocks[1],
                                       stride=2)
        self.stage3 = self._make_stage(int(256 * width_multiplier[2]),
                                       num_blocks[2],
                                       stride=2)
        self.stage4 = self._make_stage(int(512 * width_multiplier[3]),
                                       num_blocks[3],
                                       stride=2)
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)
        self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes)
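As in most of these backbones, the classifier head is just global average pooling, a flatten, and a linear layer. A minimal sketch of that wiring (assumed; forward() is not shown above, and the sizes below assume width_multiplier[3] = 1.0):

import torch
import torch.nn as nn

gap = nn.AdaptiveAvgPool2d(output_size=1)
linear = nn.Linear(512, 1000)           # int(512 * width_multiplier[3]) -> num_classes

feat = torch.randn(4, 512, 7, 7)        # output of stage4
logits = linear(gap(feat).flatten(1))   # (4, 512, 1, 1) -> (4, 512) -> (4, 1000)
print(logits.shape)                     # torch.Size([4, 1000])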
Example #8
 def __init__(self,
              features: nn.Module,
              num_classes: int = 1000,
              init_weights: bool = True) -> None:
     super(VGG, self).__init__()
     self.features = features
     self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
     self.classifier = nn.Sequential(
         nn.Linear(512 * 7 * 7, 4096),
         nn.ReLU(True),
         nn.Dropout(),
         nn.Linear(4096, 4096),
         nn.ReLU(True),
         nn.Dropout(),
         nn.Linear(4096, num_classes),
     )
     if init_weights:
         self._initialize_weights()
Example #9
 def __init__(self, input_channels: int, internal_neurons: int):
     super(SEBlock, self).__init__()
     self.down = nn.Conv2d(
         in_channels=input_channels,
         out_channels=internal_neurons,
         kernel_size=1,
         stride=1,
         bias=True,
     )
     self.up = nn.Conv2d(
         in_channels=internal_neurons,
         out_channels=input_channels,
         kernel_size=1,
         stride=1,
         bias=True,
     )
     self.relu = nn.ReLU()
     self.adaptive_avg_pool2d = nn.AdaptiveAvgPool2d(output_size=1)
Example #10
    def __init__(self, num_classes):
        super(ResReid, self).__init__()
        self.in_planes = 2048
        self.model = ResNet(block=Bottleneck,
                            layers=[3, 4, 6, 3],
                            last_stride=1)
        self.gap = nn.AdaptiveAvgPool2d(1)

        self.bnneck = nn.BatchNorm1d(self.in_planes)
        self.bnneck.bias.requires_grad_(False)  # no shift
        self.num_classes = num_classes
        nn.init.normal_(self.bnneck.weight, 1, 0.02)
        nn.init.constant_(self.bnneck.bias, 0)

        self.classifier = nn.Linear(self.in_planes,
                                    self.num_classes,
                                    bias=False)

        nn.init.normal_(self.classifier.weight, 0, 0.01)
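This is the BNNeck arrangement from the strong person re-identification baseline: the pooled feature passes through a shift-free BatchNorm1d before a bias-free classifier, and the normalized feature is the one used for retrieval. A rough sketch of how the pieces are usually composed (assumed; forward() is not shown, and 751 is just the Market-1501 identity count used as an example):

import torch
import torch.nn as nn

gap = nn.AdaptiveAvgPool2d(1)
bnneck = nn.BatchNorm1d(2048)
classifier = nn.Linear(2048, 751, bias=False)

feat_map = torch.randn(8, 2048, 16, 8)   # backbone output for a batch of 8 images
global_feat = gap(feat_map).flatten(1)   # feature typically paired with the triplet loss
bn_feat = bnneck(global_feat)            # feature typically used at inference / retrieval
logits = classifier(bn_feat)             # fed to the ID (cross-entropy) loss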
Example #11
    def __init__(
        self,
        num_classes: int = 1000,
        aux_logits: bool = True,
        transform_input: bool = False,
        inception_blocks: Optional[List[Callable[..., nn.Module]]] = None,
        init_weights: Optional[bool] = None,
    ) -> None:
        super(Inception3, self).__init__()
        if inception_blocks is None:
            inception_blocks = [
                BasicConv2d,
                InceptionA,
                InceptionB,
                InceptionC,
                InceptionD,
                InceptionE,
                InceptionAux,
            ]
        if init_weights is None:
            warnings.warn(
                "The default weight initialization of inception_v3 will be changed in future releases of "
                "torchvision. If you wish to keep the old behavior (which leads to long initialization times"
                " due to scipy/scipy#11299), please set init_weights=True.",
                FutureWarning,
            )
            init_weights = True
        assert len(inception_blocks) == 7
        conv_block = inception_blocks[0]
        inception_a = inception_blocks[1]
        inception_b = inception_blocks[2]
        inception_c = inception_blocks[3]
        inception_d = inception_blocks[4]
        inception_e = inception_blocks[5]
        inception_aux = inception_blocks[6]

        self.aux_logits = aux_logits
        self.transform_input = transform_input
        self.Conv2d_1a_3x3 = conv_block(3, 32, kernel_size=3, stride=2)
        self.Conv2d_2a_3x3 = conv_block(32, 32, kernel_size=3)
        self.Conv2d_2b_3x3 = conv_block(32, 64, kernel_size=3, padding=1)
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.Conv2d_3b_1x1 = conv_block(64, 80, kernel_size=1)
        self.Conv2d_4a_3x3 = conv_block(80, 192, kernel_size=3)
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.Mixed_5b = inception_a(192, pool_features=32)
        self.Mixed_5c = inception_a(256, pool_features=64)
        self.Mixed_5d = inception_a(288, pool_features=64)
        self.Mixed_6a = inception_b(288)
        self.Mixed_6b = inception_c(768, channels_7x7=128)
        self.Mixed_6c = inception_c(768, channels_7x7=160)
        self.Mixed_6d = inception_c(768, channels_7x7=160)
        self.Mixed_6e = inception_c(768, channels_7x7=192)
        self.AuxLogits: Optional[nn.Module] = None
        if aux_logits:
            self.AuxLogits = inception_aux(768, num_classes)
        self.Mixed_7a = inception_d(768)
        self.Mixed_7b = inception_e(1280)
        self.Mixed_7c = inception_e(2048)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(2048, num_classes)

        if init_weights:
            for m in self.modules():
                if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                    stddev = float(m.stddev) if hasattr(
                        m, "stddev") else 0.1  # type: ignore
                    flow.nn.init.trunc_normal_(m.weight,
                                               mean=0.0,
                                               std=stddev,
                                               a=-2,
                                               b=2)
                elif isinstance(m, nn.BatchNorm2d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
Example #12
    def __init__(
        self,
        num_classes: int = 1000,
        width_mult: float = 1.0,
        inverted_residual_setting: Optional[List[List[int]]] = None,
        round_nearest: int = 8,
        block: Optional[Callable[..., nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        """
        MobileNet V2 main class
        Args:
            num_classes (int): Number of classes
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
            Set to 1 to turn off rounding
            block: Module specifying inverted residual building block for mobilenet
            norm_layer: Module specifying the normalization layer to use
        """
        super(MobileNetV2, self).__init__()

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if (len(inverted_residual_setting) == 0
                or len(inverted_residual_setting[0]) != 4):
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(
                                 inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult,
                                        round_nearest)
        self.last_channel = _make_divisible(
            last_channel * max(1.0, width_mult), round_nearest)
        features: List[nn.Module] = [
            ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)
        ]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(
                    block(
                        input_channel,
                        output_channel,
                        stride,
                        expand_ratio=t,
                        norm_layer=norm_layer,
                    ))
                input_channel = output_channel
        # building last several layers
        features.append(
            ConvBNReLU(input_channel,
                       self.last_channel,
                       kernel_size=1,
                       norm_layer=norm_layer))

        # make it nn.Sequential
        self.features = nn.Sequential(*features)
        self.adaptive_avg_pool2d = nn.AdaptiveAvgPool2d((1, 1))

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)
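Each (t, c, n, s) row of the setting expands into n inverted-residual blocks: the first uses stride s, the rest stride 1, and all share expansion ratio t and the (width-scaled, divisibility-rounded) output channels c. A tiny sketch of that expansion for one row (width_mult = 1.0, so the _make_divisible rounding is a no-op and is omitted):

t, c, n, s = 6, 24, 2, 2      # second row of the default setting
input_channel = 16            # output channels of the previous row
for i in range(n):
    stride = s if i == 0 else 1
    print(f"InvertedResidual(in={input_channel}, out={c}, stride={stride}, expand_ratio={t})")
    input_channel = c
# InvertedResidual(in=16, out=24, stride=2, expand_ratio=6)
# InvertedResidual(in=24, out=24, stride=1, expand_ratio=6)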
Example #13
 def quantize(
     self,
     quantization_bit=8,
     quantization_scheme="symmetric",
     quantization_formula="google",
     per_layer_quantization=True,
 ):
     self.q_features = nn.Sequential(
         q_conv(
             self.features[0],
             qi=True,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         q_conv(
             self.features[3],
             qi=False,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
         q_conv(
             self.features[6],
             qi=False,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
         nn.ReLU(inplace=True),
         q_conv(
             self.features[8],
             qi=False,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
         nn.ReLU(inplace=True),
         q_conv(
             self.features[10],
             qi=False,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
         nn.ReLU(inplace=True),
         nn.MaxPool2d(kernel_size=3, stride=2),
     )
     self.q_avgpool = nn.AdaptiveAvgPool2d((6, 6))
     self.q_classifier = nn.Sequential(
         nn.Dropout(),
         q_linear(
             self.classifier[1],
             qi=False,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
         nn.ReLU(inplace=True),
         nn.Dropout(),
         q_linear(
             self.classifier[4],
             qi=False,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
         nn.ReLU(inplace=True),
         q_linear(
             self.classifier[6],
             qi=False,
             qo=True,
             quantization_bit=quantization_bit,
             quantization_scheme=quantization_scheme,
             quantization_formula=quantization_formula,
             per_layer_quantization=per_layer_quantization,
         ),
     )
Example #14
    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                "or a 3-element tuple, got {}".format(replace_stride_with_dilation)
            )
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(
            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False
        )
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(
            block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]
        )
        self.layer3 = self._make_layer(
            block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]
        )
        self.layer4 = self._make_layer(
            block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]
        )
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]
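The stem alone already downsamples by a factor of 4: the 7x7 stride-2 convolution and the 3x3 stride-2 max-pool take a 224x224 input to 56x56 before layer1. A self-contained shape check:

import torch
import torch.nn as nn

stem = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
    nn.BatchNorm2d(64),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
print(stem(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 64, 56, 56])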
Example #15
    def __init__(self,
                 inverted_residual_setting: List[InvertedResidualConfig],
                 last_channel: int,
                 num_classes: int = 1000,
                 block: Optional[Callable[..., nn.Module]] = None,
                 norm_layer: Optional[Callable[..., nn.Module]] = None,
                 **kwargs: Any) -> None:
        """
        MobileNet V3 main class
        Args:
            inverted_residual_setting (List[InvertedResidualConfig]): Network structure
            last_channel (int): The number of channels on the penultimate layer
            num_classes (int): Number of classes
            block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for mobilenet
            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
        """
        super().__init__()

        if not inverted_residual_setting:
            raise ValueError(
                "The inverted_residual_setting should not be empty")
        elif not (isinstance(inverted_residual_setting, Sequence) and all([
                isinstance(s, InvertedResidualConfig)
                for s in inverted_residual_setting
        ])):
            raise TypeError(
                "The inverted_residual_setting should be List[InvertedResidualConfig]"
            )

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)

        layers: List[nn.Module] = []

        # building first layer
        firstconv_output_channels = inverted_residual_setting[0].input_channels
        layers.append(
            ConvBNActivation(
                3,
                firstconv_output_channels,
                kernel_size=3,
                stride=2,
                norm_layer=norm_layer,
                activation_layer=nn.Hardswish,
            ))

        # building inverted residual blocks
        for cnf in inverted_residual_setting:
            layers.append(block(cnf, norm_layer))

        # building last several layers
        lastconv_input_channels = inverted_residual_setting[-1].out_channels
        lastconv_output_channels = 6 * lastconv_input_channels
        layers.append(
            ConvBNActivation(
                lastconv_input_channels,
                lastconv_output_channels,
                kernel_size=1,
                norm_layer=norm_layer,
                activation_layer=nn.Hardswish,
            ))

        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Linear(lastconv_output_channels, last_channel),
            nn.Hardswish(inplace=True),
            nn.Dropout(p=0.2, inplace=True),
            nn.Linear(last_channel, num_classes),
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)