Example #1
    def __init__(self, scale, num_classes=1000, dropout=0.2, bn=None):
        r"""
        Arguments:

        - scale (:obj:`float`): scale rate of channels
        - num_classes (:obj:`int`): number of classification classes
        - dropout (:obj:`float`): dropout rate
        - bn (:obj:`dict`): definition of batchnorm
        """
        super(MNASNet, self).__init__()

        global BN
        BN = get_bn(bn)

        assert scale > 0.0
        self.scale = scale
        self.num_classes = num_classes
        depths = _get_depths(scale)
        layers = [
            # First layer: regular conv.
            nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
            BN(depths[0]),
            nn.ReLU(inplace=True),
            # Depthwise separable, no skip.
            nn.Conv2d(depths[0],
                      depths[0],
                      3,
                      padding=1,
                      stride=1,
                      groups=depths[0],
                      bias=False),
            BN(depths[0]),
            nn.ReLU(inplace=True),
            nn.Conv2d(depths[0], depths[1], 1, padding=0, stride=1,
                      bias=False),
            BN(depths[1]),
            # MNASNet blocks: stacks of inverted residuals.
            _stack(depths[1], depths[2], 3, 2, 3, 3),
            _stack(depths[2], depths[3], 5, 2, 3, 3),
            _stack(depths[3], depths[4], 5, 2, 6, 3),
            _stack(depths[4], depths[5], 3, 1, 6, 2),
            _stack(depths[5], depths[6], 5, 2, 6, 4),
            _stack(depths[6], depths[7], 3, 1, 6, 1),
            # Final mapping to classifier input.
            nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False),
            BN(1280),
            nn.ReLU(inplace=True),
        ]
        self.layers = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
                                        nn.Linear(1280, num_classes))
        self._initialize_weights()
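A minimal usage sketch for the MNASNet constructor above, assuming the class (and a sensible get_bn default for bn=None) is importable from this module; the 224x224 input and the manual forward below (the real forward() lives elsewhere in the class) are assumptions.

import torch

model = MNASNet(scale=1.0, num_classes=1000, dropout=0.2, bn=None)  # bn=None -> get_bn default (assumed)
x = torch.randn(2, 3, 224, 224)                    # assumed ImageNet-style input
feat = model.layers(x)                             # conv trunk built above, ends at 1280 channels
logits = model.classifier(model.avgpool(feat).flatten(1))  # shape (2, 1000)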
Example #2
    def __init__(self,
                 cfg,
                 num_classes=1000,
                 scale=1.0,
                 bn=None):
        # Generate RegNet ws per block
        b_ws, num_s, _, _ = generate_regnet(
            cfg['WA'], cfg['W0'], cfg['WM'], cfg['DEPTH']
        )
        # Convert to per stage format
        # ws: channel list for stages, ds: number of blocks list
        ws, ds = get_stages_from_blocks(b_ws, b_ws)
        # scale-up/down channels
        ws = [int(_w * scale) for _w in ws]
        # Generate group widths and bottleneck multipliers
        gws = [cfg['GROUP_W'] for _ in range(num_s)]
        bms = [1 for _ in range(num_s)]
        # Adjust the compatibility of ws and gws
        ws, gws = adjust_ws_gs_comp(ws, bms, gws)
        # Use the same stride for each stage, stride set to 2
        ss = [2 for _ in range(num_s)]
        # Use SE for RegNetY
        se_r = 0.25 if cfg['SE_ON'] else None
        # Construct the model
        STEM_W = int(32 * scale)

        global BN

        BN = get_bn(bn)

        kwargs = {
            "stem_w": STEM_W,
            "ss": ss,
            "ds": ds,
            "ws": ws,
            "bms": bms,
            "gws": gws,
            "se_r": se_r,
            "nc": num_classes,
        }
        super(RegNet, self).__init__(**kwargs)
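The constructor above only reads six keys from cfg. A hedged sketch of such a dict follows; the numbers are illustrative (roughly a RegNetX-200MF-sized setting), not values stated in the source.

# Illustrative cfg; the key names are the ones read above, the values are assumptions.
regnet_cfg = {
    'WA': 36.44,     # slope of the per-block width ramp
    'W0': 24,        # initial width
    'WM': 2.49,      # width multiplier between stages
    'DEPTH': 13,     # total number of blocks
    'GROUP_W': 8,    # group width shared across stages
    'SE_ON': False,  # True enables Squeeze-and-Excitation (RegNetY variants)
}
model = RegNet(regnet_cfg, num_classes=1000, scale=1.0, bn=None)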
Example #3
    def __init__(self,
                 block,
                 layers,
                 num_classes=1000,
                 ibn_ratio=0.5,
                 bn=None):
        scale = 64
        self.inplanes = scale
        self.ibn_ratio = ibn_ratio
        super(ResNetIBN, self).__init__()

        global BN
        BN = get_bn(bn)

        self.conv1 = nn.Conv2d(3,
                               scale,
                               kernel_size=7,
                               stride=2,
                               padding=3,
                               bias=False)
        self.bn1 = BN(scale)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, scale, layers[0])
        self.layer2 = self._make_layer(block, scale * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(block, scale * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(block, scale * 8, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(scale * 8 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, (nn.BatchNorm2d, SyncBatchNorm2d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.InstanceNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
Example #4
    def __init__(self,
                 blocks_args=None,
                 global_params=None,
                 use_fc_bn=False,
                 fc_bn_init_scale=1.0,
                 bn=None):
        super(EfficientNet, self).__init__()

        global BN

        BN = get_bn(bn)

        if not isinstance(blocks_args, list):
            raise ValueError('blocks_args should be a list.')

        self.logger = get_logger(__name__)

        self._global_params = global_params
        self._blocks_args = blocks_args
        self.use_fc_bn = use_fc_bn
        self.fc_bn_init_scale = fc_bn_init_scale

        self._build()
Example #5
    def __init__(self,
                 block,
                 layers,
                 num_classes=1000,
                 deep_stem=False,
                 avg_down=False,
                 bypass_last_bn=False,
                 bn=None):
        r"""
        Arguments:

        - layers (:obj:`list` of 4 ints): how many layers in each stage
        - num_classes (:obj:`int`): number of classification classes
        - deep_stem (:obj:`bool`): whether to use deep_stem as the first conv
        - avg_down (:obj:`bool`): whether to use avg_down when spatial downsample
        - bypass_last_bn (:obj:`bool`): whether to zero-init the weight of the last BN in each residual block
        - bn (:obj:`dict`): definition of batchnorm
        """

        super(PreactResNet, self).__init__()

        logger = get_logger(__name__)

        global BN, bypass_bn_weight_list

        BN = get_bn(bn)
        bypass_bn_weight_list = []

        self.inplanes = 64
        self.deep_stem = deep_stem
        self.avg_down = avg_down
        self.logger = get_logger(__name__)

        if self.deep_stem:
            self.conv1 = nn.Sequential(
                nn.Conv2d(3,
                          32,
                          kernel_size=3,
                          stride=2,
                          padding=1,
                          bias=False),
                BN(32),
                nn.ReLU(inplace=True),
                nn.Conv2d(32,
                          32,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False),
                BN(32),
                nn.ReLU(inplace=True),
                nn.Conv2d(32,
                          64,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False),
            )
        else:
            self.conv1 = nn.Conv2d(3,
                                   64,
                                   kernel_size=7,
                                   stride=2,
                                   padding=3,
                                   bias=False)
        self.bn1 = BN(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.final_bn = BN(512 * block.expansion)
        self.final_relu = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif (isinstance(m, SyncBatchNorm2d)
                  or isinstance(m, nn.BatchNorm2d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 1.0 / float(n))
                m.bias.data.zero_()

        if bypass_last_bn:
            for param in bypass_bn_weight_list:
                param.data.zero_()
            logger.info('bypass {} bn.weight in BottleneckBlocks'.format(
                len(bypass_bn_weight_list)))
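A hedged instantiation sketch for PreactResNet; the Bottleneck class name and the ResNet-50-style layer counts are assumptions, while the keyword arguments follow the docstring above.

# Hypothetical usage; `Bottleneck` stands for the block class defined alongside this model.
model = PreactResNet(Bottleneck, [3, 4, 6, 3],
                     num_classes=1000,
                     deep_stem=True,        # three 3x3 convs instead of a single 7x7 stem
                     avg_down=False,
                     bypass_last_bn=True,   # zero the last BN weight in each block, as done above
                     bn=None)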
Example #6
    def __init__(self,
                 block,
                 layers,
                 groups,
                 reduction,
                 dropout_p=0.2,
                 inplanes=128,
                 input_3x3=True,
                 downsample_kernel_size=3,
                 downsample_padding=1,
                 num_classes=1000,
                 bn=None):
        """
        Arguments:

        block (:obj:`nn.Module`): Bottleneck class.
            - For SENet154: SEBottleneck
            - For SE-ResNet models: SEResNetBottleneck
            - For SE-ResNeXt models:  SEResNeXtBottleneck

        layers (:obj:`list` of :obj:`ints`): Number of residual blocks for the 4 layers of the
            network (layer1...layer4).

        groups (:obj:`int`): Number of groups for the 3x3 convolution in each
            bottleneck block.
            - For SENet154: 64
            - For SE-ResNet models: 1
            - For SE-ResNeXt models:  32

        reduction (:obj:`int`): Reduction ratio for Squeeze-and-Excitation modules.
            - For all models: 16

        dropout_p (:obj:`float` or :obj:`None`): Drop probability for the Dropout layer.
            If `None` the Dropout layer is not used.
            - For SENet154: 0.2
            - For SE-ResNet models: None
            - For SE-ResNeXt models: None

        inplanes (:obj:`int`):  Number of input channels for layer1.
            - For SENet154: 128
            - For SE-ResNet models: 64
            - For SE-ResNeXt models: 64

        input_3x3 (:obj:`bool`): If `True`, use three 3x3 convolutions instead of
            a single 7x7 convolution in layer0.
            - For SENet154: True
            - For SE-ResNet models: False
            - For SE-ResNeXt models: False

        downsample_kernel_size (:obj:`int`): Kernel size for downsampling convolutions
            in layer2, layer3 and layer4.
            - For SENet154: 3
            - For SE-ResNet models: 1
            - For SE-ResNeXt models: 1

        downsample_padding (:obj:`int`): Padding for downsampling convolutions in
            layer2, layer3 and layer4.
            - For SENet154: 1
            - For SE-ResNet models: 0
            - For SE-ResNeXt models: 0

        num_classes (:obj:`int`): Number of outputs in `last_linear` layer.
            - For all models: 1000
        """
        super(SENet, self).__init__()
        self.inplanes = inplanes

        global BN

        BN = get_bn(bn)

        if input_3x3:
            layer0_modules = [
                ('conv1', nn.Conv2d(3, 64, 3, stride=2, padding=1,
                                    bias=False)),
                ('bn1', BN(64)),
                ('relu1', nn.ReLU(inplace=True)),
                ('conv2', nn.Conv2d(64, 64, 3, stride=1, padding=1,
                                    bias=False)),
                ('bn2', BN(64)),
                ('relu2', nn.ReLU(inplace=True)),
                ('conv3',
                 nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)),
                ('bn3', BN(inplanes)),
                ('relu3', nn.ReLU(inplace=True)),
            ]
        else:
            layer0_modules = [
                ('conv1',
                 nn.Conv2d(3,
                           inplanes,
                           kernel_size=7,
                           stride=2,
                           padding=3,
                           bias=False)),
                ('bn1', BN(inplanes)),
                ('relu1', nn.ReLU(inplace=True)),
            ]
        # To preserve compatibility with Caffe weights `ceil_mode=True`
        # is used instead of `padding=1`.
        layer0_modules.append(('pool', nn.MaxPool2d(3,
                                                    stride=2,
                                                    ceil_mode=True)))
        self.layer0 = nn.Sequential(OrderedDict(layer0_modules))
        self.layer1 = self._make_layer(block,
                                       planes=64,
                                       blocks=layers[0],
                                       groups=groups,
                                       reduction=reduction,
                                       downsample_kernel_size=1,
                                       downsample_padding=0)
        self.layer2 = self._make_layer(
            block,
            planes=128,
            blocks=layers[1],
            stride=2,
            groups=groups,
            reduction=reduction,
            downsample_kernel_size=downsample_kernel_size,
            downsample_padding=downsample_padding)
        self.layer3 = self._make_layer(
            block,
            planes=256,
            blocks=layers[2],
            stride=2,
            groups=groups,
            reduction=reduction,
            downsample_kernel_size=downsample_kernel_size,
            downsample_padding=downsample_padding)
        self.layer4 = self._make_layer(
            block,
            planes=512,
            blocks=layers[3],
            stride=2,
            groups=groups,
            reduction=reduction,
            downsample_kernel_size=downsample_kernel_size,
            downsample_padding=downsample_padding)
        self.avg_pool = nn.AvgPool2d(7, stride=1)
        self.dropout = nn.Dropout(dropout_p) if dropout_p is not None else None
        self.last_linear = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, SyncBatchNorm2d) or isinstance(
                    m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
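The docstring above spells out the per-model settings, so a hedged SE-ResNeXt-50-style call can be sketched; the only assumption is that SEResNeXtBottleneck is importable from this module.

# Hypothetical SE-ResNeXt-50 (32x4d)-style setup; argument values follow the docstring above.
model = SENet(SEResNeXtBottleneck, layers=[3, 4, 6, 3],
              groups=32, reduction=16,
              dropout_p=None, inplanes=64, input_3x3=False,
              downsample_kernel_size=1, downsample_padding=0,
              num_classes=1000, bn=None)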
Example #7
    def __init__(self, num_classes=1000, scale=1., bn=None):
        super(GhostNet, self).__init__()

        global BN
        BN = get_bn(bn)

        # setting of inverted residual blocks
        self.cfgs = [
            # k, t, c, SE, s
            [3, 16, 16, 0, 1],
            [3, 48, 24, 0, 2],
            [3, 72, 24, 0, 1],
            [5, 72, 40, 1, 2],
            [5, 120, 40, 1, 1],
            [3, 240, 80, 0, 2],
            [3, 200, 80, 0, 1],
            [3, 184, 80, 0, 1],
            [3, 184, 80, 0, 1],
            [3, 480, 112, 1, 1],
            [3, 672, 112, 1, 1],
            [5, 672, 160, 1, 2],
            [5, 960, 160, 0, 1],
            [5, 960, 160, 1, 1],
            [5, 960, 160, 0, 1],
            [5, 960, 160, 1, 1]
        ]

        # building first layer
        output_channel = _make_divisible(16 * scale, 4)
        layers = [
            nn.Sequential(nn.Conv2d(3, output_channel, 3, 2, 1, bias=False),
                          BN(output_channel), nn.ReLU(inplace=True))
        ]
        input_channel = output_channel

        # building inverted residual blocks
        block = GhostBottleneck
        for k, exp_size, c, use_se, s in self.cfgs:
            output_channel = _make_divisible(c * scale, 4)
            hidden_channel = _make_divisible(exp_size * scale, 4)
            layers.append(
                block(input_channel, hidden_channel, output_channel, k, s,
                      use_se))
            input_channel = output_channel
        self.features = nn.Sequential(*layers)

        # building last several layers
        output_channel = _make_divisible(exp_size * scale, 4)
        self.squeeze = nn.Sequential(
            nn.Conv2d(input_channel, output_channel, 1, 1, 0, bias=False),
            BN(output_channel),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        input_channel = output_channel

        output_channel = 1280
        self.classifier = nn.Sequential(
            nn.Linear(input_channel, output_channel, bias=False),
            nn.BatchNorm1d(output_channel),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(output_channel, num_classes),
        )

        self._initialize_weights()
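A short hedged usage sketch; scale simply rescales every channel count in self.cfgs through _make_divisible, as the loop above shows.

model = GhostNet(num_classes=1000, scale=1.0, bn=None)
# A narrower variant: hidden/output channels are multiplied by 0.5 and re-rounded to multiples of 4.
small = GhostNet(num_classes=1000, scale=0.5, bn=None)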
Example #8
    def __init__(self,
                 block,
                 layers,
                 num_classes=1000,
                 bn=None,
                 channel_config=None,
                 nnie_type=True):
        r"""
        Arguments:

        - block (:obj:`nn.Module`): block type
        - layers (:obj:`list` of 4 ints): how many layers in each stage
        - num_classes (:obj:`int`): number of classification classes
        - bn (:obj:`dict`): definition of batchnorm
        - channel_config (:obj:`dict`): configurations of the pruned channels
        - nnie_type (:obj:`bool`): if ``True``, the first maxpool is set with ceil_mode=True
        """

        super(Adaptive_ResNet, self).__init__()

        global BN

        BN = get_bn(bn)
        self.inplanes = 64
        conv1_out_ch = channel_config['conv1']
        self.conv1 = nn.Conv2d(3,
                               conv1_out_ch,
                               kernel_size=7,
                               stride=2,
                               padding=3,
                               bias=False)
        self.bn1 = BN(conv1_out_ch)
        self.relu = nn.ReLU(inplace=True)
        if nnie_type:
            self.maxpool = nn.MaxPool2d(kernel_size=2,
                                        stride=2,
                                        padding=0,
                                        ceil_mode=True)
        else:
            self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(
            block, 64, layers[0], bottleneck_settings=channel_config['layer1'])
        self.layer2 = self._make_layer(
            block,
            128,
            layers[1],
            stride=2,
            bottleneck_settings=channel_config['layer2'])
        self.layer3 = self._make_layer(
            block,
            256,
            layers[2],
            stride=2,
            bottleneck_settings=channel_config['layer3'])
        self.layer4 = self._make_layer(
            block,
            512,
            layers[3],
            stride=2,
            bottleneck_settings=channel_config['layer4'])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(channel_config['fc'], num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif (isinstance(m, SyncBatchNorm2d)
                  or isinstance(m, nn.BatchNorm2d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
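The constructor above indexes channel_config by 'conv1', 'layer1'..'layer4' and 'fc'. A hedged sketch of that dict's shape follows; the concrete numbers and the inner format of the per-layer bottleneck_settings are assumptions, since only the key names appear in the source.

# Illustrative channel_config; only the top-level keys are taken from the code above.
channel_config = {
    'conv1': 64,      # output channels of the stem conv
    'layer1': [...],  # per-block pruned-channel settings, format defined by _make_layer
    'layer2': [...],
    'layer3': [...],
    'layer4': [...],
    'fc': 2048,       # input features of the final nn.Linear
}
# model = Adaptive_ResNet(Bottleneck, [3, 4, 6, 3], channel_config=channel_config)  # block name assumed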
Example #9
    def __init__(self,
                 num_classes=1000,
                 scale=1.0,
                 inverted_residual_setting=None,
                 round_nearest=8,
                 block=InvertedResidual,
                 dropout=0.2,
                 bn=None):
        r"""
        Arguments:
            - num_classes (:obj:`int`): Number of classes
            - scale (:obj:`float`): Width multiplier, adjusts number of channels in each layer by this amount
            - inverted_residual_setting: Network structure
            - round_nearest (:obj:`int`): Round the number of channels in each layer to be a multiple of this number
              Set to 1 to turn off rounding
            - block: Module specifying inverted residual building block for mobilenet
            - bn (:obj:`dict`): definition of batchnorm
        """
        super(MobileNetV2, self).__init__()

        global BN
        BN = get_bn(bn)

        if block is None:
            block = InvertedResidual
        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(
                inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(
                                 inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * scale, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, scale),
                                            round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * scale, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(
                    block(input_channel,
                          output_channel,
                          stride,
                          expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        features.append(
            ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.last_channel, num_classes),
        )

        self.init_params()
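A hedged usage sketch for MobileNetV2; the default call reproduces the t/c/n/s table above, and scale shrinks every block's channel count (re-rounded to multiples of round_nearest).

model = MobileNetV2(num_classes=1000, scale=1.0, dropout=0.2, bn=None)
# Width-multiplier variant; last_channel stays at 1280 because scale <= 1.0.
narrow = MobileNetV2(num_classes=1000, scale=0.75, bn=None)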
Example #10
    def __init__(self,
                 num_classes=1000,
                 width=[8, 8, 16, 48, 224],
                 depth=[1, 2, 2, 1, 2],
                 stride_stages=[2, 2, 2, 2, 2],
                 kernel_size=[7, 3, 3, 3, 3, 3, 3, 3],
                 expand_ratio=[0, 1, 1, 1, 1, 0.5, 0.5, 0.5],
                 act_stages=['relu', 'relu', 'relu', 'relu', 'relu'],
                 dropout_rate=0.,
                 bn=None):
        r"""
        Arguments:

        - num_classes (:obj:`int`): number of classification classes
        - width (:obj:`list` of 5 (stages+1) ints): channel list
        - depth (:obj:`list` of 5 (stages+1) ints): depth list for stages
        - stride_stages (:obj:`list` of 5 (stages+1) ints): stride list for stages
        - kernel_size (:obj:`list` of 8 (blocks+1) ints): kernel size list for blocks
        - expand_ratio (:obj:`list` of 8 (blocks+1) floats): expand ratio list for blocks
        - act_stages (:obj:`list` of 5 (stages+1) strs): activation list for stages
        - dropout_rate (:obj:`float`): dropout rate
        - bn (:obj:`dict`): definition of batchnorm
        """

        super(BigNAS_ResNet_Basic, self).__init__()

        global BN

        BN = get_bn(bn)

        self.depth = depth
        self.width = width
        self.kernel_size = get_same_length(kernel_size, self.depth)
        self.expand_ratio = get_same_length(expand_ratio, self.depth)

        self.dropout_rate = dropout_rate

        # first conv layer
        self.first_conv = ConvBlock(
            in_channel=3, out_channel=self.width[0], kernel_size=self.kernel_size[0],
            stride=stride_stages[0], act_func=act_stages[0])

        blocks = []
        _block_index = 0
        input_channel = self.width[0]

        stage_num = 1
        for s, act_func, n_block, output_channel in zip(stride_stages[1:], act_stages[1:], self.depth[1:],
                                                        self.width[1:]):
            _block_index += n_block
            kernel_size = self.kernel_size[_block_index]
            expand_ratio = self.expand_ratio[_block_index]
            stage_num += 1
            for i in range(n_block):
                if i == 0:
                    stride = s
                else:
                    stride = 1
                basic_block = BasicBlock(
                        in_channel=input_channel, out_channel=output_channel, kernel_size=kernel_size,
                        expand_ratio=expand_ratio, stride=stride, act_func=act_func)
                blocks.append(basic_block)
                input_channel = output_channel

        self.blocks = nn.ModuleList(blocks)
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=1)

        self.classifier = LinearBlock(
            in_features=self.width[-1], out_features=num_classes, bias=True, dropout_rate=dropout_rate)

        self.init_model()
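A short hedged sketch using the documented defaults of BigNAS_ResNet_Basic; only dropout_rate and bn are overridden here, the per-stage width/depth/stride/kernel lists fall back to the defaults in the signature above.

model = BigNAS_ResNet_Basic(num_classes=1000, dropout_rate=0.1, bn=None)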
Example #11
    def __init__(self,
                 block,
                 layers,
                 inplanes=64,
                 num_classes=1000,
                 zero_init_residual=False,
                 groups=1,
                 width_per_group=64,
                 replace_stride_with_dilation=None,
                 norm_layer=None,
                 deep_stem=False,
                 avg_down=False,
                 freeze_layer=False,
                 bn=None):

        super(ResNet, self).__init__()

        global BN
        self.logger = get_logger(__name__)

        if norm_layer is None:
            BN = get_bn(bn)
            norm_layer = BN
        else:
            norm_layer = get_norm_layer(norm_layer)

        self._norm_layer = norm_layer

        self.inplanes = inplanes
        self.dilation = 1
        self.deep_stem = deep_stem
        self.avg_down = avg_down
        self.num_classes = num_classes
        self.freeze_layer = freeze_layer

        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(
                                 replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        if self.deep_stem:
            self.conv1 = nn.Sequential(
                nn.Conv2d(3,
                          inplanes // 2,
                          kernel_size=3,
                          stride=2,
                          padding=1,
                          bias=False),
                norm_layer(inplanes // 2),
                nn.ReLU(inplace=True),
                nn.Conv2d(inplanes // 2,
                          inplanes // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False),
                norm_layer(inplanes // 2),
                nn.ReLU(inplace=True),
                nn.Conv2d(inplanes // 2,
                          inplanes,
                          kernel_size=3,
                          stride=1,
                          padding=1,
                          bias=False),
            )
        else:
            self.conv1 = nn.Conv2d(3,
                                   inplanes,
                                   kernel_size=7,
                                   stride=2,
                                   padding=3,
                                   bias=False)

        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block,
                                       128,
                                       layers[1],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block,
                                       256,
                                       layers[2],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block,
                                       512,
                                       layers[3],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
            elif isinstance(m,
                            (nn.BatchNorm2d, nn.GroupNorm, SyncBatchNorm2d)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
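A hedged ResNet-50-style instantiation; Bottleneck is the block class referenced by the zero_init_residual branch above, and the [3, 4, 6, 3] layer counts are the usual choice, not values stated in the source.

model = ResNet(Bottleneck, [3, 4, 6, 3],
               num_classes=1000,
               zero_init_residual=True,   # zero the last BN gamma in every residual branch
               deep_stem=False,
               avg_down=False,
               bn=None)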
Example #12
    def __init__(self, block, layers, radix=1, groups=1, bottleneck_width=64,
                 num_classes=1000, dilated=False, dilation=1,
                 deep_stem=False, stem_width=64, avg_down=False,
                 rectified_conv=False, rectify_avg=False,
                 avd=False, avd_first=False,
                 final_drop=0.0, dropblock_prob=0,
                 last_gamma=False, norm_layer=None, bn=None):
        """
        Arguments:

        - block (:obj:`Block`): Class for the residual block. Options are BasicBlockV1, BottleneckV1
        - layers (:obj:`list` of :obj:`int`): Numbers of layers in each block
        - num_classes (:obj:`int`, default 1000): Number of classification classes
        - dilated (:obj:`bool`, default False): Applying dilation strategy to \
            pretrained ResNet yielding a stride-8 model. \
            typically used in Semantic Segmentation
        - norm_layer (:obj:`object`): Normalization layer used in the backbone network \
            (default: the batchnorm resolved by ``get_bn(bn)``; \
            pass a synchronized BatchNorm class for cross-GPU BatchNormalization)
        """
        self.cardinality = groups
        self.bottleneck_width = bottleneck_width
        # ResNet-D params
        self.inplanes = stem_width*2 if deep_stem else 64
        self.avg_down = avg_down
        self.last_gamma = last_gamma
        # ResNeSt params
        self.radix = radix
        self.avd = avd
        self.avd_first = avd_first

        super(ResNeSt, self).__init__()
        self.rectified_conv = rectified_conv
        self.rectify_avg = rectify_avg

        global BN
        if norm_layer is None:
            BN = get_bn(bn)
            norm_layer = BN

        if rectified_conv:
            from rfconv import RFConv2d
            conv_layer = RFConv2d
        else:
            conv_layer = nn.Conv2d
        conv_kwargs = {'average_mode': rectify_avg} if rectified_conv else {}
        if deep_stem:
            self.conv1 = nn.Sequential(
                conv_layer(3, stem_width, kernel_size=3, stride=2,
                           padding=1, bias=False, **conv_kwargs),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(stem_width, stem_width, kernel_size=3,
                           stride=1, padding=1, bias=False, **conv_kwargs),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(stem_width, stem_width*2, kernel_size=3,
                           stride=1, padding=1, bias=False, **conv_kwargs),
            )
        else:
            self.conv1 = conv_layer(3, 64, kernel_size=7, stride=2, padding=3,
                                    bias=False, **conv_kwargs)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(
            block, 64, layers[0], norm_layer=norm_layer, is_first=False)
        self.layer2 = self._make_layer(
            block, 128, layers[1], stride=2, norm_layer=norm_layer)
        if dilated or dilation == 4:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
                                           dilation=2, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
                                           dilation=4, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        elif dilation == 2:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                           dilation=1, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
                                           dilation=2, norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        else:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                           norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
            self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                           norm_layer=norm_layer,
                                           dropblock_prob=dropblock_prob)
        self.avgpool = GlobalAvgPool2d()
        self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, SyncBatchNorm2d) or isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
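A hedged ResNeSt-50-style instantiation; the block class name and the keyword values (the commonly used radix-2, deep-stem, avg-down, avd configuration) are assumptions rather than values stated in the source.

model = ResNeSt(Bottleneck, [3, 4, 6, 3],
                radix=2, groups=1, bottleneck_width=64,
                deep_stem=True, stem_width=32, avg_down=True,
                avd=True, avd_first=False,
                num_classes=1000, bn=None)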
Example #13
    def __init__(self,
                 input_size=224,
                 num_classes=1000,
                 model_size="1.5x",
                 bn=None):
        super(ShuffleNetV2, self).__init__()

        self.stage_repeats = [4, 8, 4]
        self.model_size = model_size
        r"""The number of channels are slightly reduced to
            make WeightNet's FLOPs comparable to shufflenet baselines.
        """
        if model_size == "0.5x":
            self.stage_out_channels = [-1, 24, 48, 96, 192, 1024]
        elif model_size == "1.0x":
            self.stage_out_channels = [-1, 24, 112, 224, 448, 1024]
        elif model_size == "1.5x":
            self.stage_out_channels = [-1, 24, 176, 352, 704, 1024]
        elif model_size == "2.0x":
            self.stage_out_channels = [-1, 24, 248, 496, 992, 1024]
        else:
            raise NotImplementedError

        global BN

        BN = get_bn(bn)

        # building first layer
        input_channel = self.stage_out_channels[1]
        self.first_conv = nn.Sequential(
            nn.Conv2d(3, input_channel, 3, 2, 1, bias=True),
            BN(input_channel),
            nn.ReLU(),
        )

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.features = []
        for idxstage in range(len(self.stage_repeats)):
            numrepeat = self.stage_repeats[idxstage]
            output_channel = self.stage_out_channels[idxstage + 2]

            for i in range(numrepeat):
                if i == 0:
                    self.features.append(
                        ShuffleV2Block(
                            input_channel,
                            output_channel,
                            mid_channels=output_channel // 2,
                            ksize=3,
                            stride=2,
                        ))
                else:
                    self.features.append(
                        ShuffleV2Block(
                            input_channel // 2,
                            output_channel,
                            mid_channels=output_channel // 2,
                            ksize=3,
                            stride=1,
                        ))

                input_channel = output_channel

        self.features = nn.Sequential(*self.features)

        self.conv_last = nn.Sequential(
            nn.Conv2d(input_channel,
                      self.stage_out_channels[-1],
                      1,
                      1,
                      0,
                      bias=True),
            BN(self.stage_out_channels[-1]),
            nn.ReLU(),
        )
        self.globalpool = nn.AvgPool2d(7)
        if self.model_size == "2.0x":
            self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Sequential(
            nn.Linear(self.stage_out_channels[-1], num_classes, bias=True))
        self._initialize_weights()
Example #14
    def __init__(self,
                 num_classes=1000,
                 scale=1.0,
                 identity_tensor_multiplier=1.0,
                 sand_glass_setting=None,
                 round_nearest=8,
                 block=None,
                 dropout=0.0,
                 bn=None):
        """
        MobileNeXt main class
        Args:
            num_classes (int): Number of classes
            scale (float): Width multiplier - adjusts number of channels in each layer by this amount
            identity_tensor_multiplier (float): Identity tensor multiplier - reduces the
                number of element-wise additions in each block
            sand_glass_setting: Network structure
            round_nearest (int): Round the number of channels in each layer to be a multiple
                of this number. Set to 1 to turn off rounding
            block: Module specifying the sand glass building block to use (defaults to SandGlass)
            bn: Module specifying the normalization layer to use
        """
        super(MobileNeXt, self).__init__()

        global BN
        BN = get_bn(bn)

        if block is None:
            block = SandGlass
        input_channel = 32
        last_channel = 1280

        # building first layer
        input_channel = _make_divisible(input_channel * scale, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, scale),
                                            round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2)]

        if sand_glass_setting is None:
            sand_glass_setting = [
                # t, c,  b, s
                [2, 96, 1, 2],
                [6, 144, 1, 1],
                [6, 192, 3, 2],
                [6, 288, 3, 2],
                [6, 384, 4, 1],
                [6, 576, 4, 2],
                [6, 960, 2, 1],
                [6, self.last_channel / scale, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(sand_glass_setting) == 0 or len(sand_glass_setting[0]) != 4:
            raise ValueError(
                "sand_glass_setting should be non-empty "
                "or a 4-element list, got {}".format(sand_glass_setting))

        # building sand glass blocks
        for t, c, b, s in sand_glass_setting:
            output_channel = _make_divisible(c * scale, round_nearest)
            for i in range(b):
                stride = s if i == 0 else 1
                features.append(
                    block(
                        input_channel,
                        output_channel,
                        stride,
                        expand_ratio=t,
                        identity_tensor_multiplier=identity_tensor_multiplier))
                input_channel = output_channel

        # building last several layers
        # features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.last_channel, num_classes),
        )

        self.init_params()
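A short hedged usage sketch of the MobileNeXt constructor above; the identity_tensor_multiplier value is illustrative.

model = MobileNeXt(num_classes=1000, scale=1.0,
                   identity_tensor_multiplier=0.5,  # keep the residual addition on only part of the identity tensor
                   dropout=0.0, bn=None)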
Example #15
    def __init__(self,
                 num_classes=1000,
                 scale=1.0,
                 dropout=0.8,
                 round_nearest=8,
                 mode='small',
                 bn=None):
        r"""
        Arguments:
            - num_classes (:obj:`int`): Number of classes
            - scale (:obj:`float`): Width multiplier, adjusts number of channels in each layer by this amount
            - dropout (:obj:`float`): Dropout rate
            - round_nearest (:obj:`int`): Round the number of channels in each layer to be a multiple of this number
              Set to 1 to turn off rounding
            - mode (:obj:`string`): model type, 'small' or 'large'
            - bn (:obj:`dict`): definition of batchnorm
        """
        super(MobileNetV3, self).__init__()

        global BN
        BN = get_bn(bn)

        input_channel = 16
        last_channel = 1280
        if mode == 'large':
            mobile_setting = [
                [3, 16, 16, False, 'RE', 1],
                [3, 64, 24, False, 'RE', 2],
                [3, 72, 24, False, 'RE', 1],
                [5, 72, 40, True, 'RE', 2],
                [5, 120, 40, True, 'RE', 1],
                [5, 120, 40, True, 'RE', 1],
                [3, 240, 80, False, 'HS', 2],
                [3, 200, 80, False, 'HS', 1],
                [3, 184, 80, False, 'HS', 1],
                [3, 184, 80, False, 'HS', 1],
                [3, 480, 112, True, 'HS', 1],
                [3, 672, 112, True, 'HS', 1],
                [5, 672, 160, True, 'HS', 2],
                [5, 960, 160, True, 'HS', 1],
                [5, 960, 160, True, 'HS', 1],
            ]
        elif mode == 'small':
            mobile_setting = [
                [3, 16, 16, True, 'RE', 2],
                [3, 72, 24, False, 'RE', 2],
                [3, 88, 24, False, 'RE', 1],
                [5, 96, 40, True, 'HS', 2],
                [5, 240, 40, True, 'HS', 1],
                [5, 240, 40, True, 'HS', 1],
                [5, 120, 48, True, 'HS', 1],
                [5, 144, 48, True, 'HS', 1],
                [5, 288, 96, True, 'HS', 2],
                [5, 576, 96, True, 'HS', 1],
                [5, 576, 96, True, 'HS', 1],
            ]
        else:
            raise NotImplementedError

        # building first layer
        last_channel = _make_divisible(
            last_channel *
            scale, round_nearest) if scale > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, 2, activation=Hswish)]
        self.classifier = []

        # building mobile blocks
        for k, exp, c, se, nl, s in mobile_setting:
            output_channel = _make_divisible(c * scale, round_nearest)
            exp_channel = _make_divisible(exp * scale, round_nearest)
            self.features.append(
                InvertedResidual(input_channel, output_channel, k, s,
                                 exp_channel, se, nl))
            input_channel = output_channel

        # building last several layers
        if mode == 'large':
            last_conv = _make_divisible(960 * scale, round_nearest)
            self.features.append(
                conv_1x1_bn(input_channel, last_conv, activation=Hswish))
            self.features.append(nn.AdaptiveAvgPool2d(1))
            self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0))
            self.features.append(Hswish(inplace=True))
        elif mode == 'small':
            last_conv = _make_divisible(576 * scale, round_nearest)
            self.features.append(
                conv_1x1_bn(input_channel, last_conv, activation=Hswish))
            self.features.append(nn.AdaptiveAvgPool2d(1))
            self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0))
            self.features.append(Hswish(inplace=True))
        else:
            raise NotImplementedError

        self.features = nn.Sequential(*self.features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(last_channel, num_classes),
        )

        self.init_params()
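A hedged usage sketch; mode selects between the two mobile_setting tables above.

large = MobileNetV3(num_classes=1000, scale=1.0, dropout=0.8, mode='large', bn=None)
small = MobileNetV3(num_classes=1000, scale=1.0, dropout=0.8, mode='small', bn=None)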
Example #16
    def __init__(self,
                 stages_repeats,
                 stages_out_channels,
                 num_classes=1000,
                 bn=None):
        r"""
        - stages_repeats (:obj:`list` of 3 ints): how many layers in each stage
        - stages_out_channels (:obj:`list` of 5 ints): output channels
        - num_classes (:obj:`int`): number of classification classes
        - bn (:obj:`dict`): definition of batchnorm
        """
        super(ShuffleNetV2, self).__init__()

        if len(stages_repeats) != 3:
            raise ValueError(
                'expected stages_repeats as list of 3 positive ints')
        if len(stages_out_channels) != 5:
            raise ValueError(
                'expected stages_out_channels as list of 5 positive ints')
        self._stage_out_channels = stages_out_channels

        global BN

        BN = get_bn(bn)

        input_channels = 3
        output_channels = self._stage_out_channels[0]
        self.conv1 = nn.Sequential(
            nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False),
            BN(output_channels),
            nn.ReLU(inplace=True),
        )
        input_channels = output_channels

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        stage_names = ['stage{}'.format(i) for i in [2, 3, 4]]
        for name, repeats, output_channels in zip(
                stage_names, stages_repeats, self._stage_out_channels[1:]):
            seq = [InvertedResidual(input_channels, output_channels, 2)]
            for i in range(repeats - 1):
                seq.append(
                    InvertedResidual(output_channels, output_channels, 1))
            setattr(self, name, nn.Sequential(*seq))
            input_channels = output_channels

        output_channels = self._stage_out_channels[-1]
        self.conv5 = nn.Sequential(
            nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False),
            BN(output_channels),
            nn.ReLU(inplace=True),
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(output_channels, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif (isinstance(m, SyncBatchNorm2d)
                  or isinstance(m, nn.BatchNorm2d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 1.0 / float(n))
                m.bias.data.zero_()
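A hedged instantiation in the style of ShuffleNetV2 1.0x; the stage repeats and channel list are the commonly used ones for that width, not values stated in the source.

model = ShuffleNetV2(stages_repeats=[4, 8, 4],
                     stages_out_channels=[24, 116, 232, 464, 1024],
                     num_classes=1000, bn=None)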
Example #17
    def __init__(self, stages, bn=None):
        super(HighResolutionNet, self).__init__()

        global BN

        BN = get_bn(bn)

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn1 = BN(64)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn2 = BN(64)
        self.relu = nn.ReLU(inplace=True)

        self.stage1_cfg = stages['STAGE1']
        num_channels = self.stage1_cfg['NUM_CHANNELS'][0]
        block = blocks_dict[self.stage1_cfg['BLOCK']]
        num_blocks = self.stage1_cfg['NUM_BLOCKS'][0]
        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
        stage1_out_channel = block.expansion*num_channels

        self.stage2_cfg = stages['STAGE2']
        num_channels = self.stage2_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage2_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition1 = self._make_transition_layer(
            [stage1_out_channel], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        self.stage3_cfg = stages['STAGE3']
        num_channels = self.stage3_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage3_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition2 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        self.stage4_cfg = stages['STAGE4']
        num_channels = self.stage4_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage4_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition3 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True)

        # Classification Head
        self.incre_modules, self.downsamp_modules, \
            self.final_layer = self._make_head(pre_stage_channels)

        self.classifier = nn.Linear(2048, 1000)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif (isinstance(m, SyncBatchNorm2d) or isinstance(m, nn.BatchNorm2d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 1.0/float(n))
                m.bias.data.zero_()
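The constructor above reads BLOCK, NUM_BLOCKS and NUM_CHANNELS from each stage entry and hands the whole entry to _make_stage. A hedged sketch of a stages dict in an HRNet-W18-like layout follows; the values, and the extra keys _make_stage may require, are assumptions.

# Illustrative stages config; key names follow the lookups above, values are assumptions.
stages = {
    'STAGE1': {'BLOCK': 'BOTTLENECK', 'NUM_BLOCKS': [4], 'NUM_CHANNELS': [64],
               'NUM_MODULES': 1, 'NUM_BRANCHES': 1, 'FUSE_METHOD': 'SUM'},
    'STAGE2': {'BLOCK': 'BASIC', 'NUM_BLOCKS': [4, 4], 'NUM_CHANNELS': [18, 36],
               'NUM_MODULES': 1, 'NUM_BRANCHES': 2, 'FUSE_METHOD': 'SUM'},
    'STAGE3': {'BLOCK': 'BASIC', 'NUM_BLOCKS': [4, 4, 4], 'NUM_CHANNELS': [18, 36, 72],
               'NUM_MODULES': 4, 'NUM_BRANCHES': 3, 'FUSE_METHOD': 'SUM'},
    'STAGE4': {'BLOCK': 'BASIC', 'NUM_BLOCKS': [4, 4, 4, 4], 'NUM_CHANNELS': [18, 36, 72, 144],
               'NUM_MODULES': 3, 'NUM_BRANCHES': 4, 'FUSE_METHOD': 'SUM'},
}
model = HighResolutionNet(stages, bn=None)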
Example #18
    def __init__(self,
                 num_classes=1000,
                 scale=1.0,
                 inverted_residual_setting=None,
                 round_nearest=8,
                 block=InvertedResidual,
                 dropout=0.2,
                 bn=None,
                 num_experts=1,
                 final_condconv=False,
                 fc_condconv=False,
                 combine_kernel=False):

        super(MobileNetV2CondConv, self).__init__()

        global BN
        BN = get_bn(bn)
        self.logger = get_logger(__name__)

        self.fc_condconv = fc_condconv
        self.logger.info('Number of experts is {}'.format(num_experts))
        self.logger.info(
            'Replace finalconv with CondConv: {}'.format(final_condconv))
        self.logger.info('Replace fc with CondConv: {}'.format(fc_condconv))
        self.logger.info(
            'Combine kernels to implement CondConv: {}'.format(combine_kernel))

        if block is None:
            block = InvertedResidual
        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(
                inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(
                                 inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * scale, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, scale),
                                            round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * scale, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(
                    block(input_channel,
                          output_channel,
                          stride,
                          expand_ratio=t,
                          num_experts=num_experts,
                          combine_kernel=combine_kernel))
                input_channel = output_channel
        # building last several layers
        if final_condconv:
            features.append(
                CondConvBNReLU(input_channel,
                               self.last_channel,
                               kernel_size=1,
                               num_experts=num_experts,
                               combine_kernel=combine_kernel))
        else:
            features.append(
                ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # building classifier
        if fc_condconv:
            # change kernel_size to the size of feature maps
            self.dropout = nn.Dropout(0.2)
            self.classifier = CondConv2d(self.last_channel,
                                         num_classes,
                                         kernel_size=1,
                                         bias=False,
                                         num_experts=num_experts,
                                         combine_kernel=combine_kernel)
            self.classifier_router = BasicRouter(self.last_channel,
                                                 num_experts)
        else:
            self.classifier = nn.Sequential(
                nn.Dropout(0.2),
                nn.Linear(self.last_channel, num_classes),
            )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d) or isinstance(
                    m, link.nn.SyncBatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)