def __init__(self, prev_channels, current_channels, norm_layer=ABN, norm_act="relu"):
    super().__init__()
    transition_layers = []
    for prev_ch, curr_ch in zip(prev_channels, current_channels):
        if prev_ch != curr_ch:  # this case only happens between 1st and 2nd stage
            layers = [conv3x3(prev_ch, curr_ch), norm_layer(curr_ch, activation=norm_act)]
            transition_layers.append(nn.Sequential(*layers))
        else:
            transition_layers.append(nn.Identity())
    if len(current_channels) > len(prev_channels):  # only works for ONE extra branch
        layers = [
            conv3x3(prev_channels[-1], current_channels[-1], 2),
            norm_layer(current_channels[-1], activation=norm_act),
        ]
        transition_layers.append(nn.Sequential(*layers))
    self.trans_layers = nn.ModuleList(transition_layers)
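# Hedged usage sketch (shapes and channel lists are assumed, not from the repo): the
# transition maps each existing branch to its new width and, when the next stage has one
# extra branch, creates it from the last (lowest-resolution) branch with a stride-2 conv.
import torch
import torch.nn as nn

prev_channels, current_channels = [256, 36], [18, 36, 72]
trans = nn.ModuleList()
for prev_ch, curr_ch in zip(prev_channels, current_channels):
    trans.append(nn.Conv2d(prev_ch, curr_ch, 3, padding=1, bias=False) if prev_ch != curr_ch else nn.Identity())
trans.append(nn.Conv2d(prev_channels[-1], current_channels[-1], 3, stride=2, padding=1, bias=False))

feats = [torch.randn(1, 256, 56, 56), torch.randn(1, 36, 28, 28)]
outs = [trans[0](feats[0]), trans[1](feats[1]), trans[2](feats[-1])]
print([tuple(o.shape) for o in outs])  # [(1, 18, 56, 56), (1, 36, 28, 28), (1, 72, 14, 14)]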
def _make_stem(self, stem_type, stem_width, in_channels, norm_layer, norm_act):
    assert stem_type in {"", "deep", "space2depth"}, f"Stem type {stem_type} is not supported"
    if stem_type == "space2depth":
        # in the paper they use conv1x1 but in code conv3x3 (which seems better)
        self.conv1 = nn.Sequential(SpaceToDepth(), conv3x3(in_channels * 16, stem_width))
        self.bn1 = norm_layer(stem_width, activation=norm_act)
        self.maxpool = nn.Identity()  # not used but needed for code compatibility
    else:
        if stem_type == "deep":
            self.conv1 = nn.Sequential(
                conv3x3(in_channels, stem_width // 2, 2),
                norm_layer(stem_width // 2, activation=norm_act),
                conv3x3(stem_width // 2, stem_width // 2),
                norm_layer(stem_width // 2, activation=norm_act),
                conv3x3(stem_width // 2, stem_width),
            )
        else:
            self.conv1 = nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(stem_width, activation=norm_act)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
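# SpaceToDepth is assumed here to behave like F.pixel_unshuffle (the exact pixel ordering
# may differ, but the shape math is the same): block_size=4 turns (N, C, H, W) into
# (N, 16 * C, H / 4, W / 4), which is why the stem conv takes in_channels * 16.
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 224, 224)
print(F.pixel_unshuffle(x, downscale_factor=4).shape)  # torch.Size([1, 48, 56, 56])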
def __init__( self, pretrained="coco", # not used here for proper signature encoder_name="resnet50", encoder_weights="imagenet", pyramid_channels=256, num_classes=80, # drop_connect_rate=0, # TODO: add encoder_norm_layer="abn", encoder_norm_act="relu", decoder_norm_layer="none", # None by default to match detectron & mmdet versions decoder_norm_act="relu", **encoder_params, ): super().__init__() self.encoder = get_encoder( encoder_name, norm_layer=encoder_norm_layer, norm_act=encoder_norm_act, encoder_weights=encoder_weights, **encoder_params, ) norm_layer = bn_from_name(decoder_norm_layer) self.pyramid6 = nn.Sequential( conv3x3(self.encoder.out_shapes[0], pyramid_channels, 2, bias=True), norm_layer(pyramid_channels, activation="identity"), ) self.pyramid7 = nn.Sequential( conv3x3(pyramid_channels, pyramid_channels, 2, bias=True), norm_layer(pyramid_channels, activation="identity"), ) self.fpn = FPN(self.encoder.out_shapes[:-2], pyramid_channels=pyramid_channels) def make_final_convs(): layers = [] for _ in range(4): layers += [ conv3x3(pyramid_channels, pyramid_channels, bias=True) ] # Norm here is fine for GroupNorm but for BN it should be implemented the other way # see EffDet for example. Maybe need to change this implementation to align with EffDet layers += [ norm_layer(pyramid_channels, activation=decoder_norm_act) ] return nn.Sequential(*layers) anchors_per_location = 9 self.cls_convs = make_final_convs() self.cls_head_conv = conv3x3(pyramid_channels, num_classes * anchors_per_location, bias=True) self.box_convs = make_final_convs() self.box_head_conv = conv3x3(pyramid_channels, 4 * anchors_per_location, bias=True) self.num_classes = num_classes self._initialize_weights()
def __init__(self, pre_channels, norm_layer=ABN, norm_act="relu"):
    super().__init__()
    head_block = Bottleneck
    head_channels = [32, 64, 128, 256]
    # increasing the number of channels on each resolution:
    # from C, 2C, 4C, 8C to 128, 256, 512, 1024
    incre_modules = []
    for pre_c, head_c in zip(pre_channels, head_channels):
        incre_modules.append(make_layer(pre_c, head_c, 1, norm_layer, norm_act))
    self.incre_modules = nn.ModuleList(incre_modules)
    # downsampling modules
    downsamp_modules = []
    for i in range(len(pre_channels) - 1):
        in_ch = head_channels[i] * head_block.expansion
        out_ch = head_channels[i + 1] * head_block.expansion
        downsamp_module = nn.Sequential(
            conv3x3(in_ch, out_ch, 2, bias=True),
            norm_layer(out_ch, activation=norm_act),
        )
        downsamp_modules.append(downsamp_module)
    self.downsamp_modules = nn.ModuleList(downsamp_modules)
    self.final_layer = nn.Sequential(
        conv1x1(head_channels[3] * head_block.expansion, 2048, bias=True),
        norm_layer(2048, activation=norm_act),
    )
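# Sketch of how this head is typically consumed (shapes assume a w18-style encoder at
# 224 input): each branch is widened by its incre module, then progressively
# downsampled and added to the next branch before the final 1x1 expansion to 2048.
import torch
import torch.nn as nn

chs = [32 * 4, 64 * 4, 128 * 4, 256 * 4]  # head_channels * Bottleneck.expansion
feats = [torch.randn(1, c, 56 // 2 ** i, 56 // 2 ** i) for i, c in enumerate(chs)]
downs = [nn.Conv2d(chs[i], chs[i + 1], 3, stride=2, padding=1) for i in range(3)]
y = feats[0]
for i in range(3):
    y = feats[i + 1] + downs[i](y)
print(y.shape)  # (1, 1024, 7, 7) before the final 1x1 conv expands it to 2048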
def __init__( self, encoder_name="resnet34", encoder_weights="imagenet", pyramid_channels=256, num_classes=80, norm_layer="abn", norm_act="relu", **encoder_params, ): super().__init__() self.encoder = get_encoder( encoder_name, norm_layer=norm_layer, norm_act=norm_act, encoder_weights=encoder_weights, **encoder_params, ) norm_layer = bn_from_name(norm_layer) self.pyramid6 = conv3x3(256, 256, 2, bias=True) self.pyramid7 = conv3x3(256, 256, 2, bias=True) self.fpn = FPN( self.encoder.out_shapes[:-2], pyramid_channels=pyramid_channels, ) def make_head(out_size): layers = [] for _ in range(4): # some implementations don't use BN here but I think it's needed # TODO: test how it affects results layers += [ nn.Conv2d(256, 256, 3, padding=1), norm_layer(256, activation=norm_act) ] # layers += [nn.Conv2d(256, 256, 3, padding=1), nn.ReLU()] layers += [nn.Conv2d(256, out_size, 3, padding=1)] return nn.Sequential(*layers) self.ratios = [1.0, 2.0, 0.5] self.scales = [4 * 2**(i / 3) for i in range(3)] anchors = len(self.ratios) * len(self.scales) # 9 self.cls_head = make_head(num_classes * anchors) self.box_head = make_head(4 * anchors)
def _make_fuse_layers(self, norm_layer, norm_act):
    if self.num_branches == 1:
        return None
    num_branches = self.num_branches
    num_inchannels = self.num_inchannels
    fuse_layers = []
    for i in range(num_branches):
        fuse_layer = []
        for j in range(num_branches):
            if j > i:
                # lower-resolution branch: 1x1 conv to match channels, then upsample
                fuse_layer.append(
                    nn.Sequential(
                        conv1x1(num_inchannels[j], num_inchannels[i]),
                        norm_layer(num_inchannels[i], activation="identity"),
                        nn.Upsample(scale_factor=2 ** (j - i), mode="nearest"),
                    )
                )
            elif j == i:
                fuse_layer.append(nn.Identity())
            else:
                # higher-resolution branch: chain of (i - j) stride-2 convs; only the
                # last one changes the channel count and has no activation
                conv3x3s = []
                for k in range(i - j):
                    if k == i - j - 1:
                        num_outchannels_conv3x3 = num_inchannels[i]
                        conv3x3s.append(
                            nn.Sequential(
                                conv3x3(num_inchannels[j], num_outchannels_conv3x3, 2),
                                norm_layer(num_outchannels_conv3x3, activation="identity"),
                            )
                        )
                    else:
                        num_outchannels_conv3x3 = num_inchannels[j]
                        conv3x3s.append(
                            nn.Sequential(
                                conv3x3(num_inchannels[j], num_outchannels_conv3x3, 2),
                                norm_layer(num_outchannels_conv3x3, activation=norm_act),
                            )
                        )
                fuse_layer.append(nn.Sequential(*conv3x3s))
        fuse_layers.append(nn.ModuleList(fuse_layer))
    return nn.ModuleList(fuse_layers)
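# Hedged sketch of the fuse rule (channel counts assumed): branch j reaches branch i's
# resolution via nearest upsampling when j > i, or via stride-2 convs when j < i, and
# the aligned tensors are summed element-wise.
import torch
import torch.nn as nn

x0 = torch.randn(1, 18, 56, 56)  # branch 0, high resolution
x1 = torch.randn(1, 36, 28, 28)  # branch 1, low resolution
up = nn.Sequential(nn.Conv2d(36, 18, 1, bias=False), nn.Upsample(scale_factor=2, mode="nearest"))
down = nn.Conv2d(18, 36, 3, stride=2, padding=1, bias=False)
print((x0 + up(x1)).shape)    # fuse into branch 0: (1, 18, 56, 56)
print((x1 + down(x0)).shape)  # fuse into branch 1: (1, 36, 28, 28)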
def _make_stem(self, stem_type, stem_width, in_channels, norm_layer, norm_act):
    supported_stems = {"", "deep", "space2depth", "space2depth_2"}
    assert stem_type in supported_stems, f"Stem type {stem_type} is not supported"
    if stem_type == "space2depth":
        # in the paper they use conv1x1 but in code conv3x3 (which seems better)
        self.conv1 = nn.Sequential(SpaceToDepth(block_size=4), conv3x3(in_channels * 16, stem_width))
        self.bn1 = norm_layer(stem_width, activation=norm_act)
        self.maxpool = nn.Identity()  # not used but needed for code compatibility
    elif stem_type == "space2depth_2":
        # the original S2D stem is ~4% faster than the default; this version is only ~2%
        # faster than the default but can be used as an encoder
        self.conv1 = nn.Sequential(
            SpaceToDepth(block_size=2),
            conv3x3(in_channels * 4, stem_width // 4),
            norm_layer(stem_width // 4, activation=norm_act),
        )
        self.bn1 = nn.Identity()  # the name is confusing but it's kept for compatibility
        self.maxpool = nn.Sequential(
            SpaceToDepth(block_size=2),
            conv3x3(stem_width, stem_width),
            norm_layer(stem_width, activation=norm_act),
        )
    else:
        if stem_type == "deep":
            self.conv1 = nn.Sequential(
                conv3x3(in_channels, stem_width // 2, 2),
                norm_layer(stem_width // 2, activation=norm_act),
                conv3x3(stem_width // 2, stem_width // 2),
                norm_layer(stem_width // 2, activation=norm_act),
                conv3x3(stem_width // 2, stem_width),
            )
        else:
            self.conv1 = nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(stem_width, activation=norm_act)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
def __init__(self, in_planes, growth_rate, drop_rate=0.0, memory_efficient=False, norm_layer=ABN, norm_act="relu"):
    super(_DenseLayer, self).__init__()
    width = growth_rate * self.expansion
    self.norm1 = norm_layer(in_planes, activation=norm_act)
    self.conv1 = conv1x1(in_planes, width)
    self.norm2 = norm_layer(width, activation=norm_act)
    self.conv2 = conv3x3(width, growth_rate)
    self.drop_rate = drop_rate
    self.memory_efficient = memory_efficient
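# Plain-BN sketch of the same bottleneck shape math (expansion assumed to be 4, as in
# standard DenseNet): in_planes -> 4 * growth_rate via 1x1, then -> growth_rate new
# feature maps via 3x3, which are later concatenated onto the running feature stack.
import torch
import torch.nn as nn

growth_rate, in_planes = 32, 256
width = growth_rate * 4
layer = nn.Sequential(
    nn.BatchNorm2d(in_planes), nn.ReLU(inplace=True), nn.Conv2d(in_planes, width, 1, bias=False),
    nn.BatchNorm2d(width), nn.ReLU(inplace=True), nn.Conv2d(width, growth_rate, 3, padding=1, bias=False),
)
print(layer(torch.randn(2, in_planes, 14, 14)).shape)  # (2, 32, 14, 14)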
def __init__(
    self,
    block=None,
    layers=None,
    pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    in_channels=3,
    use_se=False,
    groups=1,
    base_width=64,
    deep_stem=False,
    output_stride=32,
    norm_layer="abn",
    norm_act="relu",
    antialias=False,
    encoder=False,
    drop_rate=0.0,
    drop_connect_rate=0.0,
    global_pool="avg",
    init_bn0=True,
):
    stem_width = 64
    norm_layer = bn_from_name(norm_layer)
    self.inplanes = stem_width
    self.num_classes = num_classes
    self.groups = groups
    self.base_width = base_width
    self.block = block
    self.expansion = block.expansion
    self.norm_act = norm_act
    self.block_idx = 0
    self.num_blocks = sum(layers)
    self.drop_connect_rate = drop_connect_rate
    super(ResNet, self).__init__()

    if deep_stem:
        self.conv1 = nn.Sequential(
            conv3x3(in_channels, stem_width // 2, 2),
            norm_layer(stem_width // 2, activation=norm_act),
            conv3x3(stem_width // 2, stem_width // 2),
            norm_layer(stem_width // 2, activation=norm_act),
            conv3x3(stem_width // 2, stem_width),
        )
    else:
        self.conv1 = nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
    self.bn1 = norm_layer(stem_width, activation=norm_act)
    self.maxpool = nn.MaxPool2d(
        kernel_size=3,
        stride=2,
        padding=0 if use_se else 1,
        ceil_mode=True if use_se else False,
    )
    if output_stride not in [8, 16, 32]:
        raise ValueError("Output stride should be in [8, 16, 32]")
    if output_stride == 8:
        stride_3, stride_4, dilation_3, dilation_4 = 1, 1, 2, 4
    elif output_stride == 16:
        stride_3, stride_4, dilation_3, dilation_4 = 2, 1, 1, 2
    elif output_stride == 32:
        stride_3, stride_4, dilation_3, dilation_4 = 2, 2, 1, 1
    largs = dict(use_se=use_se, norm_layer=norm_layer, norm_act=norm_act, antialias=antialias)
    self.layer1 = self._make_layer(64, layers[0], stride=1, **largs)
    self.layer2 = self._make_layer(128, layers[1], stride=2, **largs)
    self.layer3 = self._make_layer(256, layers[2], stride=stride_3, dilation=dilation_3, **largs)
    self.layer4 = self._make_layer(512, layers[3], stride=stride_4, dilation=dilation_4, **largs)
    self.global_pool = GlobalPool2d(global_pool)
    self.num_features = 512 * self.expansion
    self.encoder = encoder
    if not encoder:
        self.dropout = nn.Dropout(p=drop_rate, inplace=True)
        self.last_linear = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
    else:
        self.forward = self.encoder_features
    self._initialize_weights(init_bn0)
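# The stride/dilation schedule encoded above, as a lookup table for reference:
# output_stride -> (stride_3, stride_4, dilation_3, dilation_4)
os_to_cfg = {8: (1, 1, 2, 4), 16: (2, 1, 1, 2), 32: (2, 2, 1, 1)}
print(os_to_cfg[16])  # layer3 still strides, layer4 switches to dilation 2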
def __init__(
    self,
    width=18,
    small=False,
    pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    in_channels=3,
    norm_layer="abn",
    norm_act="relu",
    encoder=False,
):
    super(HighResolutionNet, self).__init__()
    stem_width = 64
    norm_layer = bn_from_name(norm_layer)
    self.bn_args = bn_args = {"norm_layer": norm_layer, "norm_act": norm_act}
    self.conv1 = conv3x3(in_channels, stem_width, stride=2)
    self.bn1 = norm_layer(stem_width, activation=norm_act)
    self.conv2 = conv3x3(stem_width, stem_width, stride=2)
    self.bn2 = norm_layer(stem_width, activation=norm_act)

    channels = [width, width * 2, width * 4, width * 8]
    n_blocks = [2 if small else 4] * 4
    self.layer1 = make_layer(stem_width, stem_width, n_blocks[0], **bn_args)
    self.transition1 = TransitionBlock([stem_width * Bottleneck.expansion], channels[:2], **bn_args)
    self.stage2 = self._make_stage(n_modules=1, n_branches=2, n_blocks=n_blocks[:2], n_chnls=channels[:2])
    self.transition2 = TransitionBlock(channels[:2], channels[:3], **bn_args)
    self.stage3 = self._make_stage(
        n_modules=3 if small else 4, n_branches=3, n_blocks=n_blocks[:3], n_chnls=channels[:3]
    )
    self.transition3 = TransitionBlock(channels[:3], channels, **bn_args)
    self.stage4 = self._make_stage(
        n_modules=2 if small else 3, n_branches=4, n_blocks=n_blocks, n_chnls=channels
    )
    self.encoder = encoder
    if encoder:
        self.forward = self.encoder_features
    else:
        # classification head
        self.cls_head = HRClassificationHead(channels, **bn_args)
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.last_linear = nn.Linear(2048, num_classes)
    # initialize weights
    initialize(self)
def __init__(
    self,
    block=None,
    layers=None,
    pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    in_channels=3,
    use_se=False,
    groups=1,
    base_width=64,
    deep_stem=False,
    dilated=False,
    norm_layer="abn",
    norm_act="relu",
    antialias=False,
    encoder=False,
    drop_rate=0.0,
    global_pool="avg",
    init_bn0=True,
):
    stem_width = 64
    if norm_layer.lower() == "abn":
        norm_act = "relu"
    norm_layer = bn_from_name(norm_layer)
    self.inplanes = stem_width
    self.num_classes = num_classes
    self.groups = groups
    self.base_width = base_width
    self.drop_rate = drop_rate
    self.block = block
    self.expansion = block.expansion
    self.dilated = dilated
    self.norm_act = norm_act
    super(ResNet, self).__init__()

    if deep_stem:
        self.conv1 = nn.Sequential(
            conv3x3(in_channels, stem_width // 2, 2),
            norm_layer(stem_width // 2, activation=norm_act),
            conv3x3(stem_width // 2, stem_width // 2, 2),
            norm_layer(stem_width // 2, activation=norm_act),
            conv3x3(stem_width // 2, stem_width),
        )
    else:
        self.conv1 = nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
    self.bn1 = norm_layer(stem_width, activation=norm_act)
    if deep_stem:
        self.maxpool = nn.Sequential()  # don't need it
    elif antialias:
        self.maxpool = nn.Sequential(nn.MaxPool2d(kernel_size=3, stride=1, padding=1), BlurPool())
    else:
        # for SE-ResNets the first maxpool is slightly different
        self.maxpool = nn.MaxPool2d(
            kernel_size=3,
            stride=2,
            padding=0 if use_se else 1,
            ceil_mode=True if use_se else False,
        )
    # output stride is 8 with dilated convolutions and 32 without
    stride_3_4 = 1 if self.dilated else 2
    dilation_3 = 2 if self.dilated else 1
    dilation_4 = 4 if self.dilated else 1
    largs = dict(use_se=use_se, norm_layer=norm_layer, norm_act=norm_act, antialias=antialias)
    self.layer1 = self._make_layer(64, layers[0], stride=1, **largs)
    self.layer2 = self._make_layer(128, layers[1], stride=2, **largs)
    self.layer3 = self._make_layer(256, layers[2], stride=stride_3_4, dilation=dilation_3, **largs)
    self.layer4 = self._make_layer(512, layers[3], stride=stride_3_4, dilation=dilation_4, **largs)
    self.global_pool = GlobalPool2d(global_pool)
    self.num_features = 512 * self.expansion
    self.encoder = encoder
    if not encoder:
        self.last_linear = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
    else:
        self.forward = self.encoder_features
    self._initialize_weights(init_bn0)
def __init__( self, encoder_name="hrnet_w18", encoder_weights="imagenet", pretrained=None, # not used num_classes=1, last_upsample=True, OCR=False, drop_rate=0, norm_layer="inplace_abn", # use memory efficient by default norm_act="leaky_relu", **encoder_params, ): super().__init__() self.encoder = get_encoder( encoder_name, encoder_weights=encoder_weights, norm_layer=norm_layer, norm_act=norm_act, **encoder_params, ) norm_layer = bn_from_name(norm_layer) final_channels = sum(self.encoder.out_shapes[:4]) self.OCR = OCR if OCR: self.conv3x3 = nn.Sequential( conv3x3(final_channels, 512, bias=True), norm_layer(512, activation=norm_act), ) self.ocr_gather_head = SpatialOCR_Gather() self.ocr_distri_head = SpatialOCR(in_channels=512, key_channels=256, out_channels=512, norm_layer=norm_layer, norm_act=norm_act) self.head = conv1x1(512, num_classes, bias=True) self.aux_head = nn.Sequential( # in OCR first conv is 3x3 conv3x3(final_channels, final_channels, bias=True), norm_layer(final_channels, activation=norm_act), conv1x1(final_channels, num_classes, bias=True), ) else: self.head = nn.Sequential( conv1x1(final_channels, final_channels, bias=True), norm_layer(final_channels, activation=norm_act), conv1x1(final_channels, num_classes, bias=True), ) up_kwargs = dict(mode="bilinear", align_corners=True) self.up_x2 = nn.Upsample(scale_factor=2, **up_kwargs) self.up_x4 = nn.Upsample(scale_factor=4, **up_kwargs) self.up_x8 = nn.Upsample(scale_factor=8, **up_kwargs) self.last_upsample = nn.Upsample( scale_factor=4, **up_kwargs) if last_upsample else nn.Identity() self.dropout = nn.Dropout2d( drop_rate) # can't use inplace. it would raise a backprop error self.name = f"segm-{encoder_name}" # use lower momemntum patch_bn_mom(self) self._init_weights()
def __init__(
    self,
    blocks_args=None,
    width_multiplier=None,
    depth_multiplier=None,
    pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    in_channels=3,
    output_stride=32,
    encoder=False,
    drop_rate=0,
    drop_connect_rate=0,
    stem_size=32,
    norm_layer="abn",
    norm_act="swish",
    match_tf_same_padding=False,
):
    super().__init__()
    norm_layer = bn_from_name(norm_layer)
    self.norm_layer = norm_layer
    self.norm_act = norm_act
    self.width_multiplier = width_multiplier
    self.depth_multiplier = depth_multiplier
    stem_size = make_divisible(stem_size * width_multiplier)
    self.conv_stem = conv3x3(in_channels, stem_size, stride=2)
    self.bn1 = norm_layer(stem_size, activation=norm_act)
    in_channels = stem_size
    self.blocks = nn.ModuleList([])
    # modify block args to account for the output_stride strategy
    blocks_args = _patch_block_args(blocks_args, output_stride)
    for block_idx, block_arg in enumerate(blocks_args):
        block = []
        block_arg["in_channels"] = make_divisible(block_arg["in_channels"] * self.width_multiplier)
        block_arg["out_channels"] = make_divisible(block_arg["out_channels"] * self.width_multiplier)
        block_arg["norm_layer"] = norm_layer
        block_arg["norm_act"] = norm_act
        # linearly scale keep prob
        block_arg["keep_prob"] = 1 - drop_connect_rate * block_idx / len(blocks_args)
        repeats = block_arg.pop("num_repeat")
        repeats = int(math.ceil(repeats * self.depth_multiplier))
        # when dilating a conv with stride 2 we want it to have dilation // 2;
        # this prevents checkerboard artifacts with OS=16 and OS=8
        dilation = block_arg.get("dilation", 1)  # save block values
        if block_arg.pop("no_first_dilation", False):
            block_arg["dilation"] = max(1, block_arg["dilation"] // 2)
        block.append(InvertedResidual(**block_arg))
        # only the first layer in a block is strided
        block_arg["stride"] = 1
        block_arg["dilation"] = dilation
        block_arg["in_channels"] = block_arg["out_channels"]
        for _ in range(repeats - 1):
            block.append(InvertedResidual(**block_arg))
        self.blocks.append(nn.Sequential(*block))

    # head
    if encoder:
        self.forward = self.encoder_features
    else:
        out_channels = block_arg["out_channels"]
        num_features = make_divisible(1280 * width_multiplier)
        self.conv_head = conv1x1(out_channels, num_features)
        self.bn2 = norm_layer(num_features, activation=norm_act)
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(drop_rate, inplace=True)
        self.classifier = nn.Linear(num_features, num_classes)

    patch_bn(self)  # adjust epsilon
    initialize(self)
    if match_tf_same_padding:
        conv_to_same_conv(self)
        maxpool_to_same_maxpool(self)
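# Hedged sketch of the two helpers assumed above (not necessarily the repo's verbatim
# code): make_divisible rounds channels to a multiple of 8 without dropping more than
# 10%, and keep_prob decays linearly over the block stages.
def make_divisible(v, divisor=8):
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    return new_v + divisor if new_v < 0.9 * v else new_v

drop_connect_rate, n_stages = 0.2, 7
print(make_divisible(32 * 1.25))  # 40
print([round(1 - drop_connect_rate * i / n_stages, 3) for i in range(n_stages)])
# [1.0, 0.971, 0.943, 0.914, 0.886, 0.857, 0.829]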
def __init__(
    self,
    growth_rate=None,
    block_config=None,
    pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    drop_rate=0.0,
    in_channels=3,
    norm_layer="abn",
    norm_act="relu",
    deep_stem=False,
    stem_width=64,
    encoder=False,
    global_pool="avg",
    memory_efficient=True,
):
    super(DenseNet, self).__init__()
    norm_layer = bn_from_name(norm_layer)
    self.num_classes = num_classes
    if deep_stem:
        self.conv0 = nn.Sequential(
            conv3x3(in_channels, stem_width // 2, 2),
            norm_layer(stem_width // 2, activation=norm_act),
            conv3x3(stem_width // 2, stem_width // 2),
            norm_layer(stem_width // 2, activation=norm_act),
            conv3x3(stem_width // 2, stem_width, 2),
        )
    else:
        self.conv0 = nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
    self.norm0 = norm_layer(stem_width, activation=norm_act)
    self.pool0 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False)

    largs = dict(
        growth_rate=growth_rate,
        drop_rate=drop_rate,
        memory_efficient=memory_efficient,
        norm_layer=norm_layer,
        norm_act=norm_act,
    )
    in_planes = stem_width
    for i, num_layers in enumerate(block_config):
        block = _DenseBlock(num_layers, in_planes, **largs)
        setattr(self, "denseblock{}".format(i + 1), block)
        in_planes += num_layers * growth_rate
        if i != len(block_config) - 1:
            trans = _Transition(in_planes=in_planes, out_planes=in_planes // 2)
            setattr(self, "transition{}".format(i + 1), trans)
            in_planes //= 2
    # final normalization
    self.norm5 = nn.BatchNorm2d(in_planes)
    # linear layer
    self.encoder = encoder
    if not encoder:
        self.global_pool = GlobalPool2d(global_pool)
        self.classifier = nn.Linear(in_planes, num_classes)
    else:
        assert len(block_config) == 4, "Need 4 blocks to use as encoder"
        self.forward = self.encoder_features
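# Channel bookkeeping implied by the loop above (DenseNet-121 numbers assumed): each
# dense block adds num_layers * growth_rate maps, each transition halves the count.
growth_rate, stem_width, block_config = 32, 64, (6, 12, 24, 16)
in_planes = stem_width
for i, num_layers in enumerate(block_config):
    in_planes += num_layers * growth_rate
    if i != len(block_config) - 1:
        in_planes //= 2
print(in_planes)  # 1024 features entering norm5 and the classifier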
def __init__(
    self,
    layers=None,
    pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    in_channels=3,
    width_factor=1.0,
    output_stride=32,
    norm_layer="inplaceabn",
    norm_act="leaky_relu",
    encoder=False,
    drop_rate=0.0,
    drop_connect_rate=0.0,
):
    nn.Module.__init__(self)
    stem_width = int(64 * width_factor)
    norm_layer = bn_from_name(norm_layer)
    self.inplanes = stem_width
    self.num_classes = num_classes
    self.groups = 1  # not really used but needed inside _make_layer
    self.base_width = 64  # used inside _make_layer
    self.norm_act = norm_act
    self.block_idx = 0
    self.num_blocks = sum(layers)
    self.drop_connect_rate = drop_connect_rate

    # in the paper they use conv1x1 but in code conv3x3 (which seems better)
    self.conv1 = nn.Sequential(SpaceToDepth(), conv3x3(in_channels * 16, stem_width))
    self.bn1 = norm_layer(stem_width, activation=norm_act)
    self.maxpool = nn.Identity()  # not used but needed for code compatibility

    if output_stride not in [8, 16, 32]:
        raise ValueError("Output stride should be in [8, 16, 32]")
    # TODO: add OS support later
    # if output_stride == 8:
    #     stride_3, stride_4, dilation_3, dilation_4 = 1, 1, 2, 4
    # elif output_stride == 16:
    #     stride_3, stride_4, dilation_3, dilation_4 = 2, 1, 1, 2
    # elif output_stride == 32:
    stride_3, stride_4, dilation_3, dilation_4 = 2, 2, 1, 1
    largs = dict(use_se=True, norm_layer=norm_layer, norm_act=norm_act, antialias=True)
    self.block = TBasicBlock
    self.expansion = TBasicBlock.expansion
    self.layer1 = self._make_layer(stem_width, layers[0], stride=1, **largs)
    self.layer2 = self._make_layer(stem_width * 2, layers[1], stride=2, **largs)

    self.block = TBottleneck  # first 2 stages use Basic, last 2 use Bottleneck
    self.expansion = TBottleneck.expansion
    self.layer3 = self._make_layer(stem_width * 4, layers[2], stride=stride_3, dilation=dilation_3, **largs)
    largs.update(use_se=False)  # no SE in the last layer
    self.layer4 = self._make_layer(stem_width * 8, layers[3], stride=stride_4, dilation=dilation_4, **largs)
    self.global_pool = FastGlobalAvgPool2d(flatten=True)
    self.num_features = stem_width * 8 * self.expansion
    self.encoder = encoder
    if not encoder:
        self.dropout = nn.Dropout(p=drop_rate, inplace=True)
        self.last_linear = nn.Linear(self.num_features, num_classes)
    else:
        self.forward = self.encoder_features
    self._initialize_weights(init_bn0=True)
def __init__(
    self,
    stage_fns=None,  # list of nn.Module
    block_fns=None,  # list of nn.Module
    stage_args=None,  # list of dicts
    layers=None,  # num layers in each block
    channels=None,  # it's actually output channels. 256, 512, 1024, 2048 for R50
    # pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    in_channels=3,
    norm_layer="abn",
    norm_act="leaky_relu",
    head_norm_act="leaky_relu",  # activation in head
    stem_type="default",
    # antialias=False,
    # encoder=False,
    # drop_rate=0.0,
    drop_connect_rate=0.0,
    head_width=2048,
    stem_width=64,
    head_type="default",  # type of head
):
    norm_layer = bn_from_name(norm_layer)
    self.num_classes = num_classes
    self.norm_act = norm_act
    self.block_idx = 0  # for drop connect
    self.drop_connect_rate = drop_connect_rate
    super().__init__()

    first_norm = nn.Identity() if block_fns[0].startswith("Pre") else norm_layer(stem_width, activation=norm_act)
    if stem_type == "default":
        self.stem_conv1 = nn.Sequential(conv3x3(in_channels, stem_width, stride=2), first_norm)
    elif stem_type == "s2d":
        # instead of the default stem use Space2Depth followed by conv. No norm here because
        # there is one at the beginning of DarkStage (non-PreAct versions do have a norm)
        self.stem_conv1 = nn.Sequential(
            SpaceToDepth(block_size=2),
            conv3x3(in_channels * 4, stem_width),
            first_norm,
        )
    else:
        raise ValueError(f"Stem type `{stem_type}` is not supported")

    bn_args = dict(norm_layer=norm_layer, norm_act=norm_act)
    block_name_to_module = {
        "XX": SimpleBasicBlock,
        "Pre_XX": SimplePreActBasicBlock,
        "Pre_XX_Res2": SimplePreActRes2BasicBlock,
        "Btl": SimpleBottleneck,
        "Pre_Btl": SimplePreActBottleneck,
        "IR": SimpleInvertedResidual,
        "Pre_IR": SimplePreActInvertedResidual,
        "Sep2": SimpleSeparable_2,
        "Pre_Sep2": SimplePreActSeparable_2,
        "Sep3": SimpleSeparable_3,
        "Pre_Custom_2": PreBlock_2,
    }
    stage_name_to_module = {"simpl": SimpleStage}
    # set stride=2 for all blocks
    # using **{**bn_args, **stage_args} allows updating the norm layer for a particular stage
    self.layer1 = stage_name_to_module[stage_fns[0]](
        block_fn=block_name_to_module[block_fns[0]],
        in_chs=stem_width,
        out_chs=channels[0],
        num_blocks=layers[0],
        stride=2,
        **{**bn_args, **stage_args[0]},
    )
    self.layer2 = stage_name_to_module[stage_fns[1]](
        block_fn=block_name_to_module[block_fns[1]],
        in_chs=channels[0],
        out_chs=channels[1],
        num_blocks=layers[1],
        stride=2,
        **{**bn_args, **stage_args[1]},
    )
    self.layer3 = stage_name_to_module[stage_fns[2]](
        block_fn=block_name_to_module[block_fns[2]],
        in_chs=channels[1],
        out_chs=channels[2],
        num_blocks=layers[2],
        stride=2,
        **{**bn_args, **stage_args[2]},
    )
    extra_stage3_filters = stage_args[2].get("filter_steps", 0) * (layers[2] - 1)
    self.layer4 = stage_name_to_module[stage_fns[3]](
        block_fn=block_name_to_module[block_fns[3]],
        in_chs=channels[2] + extra_stage3_filters,
        out_chs=channels[3],
        num_blocks=layers[3],
        stride=2,
        **{**bn_args, **stage_args[3]},
    )
    extra_stage4_filters = stage_args[3].get("filter_steps", 0) * (layers[3] - 1)
    channels[3] += extra_stage4_filters  # TODO: rewrite cleaner instead of modifying in place
    last_norm = norm_layer(channels[3], activation=norm_act) if block_fns[0].startswith("Pre") else nn.Identity()

    if head_type == "mobilenetv3":
        # MobileNetV3-style head: GAP first, then expand convs
        self.head = nn.Sequential(
            last_norm,
            FastGlobalAvgPool2d(flatten=True),
            nn.Linear(channels[3], head_width),
            pt.modules.activations.activation_from_name(head_norm_act),
        )
        self.last_linear = nn.Linear(head_width, num_classes)
    elif head_type == "mobilenetv3_norm":
        # same as mobilenetv3 but with a norm after the expansion
        self.head = nn.Sequential(
            last_norm,
            FastGlobalAvgPool2d(flatten=True),
            nn.Linear(channels[3], head_width),
            nn.BatchNorm1d(head_width),
            pt.modules.activations.activation_from_name(head_norm_act),
        )
        self.last_linear = nn.Linear(head_width, num_classes)
    elif head_type == "default":
        self.head = nn.Sequential(
            last_norm,
            conv1x1(channels[3], head_width),
            norm_layer(head_width, activation=head_norm_act),
            FastGlobalAvgPool2d(flatten=True),
        )
        self.last_linear = nn.Linear(head_width, num_classes)
    elif head_type == "default_nonorm":
        # when used with angular losses we don't want a norm in the head
        self.head = nn.Sequential(
            last_norm,
            conv1x1(channels[3], head_width, bias=True),  # need bias because not followed by norm
            FastGlobalAvgPool2d(flatten=True),
        )
        self.last_linear = nn.Linear(head_width, num_classes)
    elif head_type == "mlp_bn_fc_bn":
        self.head = nn.Sequential(
            last_norm,
            conv1x1(channels[3], channels[3]),
            FastGlobalAvgPool2d(flatten=True),
            nn.BatchNorm1d(channels[3]),
            pt.modules.activations.activation_from_name(head_norm_act),
            nn.Linear(channels[3], head_width, bias=False),
            nn.BatchNorm1d(head_width, affine=False),
        )
        self.last_linear = nn.Linear(head_width, num_classes)
    elif head_type == "mlp_bn_fc":
        # same as above but without the last BN
        self.head = nn.Sequential(
            last_norm,
            conv1x1(channels[3], channels[3]),
            FastGlobalAvgPool2d(flatten=True),
            nn.BatchNorm1d(channels[3]),
            pt.modules.activations.activation_from_name(head_norm_act),
            nn.Linear(channels[3], head_width, bias=False),
        )
        self.last_linear = nn.Linear(head_width, num_classes)
    elif head_type == "mlp_2":
        assert isinstance(head_width, (tuple, list)), head_width
        self.head = nn.Sequential(
            # like the MobileNetV3 head: GAP first, then MLP convs
            last_norm,
            FastGlobalAvgPool2d(flatten=True),
            nn.Linear(channels[3], head_width[0]),
            nn.BatchNorm1d(head_width[0]),
            pt.modules.activations.activation_from_name(head_norm_act),
            nn.Linear(head_width[0], head_width[1]),
            nn.BatchNorm1d(head_width[1]),
            pt.modules.activations.activation_from_name(head_norm_act),
        )
        self.last_linear = nn.Linear(head_width[1], num_classes)
    elif head_type == "mlp_3":
        assert isinstance(head_width, (tuple, list)), head_width
        self.head = nn.Sequential(
            # like the MobileNetV3 head: GAP first, then MLP convs
            last_norm,
            FastGlobalAvgPool2d(flatten=True),
            nn.Linear(channels[3], head_width[0]),
            nn.BatchNorm1d(head_width[0]),
            pt.modules.activations.activation_from_name(head_norm_act),
            nn.Linear(head_width[0], head_width[1]),
            nn.BatchNorm1d(head_width[1]),
            pt.modules.activations.activation_from_name(head_norm_act),
            nn.Linear(head_width[1], head_width[2]),
            nn.BatchNorm1d(head_width[2]),
            pt.modules.activations.activation_from_name(head_norm_act),
        )
        self.last_linear = nn.Linear(head_width[2], num_classes)
    else:
        raise ValueError(f"Head type: {head_type} is not supported!")
    initialize(self)
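# Why the "mobilenetv3" heads pool first: expanding after GAP costs a Linear on an
# (N, C) vector instead of a 1x1 conv on the full (N, C, H, W) map. A rough multiply
# count for assumed C=512, 7x7 features, expansion to 2048:
C, H, W, E = 512, 7, 7, 2048
print(C * E * H * W, "vs", C * E)  # 51_380_224 vs 1_048_576 multiplies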
def __init__(
    self,
    stage_fn=None,
    block_fn=None,
    layers=None,  # num layers in each block
    channels=None,  # it's actually output channels. 256, 512, 1024, 2048 for R50
    pretrained=None,  # not used. here for proper signature
    num_classes=1000,
    in_channels=3,
    attn_type=None,
    # base_width=64,
    stem_type="default",
    norm_layer="abn",
    norm_act="leaky_relu",
    antialias=False,
    # encoder=False,
    bottle_ratio=0.25,  # how much to shrink channels in the bottleneck layer
    no_first_csp=False,  # make the first stage a Simple Stage
    drop_rate=0.0,
    drop_connect_rate=0.0,
    expand_before_head=True,  # add an extra conv 512 -> 2048 to avoid a representational bottleneck
    mobilenetv3_head=False,  # put GAP first, then expand convs
    **block_kwargs,
):
    stem_width = 64
    norm_layer = bn_from_name(norm_layer)
    self.num_classes = num_classes
    self.norm_act = norm_act
    self.block_idx = 0  # for drop connect
    self.drop_connect_rate = drop_connect_rate
    super().__init__()

    if block_fn != SimplePreActBottleneck:
        stem_norm = norm_layer(stem_width, activation=norm_act)
    else:
        stem_norm = nn.Identity()
    if stem_type == "default":
        self.stem_conv1 = nn.Sequential(
            nn.Conv2d(3, stem_width, kernel_size=7, stride=2, padding=3, bias=False),
            stem_norm,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        first_stride = 1
    elif stem_type == "s2d":
        # instead of the default stem use Space2Depth followed by conv. No norm here because
        # there is one at the beginning of DarkStage (non-PreAct versions do have a norm)
        self.stem_conv1 = nn.Sequential(
            SpaceToDepth(block_size=2),
            conv3x3(in_channels * 4, stem_width),
            stem_norm,
            # nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        first_stride = 2

    # blocks
    largs = dict(
        stride=2,
        bottle_ratio=bottle_ratio,
        block_fn=block_fn,
        attn_type=attn_type,
        norm_layer=norm_layer,
        norm_act=norm_act,
        # antialias=antialias,
        **block_kwargs,
    )
    first_stage_fn = SimpleStage if no_first_csp else stage_fn
    # fmt: off
    self.layer1 = first_stage_fn(
        in_chs=stem_width,
        out_chs=channels[0],
        num_blocks=layers[0],
        keep_prob=self.keep_prob,
        **{**largs, "stride": first_stride},  # overwrite default stride
    )
    # **{**largs, "antialias": False}  # antialias in the first stage is too expensive
    self.layer2 = stage_fn(in_chs=channels[0], out_chs=channels[1], num_blocks=layers[1], keep_prob=self.keep_prob, **largs)
    self.layer3 = stage_fn(in_chs=channels[1], out_chs=channels[2], num_blocks=layers[2], keep_prob=self.keep_prob, **largs)
    self.layer4 = stage_fn(in_chs=channels[2], out_chs=channels[3], num_blocks=layers[3], keep_prob=self.keep_prob, **largs)
    # fmt: on

    # self.global_pool = FastGlobalAvgPool2d(flatten=True)
    # self.dropout = nn.Dropout(p=drop_rate, inplace=True)
    head_layers = []
    # this is a very dirty if, but leave it for now
    if mobilenetv3_head:
        head_layers.append(FastGlobalAvgPool2d(flatten=True))
        if channels[3] < 2048 and expand_before_head:
            head_layers.append(nn.Linear(channels[3], 2048))  # no norm here as in the original MobileNetV3 from Google
            head_layers.append(pt.modules.activations.activation_from_name(norm_act))
        head_layers.append(nn.Linear(2048 if expand_before_head else channels[3], num_classes))
    else:
        if channels[3] < 2048 and expand_before_head:
            if block_fn == SimplePreActBottleneck:
                # for PreAct add an additional BN here
                head_layers.append(norm_layer(channels[3], activation=norm_act))
            head_layers.extend([conv1x1(channels[3], 2048), norm_layer(2048, activation=norm_act)])
        head_layers.extend([
            FastGlobalAvgPool2d(flatten=True),
            nn.Linear(2048 if expand_before_head else channels[3], num_classes),
        ])
    # self.head = nn.Sequential(
    #     conv1x1(channels[3], 2048),
    #     norm_layer(activation=norm_act),
    #     # norm_layer(1024, activation=norm_act),
    #     FastGlobalAvgPool2d(flatten=True),
    #     nn.Linear(2048, num_classes),
    # )
    self.head = nn.Sequential(*head_layers)
    initialize(self)
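# self.keep_prob is consumed above but not defined in this snippet. A plausible sketch
# (an assumption, not the repo's verbatim code): linearly decay the survival probability
# across all blocks using block_idx / num_blocks bookkeeping.
class _KeepProbMixin:
    def __init__(self, drop_connect_rate, num_blocks):
        self.drop_connect_rate = drop_connect_rate
        self.num_blocks = num_blocks
        self.block_idx = 0

    @property
    def keep_prob(self):
        kp = 1 - self.drop_connect_rate * self.block_idx / self.num_blocks
        self.block_idx += 1
        return kp

kp = _KeepProbMixin(drop_connect_rate=0.2, num_blocks=4)
print([round(kp.keep_prob, 3) for _ in range(4)])  # [1.0, 0.95, 0.9, 0.85]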