def __init__(self, C_in, C_out, norm_layer, affine=True, input_size=None):
    super(FactorizedReduce, self).__init__()
    assert C_out % 2 == 0
    self.conv_1 = Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
    self.conv_2 = Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
    self.bn = norm_layer(C_out, affine=affine)
    self.flops = self.get_flop([1, 1], 2, C_in, C_out, affine,
                               input_size[0], input_size[1])
    # using Kaiming init
    for layer in [self.conv_1, self.conv_2]:
        for m in layer.modules():
            if isinstance(m, nn.Conv2d):
                weight_init.kaiming_init(m, mode='fan_in')
            elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

def __init__(self, cfg, input_shape: ShapeSpec):
    """
    The following attributes are parsed from config:
        conv_dim: the output dimension of the conv layers
        fc_dim: the feature dimension of the FC layers
        num_fc: the number of FC layers
        output_side_resolution: side resolution of the output square mask prediction
    """
    super(CoarseMaskHead, self).__init__()

    # fmt: off
    self.num_classes            = cfg.MODEL.ROI_HEADS.NUM_CLASSES
    conv_dim                    = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
    self.fc_dim                 = cfg.MODEL.ROI_MASK_HEAD.FC_DIM
    num_fc                      = cfg.MODEL.ROI_MASK_HEAD.NUM_FC
    self.output_side_resolution = cfg.MODEL.ROI_MASK_HEAD.OUTPUT_SIDE_RESOLUTION
    self.input_channels         = input_shape.channels
    self.input_h                = input_shape.height
    self.input_w                = input_shape.width
    # fmt: on

    self.conv_layers = []
    if self.input_channels > conv_dim:
        self.reduce_channel_dim_conv = Conv2d(
            self.input_channels,
            conv_dim,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
            activation=F.relu,
        )
        self.conv_layers.append(self.reduce_channel_dim_conv)

    self.reduce_spatial_dim_conv = Conv2d(
        conv_dim, conv_dim, kernel_size=2, stride=2, padding=0, bias=True, activation=F.relu
    )
    self.conv_layers.append(self.reduce_spatial_dim_conv)

    input_dim = conv_dim * self.input_h * self.input_w
    input_dim //= 4

    self.fcs = []
    for k in range(num_fc):
        fc = nn.Linear(input_dim, self.fc_dim)
        self.add_module("coarse_mask_fc{}".format(k + 1), fc)
        self.fcs.append(fc)
        input_dim = self.fc_dim

    output_dim = self.num_classes * self.output_side_resolution * self.output_side_resolution

    self.prediction = nn.Linear(self.fc_dim, output_dim)
    # use normal distribution initialization for mask prediction layer
    nn.init.normal_(self.prediction.weight, std=0.001)
    nn.init.constant_(self.prediction.bias, 0)

    for layer in self.conv_layers:
        weight_init.c2_msra_fill(layer)
    for layer in self.fcs:
        weight_init.c2_xavier_fill(layer)

def __init__(self, block_args, global_params):
    """
    Args:
        block_args (EasyDict): block args, see :class:`EfficientNet`.
        global_params (EasyDict): global args, see :class:`EfficientNet`.
    """
    super().__init__()
    self._block_args = block_args
    self.has_se = (block_args.se_ratio is not None) and (0 < block_args.se_ratio <= 1)
    self.id_skip = block_args.id_skip

    # Expansion phase
    # number of input channels
    inp = block_args.in_channels
    # number of output channels
    oup = block_args.in_channels * block_args.expand_ratio
    if block_args.expand_ratio != 1:
        self._expand_conv = Conv2d(
            in_channels=inp, out_channels=oup, kernel_size=1, padding=0, bias=False)
        self._bn0 = get_norm(global_params.norm, out_channels=oup)

    # Depthwise convolution phase
    k = block_args.kernel_size
    s = block_args.stride
    self._depthwise_conv = Conv2d(
        in_channels=oup, out_channels=oup, groups=oup,
        kernel_size=k, stride=s, padding="SAME", bias=False)
    self._bn1 = get_norm(global_params.norm, out_channels=oup)

    # Squeeze and Excitation layer, if desired
    if self.has_se:
        num_squeezed_channels = max(1, int(block_args.in_channels * block_args.se_ratio))
        self._se_reduce = Conv2d(
            in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1, padding=0)
        self._se_expand = Conv2d(
            in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1, padding=0)

    # Output phase
    final_oup = block_args.out_channels
    self._project_conv = Conv2d(
        in_channels=oup, out_channels=final_oup, kernel_size=1, padding=0, bias=False)
    self._bn2 = get_norm(global_params.norm, final_oup)
    self._swish = MemoryEfficientSwish()

def __init__(self, in_channels, out_channels, *, stride=1, norm="BN", activation=None, **kwargs):
    """
    The standard block type for ResNet18 and ResNet34.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        stride (int): Stride for the first conv.
        norm (str or callable): A callable that takes the number of channels
            and returns a `nn.Module`, or a pre-defined string
            (one of {"FrozenBN", "BN", "GN"}).
    """
    super().__init__(in_channels, out_channels, stride)

    if in_channels != out_channels:
        self.shortcut = Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=stride,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
    else:
        self.shortcut = None

    self.activation = get_activation(activation)

    self.conv1 = Conv2d(
        in_channels,
        out_channels,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
        norm=get_norm(norm, out_channels),
    )

    self.conv2 = Conv2d(
        out_channels,
        out_channels,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
        norm=get_norm(norm, out_channels),
    )

    for layer in [self.conv1, self.conv2, self.shortcut]:
        if layer is not None:  # shortcut can be None
            weight_init.c2_msra_fill(layer)

def __init__(self, in_channels, channels, kernel_size, stride=(1, 1), padding=(0, 0),
             dilation=(1, 1), groups=1, bias=True, radix=2, reduction_factor=4,
             rectify=False, rectify_avg=False, norm=None, dropblock_prob=0.0, **kwargs):
    super(SplAtConv2d, self).__init__()
    padding = _pair(padding)
    self.rectify = rectify and (padding[0] > 0 or padding[1] > 0)
    self.rectify_avg = rectify_avg
    inter_channels = max(in_channels * radix // reduction_factor, 32)
    self.radix = radix
    self.cardinality = groups
    self.channels = channels
    self.dropblock_prob = dropblock_prob
    if self.rectify:
        self.conv = RFConv2d(in_channels, channels * radix, kernel_size, stride, padding,
                             dilation, groups=groups * radix, bias=bias,
                             average_mode=rectify_avg, **kwargs)
    else:
        self.conv = Conv2d(in_channels, channels * radix, kernel_size, stride, padding,
                           dilation, groups=groups * radix, bias=bias, **kwargs)
    self.use_bn = norm is not None
    self.bn0 = get_norm(norm, channels * radix)
    self.relu = ReLU(inplace=True)
    self.fc1 = Conv2d(channels, inter_channels, 1, groups=self.cardinality)
    self.bn1 = get_norm(norm, inter_channels)
    self.fc2 = Conv2d(inter_channels, channels * radix, 1, groups=self.cardinality)
    if dropblock_prob > 0.0:
        self.dropblock = DropBlock2D(dropblock_prob, 3)

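# Worked example (illustrative only; the values below are assumed, not taken from
# any config): how SplAtConv2d sizes the bottleneck used by the fc1/fc2 attention convs.
in_channels_example, radix_example, reduction_factor_example = 256, 2, 4
inter_channels_example = max(in_channels_example * radix_example // reduction_factor_example, 32)
print(inter_channels_example)  # 128; the max(..., 32) floor protects very narrow inputs
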
def __init__(self, C_in, C_out, kernel_size, stride, padding, norm_layer,
             affine=True, input_size=None):
    super(SepConv, self).__init__()
    self.op = nn.Sequential(
        # depth wise
        Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding,
               groups=C_in, bias=False),
        # point wise
        Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False,
               norm=get_norm(norm_layer, C_in), activation=nn.ReLU()),
        # stack 2 separate depthwise-conv.
        Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1, padding=padding,
               groups=C_in, bias=False),
        Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False,
               norm=get_norm(norm_layer, C_out)))
    self.flops = self.get_flop([kernel_size, kernel_size], stride, C_in, C_out, affine,
                               input_size[0], input_size[1])
    # using Kaiming init
    for m in self.op.modules():
        if isinstance(m, nn.Conv2d):
            weight_init.kaiming_init(m, mode='fan_in')
        elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
            if m.weight is not None:
                nn.init.constant_(m.weight, 1)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

def __init__(self, input_channels, output_channels, stride, expand_ratio, norm,
             activation, use_shortcut=True):
    super(InvertedResBlock, self).__init__()
    self.stride = stride
    assert stride in [1, 2]
    mid_channels = int(round(input_channels * expand_ratio))
    self.use_shortcut = use_shortcut
    if self.use_shortcut:
        assert stride == 1
        assert input_channels == output_channels

    conv_kwargs = {
        "norm": get_norm(norm, mid_channels),
        "activation": get_activation(activation)
    }

    layers = []
    if expand_ratio > 1:
        layers.append(
            Conv2d(input_channels, mid_channels, 1, bias=False,  # Pixel-wise non-linear
                   **deepcopy(conv_kwargs)))
    layers += [
        Conv2d(mid_channels, mid_channels, 3, padding=1, bias=False,  # Depth-wise 3x3
               stride=stride, groups=mid_channels, **deepcopy(conv_kwargs)),
        Conv2d(mid_channels, output_channels, 1, bias=False,  # Pixel-wise linear
               norm=get_norm(norm, output_channels))
    ]
    self.conv = nn.Sequential(*layers)

def __init__(self, C_in, C_out, kernel_size, stride, padding, norm_layer,
             expansion=4, affine=True, input_size=None):
    super(MBConv, self).__init__()
    self.hidden_dim = expansion * C_in
    self.op = nn.Sequential(
        # pw
        Conv2d(C_in, self.hidden_dim, 1, 1, 0, bias=False,
               norm=get_norm(norm_layer, self.hidden_dim), activation=nn.ReLU()),
        # dw
        Conv2d(self.hidden_dim, self.hidden_dim, kernel_size, stride, padding,
               groups=self.hidden_dim, bias=False,
               norm=get_norm(norm_layer, self.hidden_dim), activation=nn.ReLU()),
        # pw-linear without ReLU!
        Conv2d(self.hidden_dim, C_out, 1, 1, 0, bias=False,
               norm=get_norm(norm_layer, C_out)))
    self.flops = self.get_flop([kernel_size, kernel_size], stride, C_in, C_out, affine,
                               input_size[0], input_size[1])
    # using Kaiming init
    for m in self.op.modules():
        if isinstance(m, nn.Conv2d):
            weight_init.kaiming_init(m, mode='fan_in')
        elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
            if m.weight is not None:
                nn.init.constant_(m.weight, 1)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

def __init__(self, cfg, input_shape: ShapeSpec):
    """
    The following attributes are parsed from config:
        num_conv: the number of conv layers
        conv_dim: the dimension of the conv layers
        norm: normalization for the conv layers
    """
    super(MaskRCNNConvUpsampleHead, self).__init__()

    # fmt: off
    num_classes       = cfg.MODEL.ROI_HEADS.NUM_CLASSES
    conv_dims         = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
    self.norm         = cfg.MODEL.ROI_MASK_HEAD.NORM
    num_conv          = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV
    input_channels    = input_shape.channels
    cls_agnostic_mask = cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK
    # fmt: on

    self.conv_norm_relus = []

    for k in range(num_conv):
        conv = Conv2d(
            input_channels if k == 0 else conv_dims,
            conv_dims,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=not self.norm,
            norm=get_norm(self.norm, conv_dims),
            activation=F.relu,
        )
        self.add_module("mask_fcn{}".format(k + 1), conv)
        self.conv_norm_relus.append(conv)

    self.deconv = ConvTranspose2d(
        conv_dims if num_conv > 0 else input_channels,
        conv_dims,
        kernel_size=2,
        stride=2,
        padding=0,
    )

    num_mask_classes = 1 if cls_agnostic_mask else num_classes
    self.predictor = Conv2d(conv_dims, num_mask_classes, kernel_size=1, stride=1, padding=0)

    for layer in self.conv_norm_relus + [self.deconv]:
        weight_init.c2_msra_fill(layer)
    # use normal distribution initialization for mask prediction layer
    nn.init.normal_(self.predictor.weight, std=0.001)
    if self.predictor.bias is not None:
        nn.init.constant_(self.predictor.bias, 0)

def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
    super().__init__()

    # fmt: off
    self.in_features   = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
    feature_strides    = {k: v.stride for k, v in input_shape.items()}
    feature_channels   = {k: v.channels for k, v in input_shape.items()}
    self.ignore_value  = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
    num_classes        = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
    conv_dims          = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
    self.common_stride = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE
    norm               = cfg.MODEL.SEM_SEG_HEAD.NORM
    self.loss_weight   = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT
    # fmt: on

    self.scale_heads = []
    for in_feature in self.in_features:
        head_ops = []
        head_length = max(
            1,
            int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride)))
        for k in range(head_length):
            norm_module = nn.GroupNorm(32, conv_dims) if norm == "GN" else None
            conv = Conv2d(
                feature_channels[in_feature] if k == 0 else conv_dims,
                conv_dims,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=not norm,
                norm=norm_module,
                activation=F.relu,
            )
            weight_init.c2_msra_fill(conv)
            head_ops.append(conv)
            if feature_strides[in_feature] != self.common_stride:
                head_ops.append(
                    nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False))
        self.scale_heads.append(nn.Sequential(*head_ops))
        self.add_module(in_feature, self.scale_heads[-1])
    self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
    weight_init.c2_msra_fill(self.predictor)

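# Worked example (illustrative only; the strides below are the usual FPN values and
# are assumed here): how head_length above decides how many conv (+ 2x upsample)
# steps each scale head needs to reach the common output stride.
import numpy as np

feature_strides_example = {"p2": 4, "p3": 8, "p4": 16, "p5": 32}
common_stride_example = 4
for name, stride in feature_strides_example.items():
    head_length = max(1, int(np.log2(stride) - np.log2(common_stride_example)))
    print(name, head_length)  # p2 -> 1, p3 -> 1, p4 -> 2, p5 -> 3
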
def __init__(self, C_in, C_out, kernel_size, stride, padding, norm_layer,
             affine=True, input_size=None):
    super(BasicResBlock, self).__init__()
    self.op = Conv2d(C_in, C_out, kernel_size, stride=stride, padding=padding, bias=False,
                     norm=get_norm(norm_layer, C_out))
    self.flops = self.get_flop([kernel_size, kernel_size], stride, C_in, C_out, affine,
                               input_size[0], input_size[1])
    # using Kaiming init
    for m in self.op.modules():
        if isinstance(m, nn.Conv2d):
            weight_init.kaiming_init(m, mode='fan_in')
        elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
            if m.weight is not None:
                nn.init.constant_(m.weight, 1)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

def __init__(self, cfg):
    super(Classification, self).__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.network = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.network.stem = nn.Sequential(
        Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False,
               norm=get_norm("BN", 64)),
        nn.ReLU(),
    )
    self.loss_evaluator = nn.CrossEntropyLoss()

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std

    self.to(self.device)

def __init__(self, in_channels=3, out_channels=64, norm="BN", activation=None):
    """
    Args:
        norm (str or callable): a callable that takes the number of channels
            and returns a `nn.Module`, or a pre-defined string
            (one of {"FrozenBN", "BN", "GN"}).
    """
    super().__init__()
    self.conv1 = Conv2d(
        in_channels,
        out_channels,
        kernel_size=7,
        stride=2,
        padding=3,
        bias=False,
        norm=get_norm(norm, out_channels),
    )
    weight_init.c2_msra_fill(self.conv1)
    self.activation = get_activation(activation)
    self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

def __init__(self, cfg):
    super(Classification, self).__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.network = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.network.stem = nn.Sequential(
        Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False,
               norm=get_norm(cfg.MODEL.RESNETS.NORM, 64)),
        nn.ReLU(),
    )
    self.freeze()
    self.network.eval()

    # init the fc layer
    self.network.linear.weight.data.normal_(mean=0.0, std=0.01)
    self.network.linear.bias.data.zero_()

    self.loss_evaluator = nn.CrossEntropyLoss()

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(1, 3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(1, 3, 1, 1)
    self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std

    self.to(self.device)

def __init__(self, input_channels, output_channels, stride, expand_ratio, norm,
             activation, use_shortcut=True):
    """
    Args:
        input_channels (int): the input channel number.
        output_channels (int): the output channel number.
        stride (int): the stride of the current block.
        expand_ratio (int): the channel expansion ratio for `mid_channels`
            in InvertedResBlock.
        norm (str or callable): a callable that takes the number of channels
            and returns a `nn.Module`, or a pre-defined string
            (see cvpods.layers.get_norm for more details).
        activation (str): a pre-defined string
            (see cvpods.layers.get_activation for more details).
        use_shortcut (bool): whether to use the residual path.
    """
    super(InvertedResBlock, self).__init__()
    self.stride = stride
    assert stride in [1, 2]
    mid_channels = int(round(input_channels * expand_ratio))
    self.use_shortcut = use_shortcut
    if self.use_shortcut:
        assert stride == 1
        assert input_channels == output_channels

    conv_kwargs = {
        "norm": get_norm(norm, mid_channels),
        "activation": get_activation(activation)
    }

    layers = []
    if expand_ratio > 1:
        layers.append(
            Conv2d(input_channels, mid_channels, 1, bias=False,  # Pixel-wise non-linear
                   **deepcopy(conv_kwargs))
        )
    layers += [
        Conv2d(mid_channels, mid_channels, 3, padding=1, bias=False,  # Depth-wise 3x3
               stride=stride, groups=mid_channels, **deepcopy(conv_kwargs)),
        Conv2d(mid_channels, output_channels, 1, bias=False,  # Pixel-wise linear
               norm=get_norm(norm, output_channels))
    ]
    self.conv = nn.Sequential(*layers)

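# Worked example (illustrative only; the values are assumed): the hidden-width
# expansion used inside the block above, and the condition under which the
# residual shortcut is legal.
input_channels_example, expand_ratio_example = 24, 6
mid_channels_example = int(round(input_channels_example * expand_ratio_example))
print(mid_channels_example)  # 144 hidden channels feed the depth-wise 3x3 conv
# use_shortcut additionally requires stride == 1 and input_channels == output_channels,
# as asserted in __init__.
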
def __init__(self, cfg):
    super(SimSiam, self).__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.proj_dim = cfg.MODEL.BYOL.PROJ_DIM
    self.pred_dim = cfg.MODEL.BYOL.PRED_DIM
    self.out_dim = cfg.MODEL.BYOL.OUT_DIM

    self.total_steps = cfg.SOLVER.LR_SCHEDULER.MAX_ITER * cfg.SOLVER.BATCH_SUBDIVISIONS

    # create the encoders
    # num_classes is the output fc dimension
    cfg.MODEL.RESNETS.NUM_CLASSES = self.out_dim

    self.encoder = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.encoder.stem = nn.Sequential(
        Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False,
               norm=get_norm(cfg.MODEL.RESNETS.NORM, 64)),
        nn.ReLU(),
    )
    self.size_divisibility = self.encoder.size_divisibility

    dim_mlp = self.encoder.linear.weight.shape[1]

    # Projection Head
    self.encoder.linear = nn.Sequential(
        nn.Linear(dim_mlp, self.proj_dim),
        nn.SyncBatchNorm(self.proj_dim),
        nn.ReLU(),
        nn.Linear(self.proj_dim, self.proj_dim),
        nn.SyncBatchNorm(self.proj_dim),
    )

    # Predictor
    self.predictor = nn.Sequential(
        nn.Linear(self.proj_dim, self.pred_dim),
        nn.SyncBatchNorm(self.pred_dim),
        nn.ReLU(),
        nn.Linear(self.pred_dim, self.out_dim),
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(1, 3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(1, 3, 1, 1)
    self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std

    self.to(self.device)

def __init__(
    self,
    in_channels=3,
    out_channels=64,
    norm="BN",
    activation=None,
    deep_stem=False,
    stem_width=32,
):
    super().__init__()
    self.conv1_1 = Conv2d(
        3,
        stem_width,
        kernel_size=3,
        stride=2,
        padding=1,
        bias=False,
        norm=get_norm(norm, stem_width),
    )
    self.conv1_2 = Conv2d(
        stem_width,
        stem_width,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
        norm=get_norm(norm, stem_width),
    )
    self.conv1_3 = Conv2d(
        stem_width,
        stem_width * 2,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
        norm=get_norm(norm, stem_width * 2),
    )
    for layer in [self.conv1_1, self.conv1_2, self.conv1_3]:
        if layer is not None:
            weight_init.c2_msra_fill(layer)
    self.activation = get_activation(activation)
    self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

def __init__(self, in_channels, out_channels, stride=1, norm="BN", activation=None):
    super().__init__()

    if in_channels != out_channels:
        self.shortcut = Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=stride,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
    else:
        self.shortcut = None

    self.activation = get_activation(activation)

    self.conv1 = Conv2d(
        in_channels,
        out_channels,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
        norm=get_norm(norm, out_channels),
    )

    self.conv2 = Conv2d(
        out_channels,
        out_channels,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
        norm=get_norm(norm, out_channels),
    )

def __init__(self, input_channels, output_channels, norm, activation):
    super().__init__()
    self.input_channels = input_channels
    self.output_channels = output_channels
    self.stride = 2
    self.conv = Conv2d(input_channels, output_channels, 3, stride=2, padding=1, bias=False,
                       norm=get_norm(norm, output_channels),
                       activation=get_activation(activation))

def __init__(self, cfg, input_shape: ShapeSpec):
    """
    The following attributes are parsed from config:
        num_conv, num_fc: the number of conv/fc layers
        conv_dim/fc_dim: the dimension of the conv/fc layers
        norm: normalization for the conv layers
    """
    super().__init__()

    # fmt: off
    num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
    conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
    num_fc   = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
    fc_dim   = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
    norm     = cfg.MODEL.ROI_BOX_HEAD.NORM
    # fmt: on
    assert num_conv + num_fc > 0

    self._output_size = (input_shape.channels, input_shape.height, input_shape.width)

    self.conv_norm_relus = []
    for k in range(num_conv):
        conv = Conv2d(
            self._output_size[0],
            conv_dim,
            kernel_size=3,
            padding=1,
            bias=not norm,
            norm=get_norm(norm, conv_dim),
            activation=F.relu,
        )
        self.add_module("conv{}".format(k + 1), conv)
        self.conv_norm_relus.append(conv)
        self._output_size = (conv_dim, self._output_size[1], self._output_size[2])

    self.fcs = []
    for k in range(num_fc):
        fc = nn.Linear(np.prod(self._output_size), fc_dim)
        self.add_module("fc{}".format(k + 1), fc)
        self.fcs.append(fc)
        self._output_size = fc_dim

    for layer in self.conv_norm_relus:
        weight_init.c2_msra_fill(layer)
    for layer in self.fcs:
        weight_init.c2_xavier_fill(layer)

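# Worked example (illustrative only; 256x7x7 is a typical ROI pooler output and is
# assumed here): how the flattened input size of the first FC layer follows from the
# shape tracked in self._output_size after the conv stack.
import numpy as np

conv_dim_example, pooled_h_example, pooled_w_example = 256, 7, 7
fc_in_features = int(np.prod((conv_dim_example, pooled_h_example, pooled_w_example)))
print(fc_in_features)  # 12544
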
def __init__(self, cfg, input_shape: ShapeSpec):
    """
    The following attributes are parsed from config:
        conv_dims: an iterable of output channel counts for each conv in the head,
            e.g. (512, 512, 512) for three convs outputting 512 channels.
        num_keypoints: number of keypoint heatmaps to predict; determines the number
            of channels in the final output.
    """
    super(KRCNNConvDeconvUpsampleHead, self).__init__()

    # fmt: off
    # default up_scale to 2 (this can eventually be moved to config)
    up_scale      = 2
    conv_dims     = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS
    num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS
    in_channels   = input_shape.channels
    # fmt: on

    self.blocks = []
    for idx, layer_channels in enumerate(conv_dims, 1):
        module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1)
        self.add_module("conv_fcn{}".format(idx), module)
        self.blocks.append(module)
        in_channels = layer_channels

    deconv_kernel = 4
    self.score_lowres = ConvTranspose2d(in_channels, num_keypoints, deconv_kernel,
                                        stride=2, padding=deconv_kernel // 2 - 1)
    self.up_scale = up_scale

    for name, param in self.named_parameters():
        if "bias" in name:
            nn.init.constant_(param, 0)
        elif "weight" in name:
            # Caffe2 implementation uses MSRAFill, which in fact
            # corresponds to kaiming_normal_ in PyTorch
            nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")

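# Illustrative check of the deconv settings above: kernel 4, stride 2 and
# padding = 4 // 2 - 1 = 1 give an exact 2x spatial upsampling (the channel count
# 17 below is an arbitrary stand-in for num_keypoints).
import torch
import torch.nn as nn

deconv_example = nn.ConvTranspose2d(17, 17, kernel_size=4, stride=2, padding=1)
x_example = torch.randn(1, 17, 14, 14)
print(deconv_example(x_example).shape)  # torch.Size([1, 17, 28, 28])
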
def __init__(self, in_channels, out_channels, norm="BN"):
    """
    Args:
        in_channels (int): the number of input tensor channels.
        out_channels (int): the number of output tensor channels.
        norm (str): the normalization to use.
    """
    super().__init__()
    self.num_levels = 2
    self.in_feature = "stage8"
    self.p6_conv = Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0,
                          norm=get_norm(norm, out_channels), activation=None)
    self.down_sampling = MaxPool2d(kernel_size=3, stride=2, padding="SAME")

def __init__(self, input_channels, output_channels, norm, activation):
    """
    Args:
        input_channels (int): the input channel number.
        output_channels (int): the output channel number.
        norm (str or callable): a callable that takes the number of channels
            and returns a `nn.Module`, or a pre-defined string
            (one of {"FrozenBN", "BN", "GN"}).
        activation (str): a pre-defined string
            (see cvpods.layers.get_activation for more details).
    """
    super().__init__()
    self.input_channels = input_channels
    self.output_channels = output_channels
    self.stride = 2
    self.conv = Conv2d(input_channels, output_channels, 3, stride=2, padding=1, bias=False,
                       norm=get_norm(norm, output_channels),
                       activation=get_activation(activation))

def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
    super().__init__()

    self.in_features = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
    feature_strides = {k: v.stride for k, v in input_shape.items()}
    feature_channels = {k: v.channels for k, v in input_shape.items()}
    self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
    num_classes = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
    self.loss_weight = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT

    upsampling_strides = []
    feature_strides_list = list(feature_strides.values())
    upsampling_strides.append(feature_strides_list[0])
    feature_strides_list = feature_strides_list[::-1]
    for s1, s2 in zip(feature_strides_list[:], feature_strides_list[1:]):
        upsampling_strides.append(s1 // s2)
    assert len(upsampling_strides) == len(self.in_features)

    score_convs = []
    upsampling_convs = []
    for idx, in_feature in enumerate(self.in_features):
        ch = feature_channels[in_feature]
        score_convs.append(Conv2d(ch, num_classes, kernel_size=1))
        stride = upsampling_strides[idx]
        upsampling_convs.append(
            ConvTranspose2d(
                num_classes,
                num_classes,
                kernel_size=stride * 2,
                stride=stride,
                padding=1,
                bias=False,
            ))
    self.score_convs = nn.ModuleList(score_convs)
    self.upsampling_convs = nn.ModuleList(upsampling_convs)

    self._initialize_weights()

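# Worked example (illustrative only; the strides below are assumed): how the
# per-level upsampling strides above are derived from the input feature strides.
feature_strides_list_example = [8, 16, 32]            # ordered high to low resolution
upsampling_strides_example = [feature_strides_list_example[0]]
reversed_strides_example = feature_strides_list_example[::-1]
for s1, s2 in zip(reversed_strides_example[:], reversed_strides_example[1:]):
    upsampling_strides_example.append(s1 // s2)       # 32 // 16 -> 2, 16 // 8 -> 2
print(upsampling_strides_example)                     # [8, 2, 2]
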
def __init__(self, C_in, C_out, norm_layer, affine=True, input_size=None):
    super(Identity, self).__init__()
    if C_in == C_out:
        self.change = False
        self.flops = 0.0
    else:
        self.change = True
        self.op = Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False,
                         norm=get_norm(norm_layer, C_out))
        self.flops = self.get_flop([1, 1], 1, C_in, C_out, affine,
                                   input_size[0], input_size[1])
        # using Kaiming init; only applies when the projection conv is created
        for m in self.op.modules():
            if isinstance(m, nn.Conv2d):
                weight_init.kaiming_init(m, mode='fan_in')
            elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
    super().__init__()

    # fmt: off
    self.in_features   = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
    feature_strides    = {k: v.stride for k, v in input_shape.items()}  # noqa:F841
    feature_channels   = {k: v.channels for k, v in input_shape.items()}
    feature_resolution = {
        k: np.array([v.height, v.width]) for k, v in input_shape.items()
    }
    self.ignore_value  = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
    num_classes        = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
    norm               = cfg.MODEL.SEM_SEG_HEAD.NORM
    self.loss_weight   = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT
    self.cal_flops     = cfg.MODEL.CAL_FLOPS
    self.real_flops    = 0.0
    # fmt: on

    self.layer_decoder_list = nn.ModuleList()
    # set affine in BatchNorm
    if 'Sync' in norm:
        affine = True
    else:
        affine = False
    # use simple decoder
    for _feat in self.in_features:
        res_size = feature_resolution[_feat]
        in_channel = feature_channels[_feat]
        if _feat == 'layer_0':
            out_channel = in_channel
        else:
            out_channel = in_channel // 2
        conv_1x1 = Conv2d(in_channel, out_channel, kernel_size=1, stride=1, padding=0,
                          bias=False, norm=get_norm(norm, out_channel),
                          activation=nn.ReLU())
        self.real_flops += cal_op_flops.count_ConvBNReLU_flop(
            res_size[0], res_size[1], in_channel, out_channel, [1, 1], is_affine=affine)
        self.layer_decoder_list.append(conv_1x1)
    # using Kaiming init
    for layer in self.layer_decoder_list:
        for m in layer.modules():
            if isinstance(m, nn.Conv2d):
                weight_init.kaiming_init(m, mode='fan_in')
            elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    in_channel = feature_channels['layer_0']
    # the output layer
    self.predictor = Conv2d(in_channels=in_channel, out_channels=num_classes,
                            kernel_size=3, stride=1, padding=1)
    self.real_flops += cal_op_flops.count_Conv_flop(
        feature_resolution['layer_0'][0], feature_resolution['layer_0'][1],
        in_channel, num_classes, [3, 3])
    # using Kaiming init
    for m in self.predictor.modules():
        if isinstance(m, nn.Conv2d):
            weight_init.kaiming_init(m, mode='fan_in')
        elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
            if m.weight is not None:
                nn.init.constant_(m.weight, 1)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

def __init__(
    self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"
):
    """
    Args:
        bottom_up (Backbone): module representing the bottom up subnetwork.
            Must be a subclass of :class:`Backbone`. The multi-scale feature maps
            generated by the bottom up network, and listed in `in_features`,
            are used to generate FPN levels.
        in_features (list[str]): names of the input feature maps coming
            from the backbone to which FPN is attached. For example, if the
            backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
            of these may be used; order must be from high to low resolution.
        out_channels (int): number of channels in the output feature maps.
        norm (str): the normalization to use.
        top_block (nn.Module or None): if provided, an extra operation will
            be performed on the output of the last (smallest resolution)
            FPN output, and the result will extend the result list. The top_block
            further downsamples the feature map. It must have an attribute
            "num_levels", meaning the number of extra FPN levels added by
            this block, and "in_feature", which is a string representing
            its input feature (e.g., p5).
        fuse_type (str): types for fusing the top down features and the lateral
            ones. It can be "sum" (default), which sums up element-wise; or "avg",
            which takes the element-wise mean of the two.
    """
    super(FPN, self).__init__()
    assert isinstance(bottom_up, Backbone)

    # Feature map strides and channels from the bottom up network (e.g. ResNet)
    input_shapes = bottom_up.output_shape()
    in_strides = [input_shapes[f].stride for f in in_features]
    in_channels = [input_shapes[f].channels for f in in_features]

    _assert_strides_are_log2_contiguous(in_strides)
    lateral_convs = []
    output_convs = []

    use_bias = norm == ""
    for idx, in_channels in enumerate(in_channels):
        lateral_norm = get_norm(norm, out_channels)
        output_norm = get_norm(norm, out_channels)

        lateral_conv = Conv2d(
            in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
        )
        output_conv = Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias,
            norm=output_norm,
        )
        weight_init.c2_xavier_fill(lateral_conv)
        weight_init.c2_xavier_fill(output_conv)
        stage = int(math.log2(in_strides[idx]))
        self.add_module("fpn_lateral{}".format(stage), lateral_conv)
        self.add_module("fpn_output{}".format(stage), output_conv)

        lateral_convs.append(lateral_conv)
        output_convs.append(output_conv)
    # Place convs into top-down order (from low to high resolution)
    # to make the top-down computation in forward clearer.
    self.lateral_convs = lateral_convs[::-1]
    self.output_convs = output_convs[::-1]
    self.top_block = top_block
    self.in_features = in_features
    self.bottom_up = bottom_up
    # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
    self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in in_strides}
    # top block output feature maps.
    if self.top_block is not None:
        for s in range(stage, stage + self.top_block.num_levels):
            self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)

    self._out_features = list(self._out_feature_strides.keys())
    self._out_feature_channels = {k: out_channels for k in self._out_features}
    self._size_divisibility = in_strides[-1]
    assert fuse_type in {"avg", "sum"}
    self._fuse_type = fuse_type

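# Worked example (illustrative only; the strides below are the usual ResNet res2-res5
# values and are assumed): how FPN derives its "p<stage>" output names and strides,
# including the extra level contributed by a one-level top block.
import math

in_strides_example = [4, 8, 16, 32]
out_strides_example = {"p{}".format(int(math.log2(s))): s for s in in_strides_example}
last_stage_example = int(math.log2(in_strides_example[-1]))     # 5
for s in range(last_stage_example, last_stage_example + 1):     # top block with num_levels == 1
    out_strides_example["p{}".format(s + 1)] = 2 ** (s + 1)
print(out_strides_example)  # {'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
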
def __init__(self, in_channels, channels, num_classes=None, dropout=False,
             out_features=None, norm="BN"):
    """
    See: https://arxiv.org/pdf/1903.11752.pdf

    Args:
        in_channels (int): the input channel number.
        channels (list[int]): output channel numbers for the stem and every stage.
        num_classes (None or int): if None, will not perform classification.
        dropout (bool): whether to use dropout.
        out_features (list[str]): name of the layers whose outputs should be
            returned in forward. Can be anything in "stem", "linear", or
            "snet3" ... If None, will return the output of the last layer.
        norm (str or callable): a callable that takes the number of channels
            and returns a `nn.Module`, or a pre-defined string
            (see cvpods.layers.get_norm for more details).
    """
    super(ShuffleNetV2, self).__init__()

    self.stage_out_channels = channels
    self.num_classes = num_classes

    # ---------------- Stem ---------------------- #
    input_channels = self.stage_out_channels[0]
    self.stem = nn.Sequential(*[
        Conv2d(
            in_channels,
            input_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False,
            norm=get_norm(norm, input_channels),
            activation=nn.ReLU(inplace=True),
        ),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    ])
    # TODO: use a stem class and property stride
    current_stride = 4
    self._out_feature_strides = {"stem": current_stride}
    self._out_feature_channels = {"stem": input_channels}

    # ---------------- Stages --------------------- #
    self.stage_num_blocks = [4, 8, 4]
    self.stages_and_names = []

    for i in range(len(self.stage_num_blocks)):
        num_blocks = self.stage_num_blocks[i]
        output_channels = self.stage_out_channels[i + 1]
        name = "snet" + str(i + 3)
        block_list = make_stage(num_blocks, input_channels, output_channels, norm)
        current_stride = current_stride * np.prod(
            [block.stride for block in block_list])
        stages = nn.Sequential(*block_list)

        self._out_feature_strides[name] = current_stride
        self._out_feature_channels[name] = output_channels

        self.add_module(name, stages)
        self.stages_and_names.append((stages, name))
        input_channels = output_channels

    if len(self.stage_out_channels) == len(self.stage_num_blocks) + 2:
        name = "snet" + str(len(self.stage_num_blocks) + 2) + "-last"
        last_output_channels = self.stage_out_channels[-1]
        last_conv = Conv2d(output_channels,
                           last_output_channels,
                           kernel_size=1,
                           bias=False,
                           norm=get_norm(norm, last_output_channels),
                           activation=nn.ReLU(inplace=True))

        self._out_feature_strides[name] = current_stride
        self._out_feature_channels[name] = last_output_channels

        self.add_module(name, last_conv)
        self.stages_and_names.append((last_conv, name))

    # ---------------- Classifier ------------------- #
    if num_classes is not None:
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = dropout
        if dropout:
            self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.stage_out_channels[-1], num_classes, bias=False)
        name = "linear"

    self._out_features = [name] if out_features is None else out_features

    self._initialize_weights()

def __init__(self, in_channels=3, out_channels=64, norm="BN", activation=None,
             deep_stem=False, stem_width=32):
    """
    Args:
        norm (str or callable): a callable that takes the number of channels
            and returns a `nn.Module`, or a pre-defined string
            (one of {"FrozenBN", "BN", "GN"}).
    """
    super().__init__()
    self.deep_stem = deep_stem

    if self.deep_stem:
        self.conv1_1 = Conv2d(
            3,
            stem_width,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False,
            norm=get_norm(norm, stem_width),
        )
        self.conv1_2 = Conv2d(
            stem_width,
            stem_width,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            norm=get_norm(norm, stem_width),
        )
        self.conv1_3 = Conv2d(
            stem_width,
            stem_width * 2,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            norm=get_norm(norm, stem_width * 2),
        )
        for layer in [self.conv1_1, self.conv1_2, self.conv1_3]:
            if layer is not None:
                weight_init.c2_msra_fill(layer)
    else:
        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
        weight_init.c2_msra_fill(self.conv1)
    self.activation = get_activation(activation)
    self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

def __init__(
    self,
    in_channels,
    out_channels,
    *,
    bottleneck_channels,
    stride=1,
    num_groups=1,
    norm="BN",
    activation=None,
    stride_in_1x1=False,
    dilation=1,
    deform_modulated=False,
    deform_num_groups=1,
):
    """
    Similar to :class:`BottleneckBlock`, but with deformable conv in the 3x3 convolution.
    """
    super().__init__(in_channels, out_channels, stride)
    self.deform_modulated = deform_modulated

    if in_channels != out_channels:
        self.shortcut = Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=stride,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
    else:
        self.shortcut = None

    stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

    self.activation = get_activation(activation)

    self.conv1 = Conv2d(
        in_channels,
        bottleneck_channels,
        kernel_size=1,
        stride=stride_1x1,
        bias=False,
        norm=get_norm(norm, bottleneck_channels),
    )

    if deform_modulated:
        deform_conv_op = ModulatedDeformConv
        # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
        offset_channels = 27
    else:
        deform_conv_op = DeformConv
        offset_channels = 18

    self.conv2_offset = Conv2d(
        bottleneck_channels,
        offset_channels * deform_num_groups,
        kernel_size=3,
        stride=stride_3x3,
        padding=1 * dilation,
        dilation=dilation,
    )
    self.conv2 = deform_conv_op(
        bottleneck_channels,
        bottleneck_channels,
        kernel_size=3,
        stride=stride_3x3,
        padding=1 * dilation,
        bias=False,
        groups=num_groups,
        dilation=dilation,
        deformable_groups=deform_num_groups,
        norm=get_norm(norm, bottleneck_channels),
    )

    self.conv3 = Conv2d(
        bottleneck_channels,
        out_channels,
        kernel_size=1,
        bias=False,
        norm=get_norm(norm, out_channels),
    )

    for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
        if layer is not None:  # shortcut can be None
            weight_init.c2_msra_fill(layer)

    nn.init.constant_(self.conv2_offset.weight, 0)
    nn.init.constant_(self.conv2_offset.bias, 0)

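# Worked example (illustrative only): why conv2_offset above predicts 18 or 27
# channels per deformable group for a 3x3 kernel.
kernel_size_example = 3
plain_offset_channels = 2 * kernel_size_example ** 2      # (dx, dy) per sample point -> 18
modulated_offset_channels = 3 * kernel_size_example ** 2  # (dx, dy, mask) per point -> 27
print(plain_offset_channels, modulated_offset_channels)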