def __init__(self, w_in, w_out, stride, norm, activation_class, _params): super().__init__() self.a = conv2d(w_in, w_out, 3, stride=stride) self.a_bn = get_norm(norm, w_out) self.a_af = activation_class() self.b = conv2d(w_out, w_out, 3) self.b_bn = get_norm(norm, w_out) self.b_bn.final_bn = True
def __init__(self, w_in, w_out, stride, norm, activation_class, params): super().__init__() w_b = int(round(w_out * params["bot_mul"])) w_se = int(round(w_in * params["se_r"])) groups = w_b // params["group_w"] self.a = conv2d(w_in, w_b, 1) self.a_bn = get_norm(norm, w_b) self.a_af = activation_class() self.b = conv2d(w_b, w_b, 3, stride=stride, groups=groups) self.b_bn = get_norm(norm, w_b) self.b_af = activation_class() self.se = SE(w_b, w_se, activation_class) if w_se else None self.c = conv2d(w_b, w_out, 1) self.c_bn = get_norm(norm, w_out) self.c_bn.final_bn = True
def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs): """ NOTE: this interface is experimental. Args: input_shape (ShapeSpec): shape of the input feature num_classes (int): the number of foreground classes (i.e. background is not included). 1 if using class agnostic prediction. conv_dims (list[int]): a list of N>0 integers representing the output dimensions of N-1 conv layers and the last upsample layer. conv_norm (str or callable): normalization for the conv layers. See :func:`detectron2.layers.get_norm` for supported types. """ super().__init__(**kwargs) assert len(conv_dims) >= 1, "conv_dims have to be non-empty!" self.conv_norm_relus = [] cur_channels = input_shape.channels for k, conv_dim in enumerate(conv_dims[:-1]): conv = Conv2d( cur_channels, conv_dim, kernel_size=3, stride=1, padding=1, bias=not conv_norm, norm=get_norm(conv_norm, conv_dim), activation=nn.ReLU(), ) self.add_module("mask_fcn{}".format(k + 1), conv) self.conv_norm_relus.append(conv) cur_channels = conv_dim self.deconv = ConvTranspose2d(cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0) self.add_module("deconv_relu", nn.ReLU()) cur_channels = conv_dims[-1] self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0) for layer in self.conv_norm_relus + [self.deconv]: weight_init.c2_msra_fill(layer) # use normal distribution initialization for mask prediction layer nn.init.normal_(self.predictor.weight, std=0.001) if self.predictor.bias is not None: nn.init.constant_(self.predictor.bias, 0)
def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. stride (int): Stride for the first conv. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. """ super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None self.conv1 = Conv2d( in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False, norm=get_norm(norm, out_channels), ) self.conv2 = Conv2d( out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer)
def __init__(self, w_in, w_out, stride, norm, activation_class, params): super().__init__(w_in, w_out, stride) self.proj, self.bn = None, None if (w_in != w_out) or (stride != 1): self.proj = conv2d(w_in, w_out, 1, stride=stride) self.bn = get_norm(norm, w_out) self.f = BottleneckTransform(w_in, w_out, stride, norm, activation_class, params) self.af = activation_class()
def __init__( self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm="" ): """ NOTE: this interface is experimental. Args: input_shape (ShapeSpec): shape of the input feature. conv_dims (list[int]): the output dimensions of the conv layers fc_dims (list[int]): the output dimensions of the fc layers conv_norm (str or callable): normalization for the conv layers. See :func:`detectron2.layers.get_norm` for supported types. """ super().__init__() assert len(conv_dims) + len(fc_dims) > 0 self._output_size = (input_shape.channels, input_shape.height, input_shape.width) self.conv_norm_relus = [] for k, conv_dim in enumerate(conv_dims): conv = Conv2d( self._output_size[0], conv_dim, kernel_size=3, padding=1, bias=not conv_norm, norm=get_norm(conv_norm, conv_dim), activation=nn.ReLU(), ) self.add_module("conv{}".format(k + 1), conv) self.conv_norm_relus.append(conv) self._output_size = (conv_dim, self._output_size[1], self._output_size[2]) self.fcs = [] for k, fc_dim in enumerate(fc_dims): if k == 0: self.add_module("flatten", nn.Flatten()) fc = nn.Linear(int(np.prod(self._output_size)), fc_dim) self.add_module("fc{}".format(k + 1), fc) self.add_module("fc_relu{}".format(k + 1), nn.ReLU()) self.fcs.append(fc) self._output_size = fc_dim for layer in self.conv_norm_relus: weight_init.c2_msra_fill(layer) for layer in self.fcs: weight_init.c2_xavier_fill(layer)
def __init__(self, in_channels=3, out_channels=64, norm="BN"): """ Args: norm (str or callable): norm after the first conv layer. See :func:`layers.get_norm` for supported format. """ super().__init__(in_channels, out_channels, 4) self.in_channels = in_channels self.conv1 = Conv2d( in_channels, out_channels, kernel_size=7, stride=2, padding=3, bias=False, norm=get_norm(norm, out_channels), ) weight_init.c2_msra_fill(self.conv1)
def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, deform_modulated=False, deform_num_groups=1, ): super().__init__(in_channels, out_channels, stride) self.deform_modulated = deform_modulated if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) if deform_modulated: deform_conv_op = ModulatedDeformConv # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size offset_channels = 27 else: deform_conv_op = DeformConv offset_channels = 18 self.conv2_offset = Conv2d( bottleneck_channels, offset_channels * deform_num_groups, kernel_size=3, stride=stride_3x3, padding=1 * dilation, dilation=dilation, ) self.conv2 = deform_conv_op( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, deformable_groups=deform_num_groups, norm=get_norm(norm, bottleneck_channels), ) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) nn.init.constant_(self.conv2_offset.weight, 0) nn.init.constant_(self.conv2_offset.bias, 0)
def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, ): """ Args: bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. num_groups (int): number of groups for the 3x3 conv layer. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. stride_in_1x1 (bool): when stride>1, whether to put stride in the first 1x1 convolution or the bottleneck 3x3 convolution. dilation (int): the dilation rate of the 3x3 conv layer. """ super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None # The original MSRA ResNet models have stride in the first 1x1 conv # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have # stride in the 3x3 conv stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, norm=get_norm(norm, bottleneck_channels), ) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer)
def __init__( self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum" ): """ Args: bottom_up (Backbone): module representing the bottom up subnetwork. Must be a subclass of :class:`Backbone`. The multi-scale feature maps generated by the bottom up network, and listed in `in_features`, are used to generate FPN levels. in_features (list[str]): names of the input feature maps coming from the backbone to which FPN is attached. For example, if the backbone produces ["res2", "res3", "res4"], any *contiguous* sublist of these may be used; order must be from high to low resolution. out_channels (int): number of channels in the output feature maps. norm (str): the normalization to use. top_block (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) FPN output, and the result will extend the result list. The top_block further downsamples the feature map. It must have an attribute "num_levels", meaning the number of extra FPN levels added by this block, and "in_feature", which is a string representing its input feature (e.g., p5). fuse_type (str): types for fusing the top down features and the lateral ones. It can be "sum" (default), which sums up element-wise; or "avg", which takes the element-wise mean of the two. """ super(FPN, self).__init__() assert isinstance(bottom_up, Backbone) assert in_features, in_features # Feature map strides and channels from the bottom up network (e.g. ResNet) input_shapes = bottom_up.output_shape() strides = [input_shapes[f].stride for f in in_features] in_channels_per_feature = [input_shapes[f].channels for f in in_features] _assert_strides_are_log2_contiguous(strides) lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(in_channels_per_feature): lateral_norm = get_norm(norm, out_channels) output_norm = get_norm(norm, out_channels) lateral_conv = Conv2d( in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) stage = int(math.log2(strides[idx])) self.add_module("fpn_lateral{}".format(stage), lateral_conv) self.add_module("fpn_output{}".format(stage), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] self.top_block = top_block self.in_features = tuple(in_features) self.bottom_up = bottom_up # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"] self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} # top block output feature maps. if self.top_block is not None: for s in range(stage, stage + self.top_block.num_levels): self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) self._out_features = list(self._out_feature_strides.keys()) self._out_feature_channels = {k: out_channels for k in self._out_features} self._size_divisibility = strides[-1] assert fuse_type in {"avg", "sum"} self._fuse_type = fuse_type
def __init__(self, w_in, w_out, norm, activation_class): super().__init__(w_in, w_out, 2) self.conv = conv2d(w_in, w_out, 3, stride=2) self.bn = get_norm(norm, w_out) self.af = activation_class()
def __init__( self, input_shape: Dict[str, ShapeSpec], *, num_classes: int, conv_dims: int, common_stride: int, loss_weight: float = 1.0, norm: Optional[Union[str, Callable]] = None, ignore_value: int = -1, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features num_classes: number of classes to predict conv_dims: number of output channels for the intermediate conv layers. common_stride: the common stride that all features will be upscaled to loss_weight: loss weight norm (str or callable): normalization for all conv layers ignore_value: category id to be ignored during training. """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] self.ignore_value = ignore_value self.common_stride = common_stride self.loss_weight = loss_weight self.scale_heads = [] for in_feature, stride, channels in zip(self.in_features, feature_strides, feature_channels): head_ops = [] head_length = max( 1, int(np.log2(stride) - np.log2(self.common_stride))) for k in range(head_length): norm_module = get_norm(norm, conv_dims) conv = Conv2d( channels if k == 0 else conv_dims, conv_dims, kernel_size=3, stride=1, padding=1, bias=not norm, norm=norm_module, activation=F.relu, ) weight_init.c2_msra_fill(conv) head_ops.append(conv) if stride != self.common_stride: head_ops.append( nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)) self.scale_heads.append(nn.Sequential(*head_ops)) self.add_module(in_feature, self.scale_heads[-1]) self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0) weight_init.c2_msra_fill(self.predictor)