# Imports assumed by the extractors below (maskrcnn-benchmark-style layout;
# the exact repo-local module paths are an assumption, not shown in this
# snippet): `resnet`, `Pooler`, `make_fc`, `group_norm`, and `Conv2d` come
# from the repo's own modeling/layers utilities.
import torch
from torch import nn
from torch.nn import functional as F


# ResNetConv52MLPFeatureExtractor.__init__
def __init__(self, cfg, in_channels):
    super(ResNetConv52MLPFeatureExtractor, self).__init__()

    # res5 head run on top of the pooled RoI features; stride_init=1 keeps
    # the pooled spatial resolution.
    stage = resnet.StageSpec(index=4, block_count=3, return_features=False)
    head = resnet.ResNetHead(
        block_module=cfg.MODEL.RESNETS.TRANS_FUNC,
        stages=(stage,),
        num_groups=cfg.MODEL.RESNETS.NUM_GROUPS,
        width_per_group=cfg.MODEL.RESNETS.WIDTH_PER_GROUP,
        stride_in_1x1=cfg.MODEL.RESNETS.STRIDE_IN_1X1,
        stride_init=1,
        res2_out_channels=cfg.MODEL.RESNETS.RES2_OUT_CHANNELS,
        dilation=cfg.MODEL.RESNETS.RES5_DILATION,
    )
    in_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 2 ** (stage.index - 1)

    # Optional 1x1 conv that reduces the head's output to 256 channels.
    if cfg.MODEL.VID.ROI_BOX_HEAD.REDUCE_CHANNEL:
        new_conv = nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)
        nn.init.kaiming_uniform_(new_conv.weight, a=1)
        nn.init.constant_(new_conv.bias, 0)
        output_channel = 256
    else:
        new_conv = None
        output_channel = in_channels

    resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    pooler = Pooler(
        output_size=(resolution, resolution),
        scales=scales,
        sampling_ratio=sampling_ratio,
    )

    self.head = head
    self.conv = new_conv
    self.pooler = pooler

    # Two-layer MLP on the flattened RoI features.
    input_size = output_channel * resolution ** 2
    representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
    use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN
    self.fc6 = make_fc(input_size, representation_size, use_gn)
    self.fc7 = make_fc(representation_size, representation_size, use_gn)
    self.out_channels = representation_size
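# A minimal sketch of the forward pass the modules above imply
# (pool -> res5 head -> optional channel reduction -> 2-layer MLP). This is
# inferred from the registered attributes, not the repo's verbatim code:
def forward(self, x, proposals):
    x = self.pooler(x, proposals)      # RoI-pool proposals from the feature maps
    x = self.head(x)                   # run the res5 stage on each RoI
    if self.conv is not None:
        x = self.conv(x)               # 1x1 channel reduction, if enabled
    x = x.view(x.size(0), -1)          # flatten to [num_rois, input_size]
    x = F.relu(self.fc6(x))
    x = F.relu(self.fc7(x))
    return x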
# FPN2MLPFeatureExtractor.__init__
def __init__(self, cfg, in_channels):
    super(FPN2MLPFeatureExtractor, self).__init__()

    resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    pooler = Pooler(
        output_size=(resolution, resolution),
        scales=scales,
        sampling_ratio=sampling_ratio,
    )
    input_size = in_channels * resolution ** 2
    representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
    use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN
    self.pooler = pooler
    self.fc6 = make_fc(input_size, representation_size, use_gn)
    self.fc7 = make_fc(representation_size, representation_size, use_gn)
    self.out_channels = representation_size
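# Plausible forward for this head (pool, flatten, two fc layers); a sketch
# following the maskrcnn-benchmark convention, not necessarily verbatim:
def forward(self, x, proposals):
    x = self.pooler(x, proposals)
    x = x.view(x.size(0), -1)
    x = F.relu(self.fc6(x))
    x = F.relu(self.fc7(x))
    return x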
# FPNXconv1fcFeatureExtractor.__init__
def __init__(self, cfg, in_channels):
    super(FPNXconv1fcFeatureExtractor, self).__init__()

    resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    pooler = Pooler(
        output_size=(resolution, resolution),
        scales=scales,
        sampling_ratio=sampling_ratio,
    )
    self.pooler = pooler

    use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN
    conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM
    num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS
    dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION

    # Stack of 3x3 convs (optionally with GroupNorm) before the single fc;
    # the conv bias is dropped when GroupNorm provides the affine term.
    xconvs = []
    for ix in range(num_stacked_convs):
        xconvs.append(
            nn.Conv2d(in_channels,
                      conv_head_dim,
                      kernel_size=3,
                      stride=1,
                      padding=dilation,
                      dilation=dilation,
                      bias=False if use_gn else True))
        in_channels = conv_head_dim
        if use_gn:
            xconvs.append(group_norm(in_channels))
        xconvs.append(nn.ReLU(inplace=True))
    self.add_module("xconvs", nn.Sequential(*xconvs))

    for modules in [self.xconvs]:
        for l in modules.modules():
            if isinstance(l, nn.Conv2d):
                torch.nn.init.normal_(l.weight, std=0.01)
                if not use_gn:
                    torch.nn.init.constant_(l.bias, 0)

    input_size = conv_head_dim * resolution ** 2
    representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
    self.fc6 = make_fc(input_size, representation_size, use_gn=False)
    self.out_channels = representation_size
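# Plausible forward for the Xconv1fc head (pool, conv stack, flatten, one
# fc); a sketch following the maskrcnn-benchmark convention:
def forward(self, x, proposals):
    x = self.pooler(x, proposals)
    x = self.xconvs(x)
    x = x.view(x.size(0), -1)
    x = F.relu(self.fc6(x))
    return x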
# MEGAFeatureExtractor.__init__ (the base class, called with
# (cfg, in_channels), is not shown in this snippet).
def __init__(self, cfg, in_channels):
    super(MEGAFeatureExtractor, self).__init__(cfg, in_channels)

    stage = resnet.StageSpec(index=4, block_count=3, return_features=False)
    head = resnet.ResNetHead(
        block_module=cfg.MODEL.RESNETS.TRANS_FUNC,
        stages=(stage,),
        num_groups=cfg.MODEL.RESNETS.NUM_GROUPS,
        width_per_group=cfg.MODEL.RESNETS.WIDTH_PER_GROUP,
        stride_in_1x1=cfg.MODEL.RESNETS.STRIDE_IN_1X1,
        stride_init=1,
        res2_out_channels=cfg.MODEL.RESNETS.RES2_OUT_CHANNELS,
        dilation=cfg.MODEL.RESNETS.RES5_DILATION,
    )
    in_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 2 ** (stage.index - 1)

    if cfg.MODEL.VID.ROI_BOX_HEAD.REDUCE_CHANNEL:
        new_conv = nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)
        nn.init.kaiming_uniform_(new_conv.weight, a=1)
        nn.init.constant_(new_conv.bias, 0)
        output_channel = 256
    else:
        new_conv = None
        output_channel = in_channels

    resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    pooler = Pooler(
        output_size=(resolution, resolution),
        scales=scales,
        sampling_ratio=sampling_ratio,
    )

    self.head = head
    self.conv = new_conv
    self.pooler = pooler

    input_size = output_channel * resolution ** 2
    representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
    use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN

    self.all_frame_interval = cfg.MODEL.VID.MEGA.ALL_FRAME_INTERVAL

    # Local aggregation stages: per-stage fc plus relation-attention weights
    # (Wg: geometric term, Wq/Wk: query/key projections, Wv: grouped value
    # fusion, u: learned location-free bias).
    if cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.ENABLE:
        self.embed_dim = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.EMBED_DIM
        self.groups = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.GROUP
        self.feat_dim = representation_size
        self.stage = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.STAGE

        self.base_num = cfg.MODEL.VID.RPN.REF_POST_NMS_TOP_N
        self.advanced_num = int(self.base_num * cfg.MODEL.VID.MEGA.RATIO)

        fcs, Wgs, Wqs, Wks, Wvs, us = [], [], [], [], [], []
        for i in range(self.stage):
            r_size = input_size if i == 0 else representation_size
            fcs.append(make_fc(r_size, representation_size, use_gn))
            Wgs.append(
                Conv2d(self.embed_dim,
                       self.groups,
                       kernel_size=1,
                       stride=1,
                       padding=0))
            Wqs.append(make_fc(self.feat_dim, self.feat_dim))
            Wks.append(make_fc(self.feat_dim, self.feat_dim))
            Wvs.append(
                Conv2d(self.feat_dim * self.groups,
                       self.feat_dim,
                       kernel_size=1,
                       stride=1,
                       padding=0,
                       groups=self.groups))
            us.append(
                nn.Parameter(torch.Tensor(self.groups, 1, self.embed_dim)))
            for l in [Wgs[i], Wvs[i]]:
                torch.nn.init.normal_(l.weight, std=0.01)
                torch.nn.init.constant_(l.bias, 0)
            for weight in [us[i]]:
                torch.nn.init.normal_(weight, std=0.01)
        self.l_fcs = nn.ModuleList(fcs)
        self.l_Wgs = nn.ModuleList(Wgs)
        self.l_Wqs = nn.ModuleList(Wqs)
        self.l_Wks = nn.ModuleList(Wks)
        self.l_Wvs = nn.ModuleList(Wvs)
        self.l_us = nn.ParameterList(us)

        # Long Range Memory
        self.memory_enable = cfg.MODEL.VID.MEGA.MEMORY.ENABLE
        if self.memory_enable:
            self.memory_size = cfg.MODEL.VID.MEGA.MEMORY.SIZE

        # Global Aggregation Stage (location-free: no geometric Wg here).
        self.global_enable = cfg.MODEL.VID.MEGA.GLOBAL.ENABLE
        if self.global_enable:
            self.global_size = cfg.MODEL.VID.MEGA.GLOBAL.SIZE
            self.global_res_stage = cfg.MODEL.VID.MEGA.GLOBAL.RES_STAGE

            Wqs, Wks, Wvs, us = [], [], [], []
            for i in range(self.global_res_stage + 1):
                Wqs.append(make_fc(self.feat_dim, self.feat_dim))
                Wks.append(make_fc(self.feat_dim, self.feat_dim))
                Wvs.append(
                    Conv2d(self.feat_dim * self.groups,
                           self.feat_dim,
                           kernel_size=1,
                           stride=1,
                           padding=0,
                           groups=self.groups))
                us.append(
                    nn.Parameter(torch.Tensor(self.groups, 1, self.embed_dim)))
                for l in [Wvs[i]]:
                    torch.nn.init.normal_(l.weight, std=0.01)
                    torch.nn.init.constant_(l.bias, 0)
                for weight in [us[i]]:
                    torch.nn.init.normal_(weight, std=0.01)
            self.g_Wqs = nn.ModuleList(Wqs)
            self.g_Wks = nn.ModuleList(Wks)
            self.g_Wvs = nn.ModuleList(Wvs)
            self.g_us = nn.ParameterList(us)

    self.out_channels = representation_size
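# Illustrative only: a minimal sketch of the grouped, location-free
# relation-attention step the l_*/g_* parameters above support (query/key
# projections Wq/Wk, learned bias u, grouped 1x1 value fusion Wv). Shapes
# assume embed_dim == feat_dim // groups, as in the default config; the
# repo's real implementation also adds a geometric term via Wg in the local
# stages and differs in other details.
def relation_attention_sketch(q_feat, k_feat, Wq, Wk, Wv, u, groups):
    # q_feat: [Nq, feat_dim] query RoI features; k_feat: [Nk, feat_dim]
    Nq, feat_dim = q_feat.shape
    Nk = k_feat.size(0)
    dim_group = feat_dim // groups

    # Project and split into `groups` heads: [groups, N, dim_group].
    q = Wq(q_feat).view(Nq, groups, dim_group).permute(1, 0, 2)
    k = Wk(k_feat).view(Nk, groups, dim_group).permute(1, 0, 2)

    # Content affinity q.k plus the location-free bias u.k (u broadcasts
    # over queries); scale and softmax over the reference boxes.
    aff = torch.bmm(q, k.transpose(1, 2)) + torch.bmm(u, k.transpose(1, 2))
    aff = F.softmax(aff / dim_group ** 0.5, dim=2)            # [groups, Nq, Nk]

    # Each head aggregates the full (unprojected) reference features, then
    # the grouped 1x1 conv Wv mixes the heads back down to feat_dim.
    out = torch.matmul(aff.reshape(groups * Nq, Nk), k_feat)  # [groups*Nq, feat_dim]
    out = out.view(groups, Nq, feat_dim).permute(1, 0, 2)
    out = out.reshape(Nq, groups * feat_dim, 1, 1)
    return Wv(out).view(Nq, feat_dim)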
# RDNFeatureExtractor.__init__ (the base class, called with
# (cfg, in_channels), is not shown in this snippet).
def __init__(self, cfg, in_channels):
    super(RDNFeatureExtractor, self).__init__(cfg, in_channels)

    stage = resnet.StageSpec(index=4, block_count=3, return_features=False)
    head = resnet.ResNetHead(
        block_module=cfg.MODEL.RESNETS.TRANS_FUNC,
        stages=(stage,),
        num_groups=cfg.MODEL.RESNETS.NUM_GROUPS,
        width_per_group=cfg.MODEL.RESNETS.WIDTH_PER_GROUP,
        stride_in_1x1=cfg.MODEL.RESNETS.STRIDE_IN_1X1,
        stride_init=1,
        res2_out_channels=cfg.MODEL.RESNETS.RES2_OUT_CHANNELS,
        dilation=cfg.MODEL.RESNETS.RES5_DILATION,
    )
    in_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 2 ** (stage.index - 1)

    if cfg.MODEL.VID.ROI_BOX_HEAD.REDUCE_CHANNEL:
        new_conv = nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)
        nn.init.kaiming_uniform_(new_conv.weight, a=1)
        nn.init.constant_(new_conv.bias, 0)
        output_channel = 256
    else:
        new_conv = None
        output_channel = in_channels

    resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    pooler = Pooler(
        output_size=(resolution, resolution),
        scales=scales,
        sampling_ratio=sampling_ratio,
    )

    self.head = head
    self.conv = new_conv
    self.pooler = pooler

    input_size = output_channel * resolution ** 2
    representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
    use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN

    if cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.ENABLE:
        self.embed_dim = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.EMBED_DIM
        self.groups = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.GROUP
        self.feat_dim = representation_size

        self.base_stage = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.STAGE
        self.advanced_stage = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.ADVANCED_STAGE

        self.base_num = cfg.MODEL.VID.RPN.REF_POST_NMS_TOP_N
        self.advanced_num = int(self.base_num * cfg.MODEL.VID.RDN.RATIO)

        # Relation modules for the base stages plus the advanced stages.
        # The final stage gets attention weights but no new fc (it reuses the
        # previous stage's features), and with no advanced stage the loop
        # stops after the base stages.
        fcs, Wgs, Wqs, Wks, Wvs = [], [], [], [], []
        for i in range(self.base_stage + self.advanced_stage + 1):
            r_size = input_size if i == 0 else representation_size

            if i == self.base_stage and self.advanced_stage == 0:
                break

            if i != self.base_stage + self.advanced_stage:
                fcs.append(make_fc(r_size, representation_size, use_gn))
            Wgs.append(
                Conv2d(self.embed_dim,
                       self.groups,
                       kernel_size=1,
                       stride=1,
                       padding=0))
            Wqs.append(make_fc(self.feat_dim, self.feat_dim))
            Wks.append(make_fc(self.feat_dim, self.feat_dim))
            Wvs.append(
                Conv2d(self.feat_dim * self.groups,
                       self.feat_dim,
                       kernel_size=1,
                       stride=1,
                       padding=0,
                       groups=self.groups))
            for l in [Wgs[i], Wvs[i]]:
                torch.nn.init.normal_(l.weight, std=0.01)
                torch.nn.init.constant_(l.bias, 0)
        self.fcs = nn.ModuleList(fcs)
        self.Wgs = nn.ModuleList(Wgs)
        self.Wqs = nn.ModuleList(Wqs)
        self.Wks = nn.ModuleList(Wks)
        self.Wvs = nn.ModuleList(Wvs)

    self.out_channels = representation_size
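# Illustrative wiring (an assumption: maskrcnn-benchmark-style repos expose a
# ROI_BOX_FEATURE_EXTRACTORS registry and build the extractor named in the
# config; the registry and factory names here are not taken from this
# snippet):
def make_roi_box_feature_extractor(cfg, in_channels):
    func = registry.ROI_BOX_FEATURE_EXTRACTORS[
        cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR]
    return func(cfg, in_channels)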