def __init__(self, args):
    super(ModelPaperBaselineN_batch_lambda, self).__init__()
    self.args = args
    self.word_size = args.word_size
    self.layer0 = LambdaLayer(
        dim=len(self.args.inputs_type),      # channels going in
        dim_out=args.out_channel0,           # channels out
        n=args.word_size * args.Nbatch,      # total number of input positions (h * w)
        dim_k=16,                            # key dimension
        heads=4,                             # number of heads, for multi-query
        dim_u=1                              # 'intra-depth' dimension
    )
    self.BN0 = nn.BatchNorm2d(args.out_channel0, eps=0.01, momentum=0.99)
    self.layers_conv = nn.ModuleList()
    self.layers_batch = nn.ModuleList()
    self.numLayers = args.numLayers
    for i in range(args.numLayers - 1):
        self.layers_conv.append(LambdaLayer(
            dim=args.out_channel1,           # channels going in
            dim_out=args.out_channel1,       # channels out
            n=args.word_size * args.Nbatch,  # total number of input positions (h * w)
            dim_k=16,                        # key dimension
            heads=4,                         # number of heads, for multi-query
            dim_u=1                          # 'intra-depth' dimension
        ))
        self.layers_batch.append(nn.BatchNorm2d(args.out_channel1, eps=0.01, momentum=0.99))
    self.fc1 = nn.Linear(args.out_channel1 * args.word_size * args.Nbatch, args.hidden1)
    self.BN5 = nn.BatchNorm1d(args.hidden1, eps=0.01, momentum=0.99)
    self.fc2 = nn.Linear(args.hidden1, args.hidden1)
    self.BN6 = nn.BatchNorm1d(args.hidden1, eps=0.01, momentum=0.99)
    self.fc3 = nn.Linear(args.hidden1, 1)
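# A minimal shape check for the layer0 block above. This is a sketch, not code
# from the source: it assumes the older lambda_networks API in which `n` is the
# total sequence length (h * w), and assumes the model feeds its 1-D signal to
# the layer reshaped as a (batch, channels, 1, word_size * Nbatch) map. The
# concrete sizes below are illustrative.
import torch
from lambda_networks import LambdaLayer

word_size, n_batch, in_ch = 16, 4, 2
layer0 = LambdaLayer(dim=in_ch, dim_out=8, n=word_size * n_batch,
                     dim_k=16, heads=4, dim_u=1)
x = torch.randn(1, in_ch, 1, word_size * n_batch)  # 1-D signal as a 1 x n "image"
print(layer0(x).shape)  # expected: torch.Size([1, 8, 1, 64])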
def lambda_conv(in_channels, out_channels, kernel_size, bias=True, dilation=1):
    # Note: kernel_size, bias, and dilation are accepted only for interface
    # compatibility with a conv factory; they are ignored, and the LambdaLayer
    # always uses a fixed 23 x 23 local context.
    return LambdaLayer(dim=in_channels, dim_out=out_channels, r=23, dim_k=16,
                       heads=get_heads_count(out_channels), dim_u=1)
def lambda_conv(in_channels, out_channels, **kwargs):
    # Variant taking **kwargs; any extra conv arguments are likewise ignored.
    return LambdaLayer(dim=in_channels, dim_out=out_channels, r=23, dim_k=16,
                       heads=get_heads_count(out_channels), dim_u=1)
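# Usage sketch for the two lambda_conv factories above (not from the source).
# With a local context `r`, the LambdaLayer preserves spatial size, so it can
# stand in for a padded 3x3 conv. get_heads_count is assumed to return a head
# count that divides out_channels, since LambdaLayer requires
# dim_out % heads == 0.
import torch

conv = lambda_conv(64, 128, kernel_size=3)  # spatial params are ignored internally
x = torch.randn(2, 64, 32, 32)
print(conv(x).shape)  # torch.Size([2, 128, 32, 32])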
def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        switch_breadth: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = 'zeros',
        # A 'coupler' is a latent converter that turns any (b, c, h, w) tensor
        # into a compatible switched-conv selector by applying a 1x1 conv, a
        # softmax, and an interpolation.
        include_coupler: bool = False,
        coupler_mode: str = 'standard',
        coupler_dim_in: int = 0):
    super().__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding = padding
    self.dilation = dilation
    self.padding_mode = padding_mode
    self.groups = groups

    if include_coupler:
        if coupler_mode == 'standard':
            self.coupler = Conv2d(coupler_dim_in, switch_breadth, kernel_size=1)
        elif coupler_mode == 'lambda':
            self.coupler = LambdaLayer(dim=coupler_dim_in, dim_out=switch_breadth,
                                       r=23, dim_k=16, heads=2, dim_u=1)
    else:
        self.coupler = None

    self.weights = nn.ParameterList([
        nn.Parameter(torch.Tensor(out_channels, in_channels // groups,
                                  kernel_size, kernel_size))
        for _ in range(switch_breadth)
    ])
    if bias:
        self.bias = nn.Parameter(torch.Tensor(out_channels))
    else:
        self.register_parameter('bias', None)
    self.reset_parameters()
def __init__(self):
    super().__init__()
    self.layer1 = LambdaLayer(
        dim=3,        # channels going in
        dim_out=16,   # channels out
        r=23,         # receptive field for relative positional encoding (23 x 23)
        dim_k=32,     # key dimension
        heads=1,      # number of heads, for multi-query
        dim_u=4       # 'intra-depth' dimension
    )
    self.layer2 = LambdaLayer(
        dim=16,       # channels going in
        dim_out=3,    # channels out
        r=15,         # receptive field for relative positional encoding (15 x 15)
        dim_k=16,     # key dimension
        heads=1,      # number of heads, for multi-query
        dim_u=4       # 'intra-depth' dimension
    )
    self.last_conv = torch.nn.Conv2d(3, 3, 1, bias=False)
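# The snippet above defines only __init__; a forward pass along these lines is
# the natural composition of the three modules, but it is an assumption, not
# source code.
def forward(self, x):          # x: (b, 3, h, w)
    x = self.layer1(x)         # (b, 16, h, w), 23 x 23 lambda context
    x = self.layer2(x)         # (b, 3, h, w), 15 x 15 lambda context
    return self.last_conv(x)   # 1x1 conv, no bias, channels unchanged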
def __init__(self, in_channels, middle_channels, out_channels):
    super(DecoderLambda, self).__init__()
    # Note: middle_channels is accepted but unused; both convs map directly
    # to out_channels.
    self.block = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_channels),
        # nn.ReLU(inplace=True),
    )
    self.lambda_layer = LambdaLayer(
        dim=out_channels,
        dim_out=out_channels,
        r=7,          # receptive field for relative positional encoding (7 x 7)
        dim_k=16,
        heads=1,
        dim_u=1)
    self._initialize_weights()
def conv_dw(inp: int, oup: int, stride: int, layer_type: str) -> nn.Module:
    # inp/oup are channel counts; the return type is nn.Module because the
    # "l" and fallback branches do not return an nn.Sequential.
    if layer_type == "c":
        return nn.Sequential(
            nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
            nn.BatchNorm2d(inp),
            nn.ReLU(inplace=True),
            nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
            nn.ReLU(inplace=True),
        )
    elif layer_type == "l":
        # Note: stride is ignored here; the LambdaLayer preserves spatial size.
        return LambdaLayer(
            dim=inp,
            dim_out=oup,
            r=3,
            dim_k=4,
            heads=4,
            dim_u=1,
        )
    else:
        return Identity()
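# Usage sketch (not from the source): the "l" branch swaps the depthwise-
# separable conv block for a LambdaLayer. The two branches only match output
# shapes at stride 1, because the LambdaLayer branch ignores `stride` and
# always preserves spatial size.
import torch

block_c = conv_dw(32, 64, stride=1, layer_type="c")
block_l = conv_dw(32, 64, stride=1, layer_type="l")
x = torch.randn(1, 32, 28, 28)
assert block_c(x).shape == block_l(x).shape == (1, 64, 28, 28)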
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features: torch.Tensor, pe_dim=0):
    super(DecoderSparse, self).__init__()

    # fmt: off
    self.in_features = in_features
    feature_strides = {k: v.stride for k, v in input_shape.items()}
    feature_channels = {k: v.channels for k, v in input_shape.items()}
    # num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
    num_classes = 75
    conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
    self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
    norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    self.use_agg_feat = cfg.MODEL.CONDINST.IUVHead.USE_AGG_FEATURES
    self.use_ins_gn = cfg.MODEL.CONDINST.IUVHead.INSTANCE_AWARE_GN
    self.checkpoint_grad_num = cfg.MODEL.CONDINST.CHECKPOINT_GRAD_NUM
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    self.use_aux_global_s = cfg.MODEL.CONDINST.AUX_SUPERVISION_GLOBAL_S
    self.use_aux_global_skeleton = cfg.MODEL.CONDINST.AUX_SUPERVISION_GLOBAL_SKELETON
    self.use_aux_body_semantics = cfg.MODEL.CONDINST.AUX_SUPERVISION_BODY_SEMANTICS
    if self.use_aux_global_s:
        num_classes += 1
    if self.use_aux_global_skeleton:
        # TODO: to check
        num_classes += 55
    if self.use_aux_body_semantics:
        num_classes += 15
    self.predictor_conv_type = cfg.MODEL.CONDINST.IUVHead.PREDICTOR_TYPE
    self.use_dropout = cfg.MODEL.CONDINST.IUVHead.DROPOUT
    self.use_san = cfg.MODEL.CONDINST.IUVHead.USE_SAN
    self.san_type = cfg.MODEL.CONDINST.SAN_TYPE
    # fmt: on

    # if not self.use_agg_feat:
    #     self.scale_heads = []
    #     for in_feature in self.in_features:
    #         head_ops = []
    #         head_length = max(
    #             1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
    #         )
    #         for k in range(head_length):
    #             conv = Conv2d(
    #                 feature_channels[in_feature] if k == 0 else conv_dims,
    #                 conv_dims,
    #                 kernel_size=3,
    #                 stride=1,
    #                 padding=1,
    #                 bias=not norm,
    #                 norm=get_norm(norm, conv_dims),
    #                 activation=F.relu,
    #             )
    #             weight_init.c2_msra_fill(conv)
    #             head_ops.append(conv)
    #         if feature_strides[in_feature] != self.common_stride:
    #             head_ops.append(
    #                 nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
    #             )
    #         self.scale_heads.append(nn.Sequential(*head_ops))
    #         self.add_module(in_feature, self.scale_heads[-1])
    # self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)

    if num_lambda_layer > 0:
        self.comb_pe_conv = LambdaLayer(
            dim=agg_channels + pe_dim,
            dim_out=agg_channels,
            r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
            dim_k=16,
            heads=4,
            dim_u=4
        )
    else:
        self.comb_pe_conv = Conv2d(
            agg_channels + pe_dim,
            agg_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=not norm,
            norm=get_norm(norm, agg_channels),
            activation=F.relu,
        )
    if self.use_san:
        sa_type = 1  # 0: pairwise; 1: patchwise
        if self.san_type == "SAN_BottleneckGN":
            san_func = SAN_BottleneckGN
        elif self.san_type == "SAN_BottleneckGN_GatedEarly":
            san_func = SAN_BottleneckGN_GatedEarly
        elif self.san_type == "SAN_BottleneckGN_Gated":
            san_func = SAN_BottleneckGN_Gated
        self.san_blk_1 = san_func(sa_type, agg_channels, agg_channels // 16,
                                  agg_channels // 4, agg_channels, 8,
                                  kernel_size=7, stride=1)
    # weight_init.c2_msra_fill(self.comb_pe_conv)
    if self.use_dropout:
        self.dropout_layer = nn.Dropout2d(0.25)

    self.densepose_head = build_densepose_head(cfg, agg_channels)

    if self.predictor_conv_type == "conv":
        self.predictor = Conv2d(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM, num_classes, 1,
            stride=1, padding=0
        )
        initialize_module_params(self.predictor)
    elif self.predictor_conv_type == "dcnv1":
        self.predictor = deform_conv.DFConv2d(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM, num_classes,
            with_modulated_dcn=False, kernel_size=3
        )
    elif self.predictor_conv_type == "dcnv2":
        self.predictor = deform_conv.DFConv2d(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM, num_classes,
            with_modulated_dcn=True, kernel_size=3
        )
    elif self.predictor_conv_type == "dcnv2Conv":
        self.predictor = []
        self.predictor.append(deform_conv.DFConv2d(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM,
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM,
            with_modulated_dcn=True, kernel_size=3
        ))
        self.predictor.append(Conv2d(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM, num_classes, 1,
            stride=1, padding=0
        ))
        initialize_module_params(self.predictor[-1])
        self.predictor = nn.Sequential(*self.predictor)
    elif self.predictor_conv_type == "dcnv2ResConv":
        self.predictor = []
        self.predictor.append(deform_conv.DeformBottleneckBlock(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM,
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM,
            bottleneck_channels=cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM,
            deform_modulated=True
        ))
        self.predictor.append(Conv2d(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM, num_classes, 1,
            stride=1, padding=0
        ))
        initialize_module_params(self.predictor[-1])
        self.predictor = nn.Sequential(*self.predictor)
    elif self.predictor_conv_type == "sparse":
        # self.predictor = nn.Identity()
        conv = sparse_conv_with_kaiming_uniform(norm=None, activation=None,
                                                use_sep=False, use_submconv=True,
                                                use_deconv=False)
        self.predictor = conv(
            cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM,
            num_classes,
            kernel_size=3,
            stride=1,
            dilation=1,
            indice_key="subm0",
        )
def __init__(self, cfg, use_rel_coords=True):
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    num_dcn_layer = cfg.MODEL.CONDINST.IUVHead.NUM_DCN_LAYER
    assert num_lambda_layer <= num_convs
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
    self.use_down_up_sampling = cfg.MODEL.CONDINST.IUVHead.DOWN_UP_SAMPLING
    self.use_partial_conv = cfg.MODEL.CONDINST.IUVHead.PARTIAL_CONV
    self.use_partial_norm = cfg.MODEL.CONDINST.IUVHead.PARTIAL_NORM

    # if self.use_rel_coords:
    #     self.in_channels = channels + 2
    # else:
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs > 0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(
            multires=self.pos_emb_num_freqs, input_dims=2)
        self.in_channels = agg_channels + self.position_emb_dim
    else:
        self.in_channels = agg_channels + 2
    if self.use_abs_coords:
        if self.use_pos_emb:
            self.in_channels += self.position_emb_dim
        else:
            self.in_channels += 2

    conv_block = conv_with_kaiming_uniform(norm, activation=True)
    partial_conv_block = conv_with_kaiming_uniform(norm, activation=True,
                                                   use_partial_conv=True)
    deform_conv_block = conv_with_kaiming_uniform(norm, activation=True,
                                                  use_deformable=True)

    tower = []
    if self.use_partial_conv:
        tower.append(partial_conv_block(self.in_channels, channels, 3, 1))
        self.in_channels = channels
    if num_lambda_layer > 0:
        tower.append(LambdaLayer(
            dim=self.in_channels,
            dim_out=channels,
            r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
            dim_k=16,
            heads=4,
            dim_u=4
        ))
    else:
        tower.append(conv_block(self.in_channels, channels, 3, 1))
    if num_dcn_layer > 0:
        tower.append(deform_conv_block(channels, channels, 3, 1))
    if self.use_down_up_sampling:
        for i in range(1, num_convs):
            if i == 1:
                tower.append(conv_block(channels, channels * 2, 3, 2))
            else:
                tower.append(conv_block(channels * 2, channels * 2, 3, 1))
        tower.append(ConvTranspose2d(channels * 2, self.num_outputs, 4,
                                     stride=2, padding=int(4 / 2 - 1)))
    else:
        for i in range(1, num_convs):
            tower.append(conv_block(channels, channels, 3, 1))
        tower.append(nn.Conv2d(channels, max(self.num_outputs, 1), 1))
    self.add_module('tower', nn.Sequential(*tower))
def __init__(self, cfg, use_rel_coords=True):
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    assert num_lambda_layer <= num_convs
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS

    # if self.use_rel_coords:
    #     self.in_channels = channels + 2
    # else:
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs > 0
    extra_channels = 0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(
            multires=self.pos_emb_num_freqs, input_dims=2)
        extra_channels += self.position_emb_dim
    else:
        extra_channels += 2
    if self.use_abs_coords:
        if self.use_pos_emb:
            extra_channels += self.position_emb_dim
        else:
            extra_channels += 2

    conv_block = conv_with_kaiming_uniform(norm, activation=True)

    cnt = 0
    self.layers = []
    if num_lambda_layer > 0:
        layer = LambdaLayer(
            dim=agg_channels + extra_channels,
            dim_out=channels,
            r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
            dim_k=16,
            heads=4,
            dim_u=4)
    else:
        # The first layer consumes the aggregated features plus the coordinate
        # channels, matching the lambda branch above.
        layer = conv_block(agg_channels + extra_channels, channels, 3, 1)
    setattr(self, 'layer_{}'.format(cnt), layer)
    self.layers.append(layer)
    cnt += 1
    for i in range(1, num_convs):
        if i < num_lambda_layer:
            layer = LambdaLayer(
                dim=channels + extra_channels,
                dim_out=channels,
                r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
                dim_k=16,
                heads=4,
                dim_u=4)
        else:
            layer = conv_block(channels + extra_channels, channels, 3, 1)
        setattr(self, 'layer_{}'.format(cnt), layer)
        self.layers.append(layer)
        cnt += 1
    layer = nn.Conv2d(channels + extra_channels, max(self.num_outputs, 1), 1)
    setattr(self, 'layer_{}'.format(cnt), layer)
    self.layers.append(layer)
def __init__(
        self,
        in_c,
        out_c,
        kernel_sz,
        breadth,
        stride=1,
        bias=True,
        dropout_rate=0.0,
        # A 'coupler' is a latent converter that turns any (b, c, h, w) tensor
        # into a compatible switched-conv selector by applying a 1x1 conv, a
        # softmax, and an interpolation.
        include_coupler: bool = False,
        coupler_mode: str = 'standard',
        coupler_dim_in: int = 0,
        # Test switch: in 'emulation mode' (all convs computed with torch
        # functions) computes soft attention instead of hard attention.
        hard_en=True,
        # When set, performs an nn.Conv2d operation per breadth. When False,
        # uses the native CUDA implementation, which computes all switches
        # concurrently.
        emulate_swconv=True,
):
    super().__init__()
    self.in_channels = in_c
    self.out_channels = out_c
    self.kernel_size = kernel_sz
    self.stride = stride
    self.has_bias = bias
    self.breadth = breadth
    self.dropout_rate = dropout_rate

    if include_coupler:
        if coupler_mode == 'standard':
            self.coupler = Conv2d(coupler_dim_in, breadth, kernel_size=1, stride=self.stride)
        elif coupler_mode == 'lambda':
            self.coupler = nn.Sequential(
                nn.Conv2d(coupler_dim_in, coupler_dim_in, 1),
                nn.BatchNorm2d(coupler_dim_in),
                nn.ReLU(),
                LambdaLayer(dim=coupler_dim_in, dim_out=breadth, r=23, dim_k=16, heads=2, dim_u=1),
                nn.BatchNorm2d(breadth),
                nn.ReLU(),
                Conv2d(breadth, breadth, 1, stride=self.stride))
        elif coupler_mode == 'lambda2':
            self.coupler = nn.Sequential(
                nn.Conv2d(coupler_dim_in, coupler_dim_in, 1),
                nn.GroupNorm(num_groups=2, num_channels=coupler_dim_in),
                nn.ReLU(),
                LambdaLayer(dim=coupler_dim_in, dim_out=coupler_dim_in, r=23, dim_k=16, heads=2, dim_u=1),
                nn.GroupNorm(num_groups=2, num_channels=coupler_dim_in),
                nn.ReLU(),
                LambdaLayer(dim=coupler_dim_in, dim_out=breadth, r=23, dim_k=16, heads=2, dim_u=1),
                nn.GroupNorm(num_groups=1, num_channels=breadth),
                nn.ReLU(),
                Conv2d(breadth, breadth, 1, stride=self.stride))
    else:
        self.coupler = None
    self.gate = HardRoutingGate(breadth, hard_en=True)
    self.hard_en = hard_en

    self.weight = nn.Parameter(torch.empty(out_c, in_c, breadth, kernel_sz, kernel_sz))
    if bias:
        self.bias = nn.Parameter(torch.empty(out_c))
    else:
        # Note: a plain tensor, not a parameter; it is neither trained nor
        # moved with .to()/.cuda().
        self.bias = torch.zeros(out_c)
    self.reset_parameters()
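# Sketch of how the 'lambda' coupler above is meant to be used: it turns an
# arbitrary (b, c, h, w) latent into per-switch selection logits of width
# `breadth`. SwitchedConvHardRouting is an assumed name for the class whose
# __init__ appears above; the sizes are illustrative.
import torch

sw = SwitchedConvHardRouting(in_c=32, out_c=64, kernel_sz=3, breadth=8,
                             include_coupler=True, coupler_mode='lambda',
                             coupler_dim_in=32)
logits = sw.coupler(torch.randn(2, 32, 16, 16))
print(logits.shape)  # torch.Size([2, 8, 16, 16])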
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features: torch.Tensor, pe_dim=0):
    super(Decoder, self).__init__()

    # fmt: off
    self.in_features = in_features
    feature_strides = {k: v.stride for k, v in input_shape.items()}
    feature_channels = {k: v.channels for k, v in input_shape.items()}
    # num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
    num_classes = 75
    conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
    self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
    norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    self.use_ins_gn = cfg.MODEL.CONDINST.IUVHead.INSTANCE_AWARE_GN
    # fmt: on

    self.scale_heads = []
    for in_feature in self.in_features:
        head_ops = []
        head_length = max(
            1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride)))
        for k in range(head_length):
            conv = Conv2d(
                feature_channels[in_feature] if k == 0 else conv_dims,
                conv_dims,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=not norm,
                norm=get_norm(norm, conv_dims),
                activation=F.relu,
            )
            weight_init.c2_msra_fill(conv)
            head_ops.append(conv)
            if feature_strides[in_feature] != self.common_stride:
                head_ops.append(
                    nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False))
        self.scale_heads.append(nn.Sequential(*head_ops))
        self.add_module(in_feature, self.scale_heads[-1])

    # self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
    if num_lambda_layer > 0:
        self.comb_pe_conv = LambdaLayer(
            dim=conv_dims + pe_dim,
            dim_out=conv_dims,
            r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
            dim_k=16,
            heads=4,
            dim_u=4)
    else:
        self.comb_pe_conv = Conv2d(
            conv_dims + pe_dim,
            conv_dims,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=not norm,
            norm=get_norm(norm, conv_dims),
            activation=F.relu,
        )
    # weight_init.c2_msra_fill(self.comb_pe_conv)
    self.densepose_head = build_densepose_head(cfg, conv_dims)
    self.predictor = Conv2d(cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM,
                            num_classes, 1, stride=1, padding=0)
    initialize_module_params(self.predictor)
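# Worked example for the scale-head sizing above (illustrative, not from the
# source): a feature at stride 16 with common_stride 4 gives
# head_length = max(1, int(np.log2(16) - np.log2(4))) = 2, i.e. two 3x3 convs,
# each followed by a 2x bilinear upsample, taking the map from stride 16 to 4.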
def __init__(self, cfg, use_rel_coords=True):
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    assert num_lambda_layer <= num_convs
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
    self.use_partial_conv = cfg.MODEL.CONDINST.IUVHead.PARTIAL_CONV
    self.use_partial_norm = cfg.MODEL.CONDINST.IUVHead.PARTIAL_NORM

    # if self.use_rel_coords:
    #     self.in_channels = channels + 2
    # else:
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs > 0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(
            multires=self.pos_emb_num_freqs, input_dims=2)
        self.in_channels = agg_channels + self.position_emb_dim
    else:
        self.in_channels = agg_channels + 2
    if self.use_abs_coords:
        if self.use_pos_emb:
            self.in_channels += self.position_emb_dim
        else:
            self.in_channels += 2

    if self.use_partial_conv:
        conv_block = conv_with_kaiming_uniform(norm, activation=True, use_partial_conv=True)
    else:
        conv_block = conv_with_kaiming_uniform(norm, activation=True)
    conv_block_bn = conv_with_kaiming_uniform("BN", activation=True)

    # tower_attn = []
    # tower_attn.append(conv_block_bn(self.position_emb_dim, 32, 3, 1))
    # tower_attn.append(nn.Conv2d(32, 3, 3, stride=1, padding=1))
    # self.add_module('tower_attn', nn.Sequential(*tower_attn))

    # Three parallel towers with identical structure, fused later by tower_out.
    num_layer = 3

    tower0 = []
    if num_lambda_layer > 0:
        tower0.append(LambdaLayer(
            dim=self.in_channels,
            dim_out=channels,
            r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
            dim_k=8,
            heads=4,
            dim_u=4))
    else:
        tower0.append(conv_block(self.in_channels, channels, 3, 1))
    for i in range(num_layer):
        tower0.append(conv_block(channels, channels, 3, 1))
    self.add_module('tower0', nn.Sequential(*tower0))

    tower1 = []
    if num_lambda_layer > 0:
        tower1.append(LambdaLayer(
            dim=self.in_channels,
            dim_out=channels,
            r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
            dim_k=8,
            heads=4,
            dim_u=4))
    else:
        tower1.append(conv_block(self.in_channels, channels, 3, 1))
    for i in range(num_layer):
        tower1.append(conv_block(channels, channels, 3, 1))
    self.add_module('tower1', nn.Sequential(*tower1))

    tower2 = []
    if num_lambda_layer > 0:
        tower2.append(LambdaLayer(
            dim=self.in_channels,
            dim_out=channels,
            r=lambda_layer_r,  # receptive field for relative positional encoding (r x r)
            dim_k=8,
            heads=4,
            dim_u=4))
    else:
        tower2.append(conv_block(self.in_channels, channels, 3, 1))
    for i in range(num_layer):
        tower2.append(conv_block(channels, channels, 3, 1))
    self.add_module('tower2', nn.Sequential(*tower2))

    tower_out = []
    for i in range(num_convs - num_layer - 1):
        if i == 0:
            tower_out.append(conv_block(channels * 3, channels, 1, 1))
        else:
            tower_out.append(conv_block(channels, channels, 3, 1))
    self.add_module('tower_out', nn.Sequential(*tower_out))
def __init__(self, cfg, use_rel_coords=True):
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    assert num_lambda_layer <= num_convs
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS

    # if self.use_rel_coords:
    #     self.in_channels = channels + 2
    # else:
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs > 0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(
            multires=self.pos_emb_num_freqs, input_dims=2)
        self.in_channels = channels + self.position_emb_dim
    else:
        self.in_channels = channels + 2
    if self.use_abs_coords:
        if self.use_pos_emb:
            self.in_channels += self.position_emb_dim
        else:
            self.in_channels += 2

    conv_block = conv_with_kaiming_uniform(norm, activation=True)

    tower = []
    if num_lambda_layer > 0:
        tower.append(LambdaLayer(
            dim=self.in_channels,
            dim_out=channels,
            r=23,  # receptive field for relative positional encoding (23 x 23)
            dim_k=16,
            heads=4,
            dim_u=4))
    else:
        tower.append(conv_block(self.in_channels, channels, 3, 1))
    for i in range(1, num_convs - 1):
        if i < num_lambda_layer:
            tower.append(LambdaLayer(
                dim=channels,
                dim_out=channels,
                r=23,  # receptive field for relative positional encoding (23 x 23)
                dim_k=16,
                heads=4,
                dim_u=4))
        else:
            tower.append(conv_block(channels, channels, 3, 1))
    self.add_module('tower', nn.Sequential(*tower))

    # Output heads at three resolutions: stride-preserving (mid), strided
    # downsampling (low), and transposed-conv upsampling (high).
    self.mid_res_conv = conv_block(channels, channels, 3, 1)
    self.mid_res_out = nn.Conv2d(channels, self.num_outputs, 1)
    self.low_res_conv = conv_block(channels, channels, 3, 2)
    self.low_res_out = nn.Conv2d(channels, self.num_outputs, 1)
    deconv_block = conv_with_kaiming_uniform(norm, activation=True, use_deconv=True)
    self.high_res_conv = deconv_block(channels, channels, 3, 2)
    self.high_res_out = nn.Conv2d(channels, self.num_outputs, 1)