def forward(self, x):
    """ The input should be of size [batch_size, 3, img_h, img_w] """
    _, _, img_h, img_w = x.size()
    cfg._tmp_img_h = img_h
    cfg._tmp_img_w = img_w  #A

    with timer.env('backbone'):
        outs = self.backbone(x)
        # outs[0].size() = (n, 256, 138, 138)
        # outs[1].size() = (n, 512, 69, 69)
        # outs[2].size() = (n, 1024, 34, 34)
        # outs[3].size() = (n, 2048, 17, 17)  #B

    if cfg.fpn is not None:
        with timer.env('fpn'):
            # Use backbone.selected_layers because we overwrote self.selected_layers
            # Note that selected_layers was redefined while constructing the backbone!
            # outs = outs[1, 2, 3]
            outs = [outs[i] for i in cfg.backbone.selected_layers]
            outs = self.fpn(outs)
            # outs[0]  # (n, 256, 69, 69)
            # outs[1]  # (n, 256, 34, 34)
            # outs[2]  # (n, 256, 17, 17)
            # outs[3]  # (n, 256, 8, 8)
            # outs[4]  # (n, 256, 4, 4)  #C

    proto_out = None
    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:  # True
        with timer.env('proto'):
            proto_x = x if self.proto_src is None else outs[self.proto_src]  # proto_x = outs[0] = P3

            if self.num_grids > 0:  # self.num_grids = 0
                grids = self.grid.repeat(proto_x.size(0), 1, 1, 1)
                proto_x = torch.cat([proto_x, grids], dim=1)

            # proto_x = (n, 256, 69, 69)
            proto_out = self.proto_net(proto_x)  # Pass P3 through proto_net -> (n, 32, 138, 138)
            proto_out = cfg.mask_proto_prototype_activation(proto_out)  # relu

            if cfg.mask_proto_prototypes_as_features:  # False
                # Clone here because we don't want to permute this, though idk if contiguous makes this unnecessary
                proto_downsampled = proto_out.clone()

                if cfg.mask_proto_prototypes_as_features_no_grad:
                    proto_downsampled = proto_out.detach()

            # Move the features last so the multiplication is easy
            proto_out = proto_out.permute(0, 2, 3, 1).contiguous()  # to (n, h, w, channel) -> (n, 138, 138, 32)

            if cfg.mask_proto_bias:  # False
                bias_shape = [x for x in proto_out.size()]
                bias_shape[-1] = 1
                proto_out = torch.cat([proto_out, torch.ones(*bias_shape)], -1)

    with timer.env('pred_heads'):
        pred_outs = {'loc': [], 'conf': [], 'mask': [], 'priors': []}

        if cfg.use_mask_scoring:  # False
            pred_outs['score'] = []

        if cfg.use_instance_coeff:  # False
            pred_outs['inst'] = []

        for idx, pred_layer in zip(self.selected_layers, self.prediction_layers):  # the 5 prediction layers
            pred_x = outs[idx]

            if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_prototypes_as_features:  # False
                # Scale the prototypes down to the current prediction layer's size and add it as inputs
                proto_downsampled = F.interpolate(proto_downsampled, size=outs[idx].size()[2:],
                                                  mode='bilinear', align_corners=False)
                pred_x = torch.cat([pred_x, proto_downsampled], dim=1)

            # A hack for the way dataparallel works
            if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]:  # True for all but the first layer
                pred_layer.parent = [self.prediction_layers[0]]

            p = pred_layer(pred_x)

            for k, v in p.items():
                pred_outs[k].append(v)

        # Concatenate the prior boxes across all layers for each batch element.
        # Since the priors keep fixed scales and aspect ratios, concatenation preserves their indices.
        for k, v in pred_outs.items():
            pred_outs[k] = torch.cat(v, -2)

    # Put the extracted prototypes into pred_outs as well.
    if proto_out is not None:
        pred_outs['proto'] = proto_out

    if self.training:
        # For the extra loss functions
        if cfg.use_class_existence_loss:  # False
            pred_outs['classes'] = self.class_existence_fc(outs[-1].mean(dim=(2, 3)))

        if cfg.use_semantic_segmentation_loss:  # True
            pred_outs['segm'] = self.semantic_seg_conv(outs[0])

        return pred_outs
    else:
        if cfg.use_mask_scoring:  # False
            pred_outs['score'] = torch.sigmoid(pred_outs['score'])

        if cfg.use_focal_loss:  # False
            if cfg.use_sigmoid_focal_loss:
                # Note: even though conf[0] exists, this mode doesn't train it so don't use it
                pred_outs['conf'] = torch.sigmoid(pred_outs['conf'])
                if cfg.use_mask_scoring:
                    pred_outs['conf'] *= pred_outs['score']
            elif cfg.use_objectness_score:
                # See focal_loss_sigmoid in multibox_loss.py for details
                objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])
                pred_outs['conf'][:, :, 1:] = objectness[:, :, None] * F.softmax(pred_outs['conf'][:, :, 1:], -1)
                pred_outs['conf'][:, :, 0] = 1 - objectness
            else:
                pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)
        else:
            if cfg.use_objectness_score:  # False
                objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])
                pred_outs['conf'][:, :, 1:] = (objectness > 0.10)[..., None] \
                    * F.softmax(pred_outs['conf'][:, :, 1:], dim=-1)
            else:
                pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)

        return self.detect(pred_outs, self)
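# For reference: with the 'lincomb' mask type, the final instance masks are produced
# downstream by linearly combining these prototypes with the per-anchor mask
# coefficients predicted by the heads. A minimal sketch, with shapes taken from the
# annotations above (the tensors here are illustrative, not the repo's actual
# postprocessing code):

import torch

proto = torch.randn(138, 138, 32)   # prototype masks for one image, channels last
coeffs = torch.randn(10, 32)        # mask coefficients for 10 detections kept after NMS

# Linear combination over the prototype channel, then sigmoid -> (138, 138, 10) soft masks
masks = torch.sigmoid(proto @ coeffs.t())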
def forward(self, x):
    """ The input should be of size [batch_size, 3, img_h, img_w] """
    with timer.env('backbone'):
        outs = self.backbone(x)

    if cfg.fpn is not None:
        with timer.env('fpn'):
            # Use backbone.selected_layers because we overwrote self.selected_layers
            outs = [outs[i] for i in cfg.backbone.selected_layers]
            outs = self.fpn(outs)

    proto_out = None
    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        with timer.env('proto'):
            proto_x = x if self.proto_src is None else outs[self.proto_src]

            if self.num_grids > 0:
                grids = self.grid.repeat(proto_x.size(0), 1, 1, 1)
                proto_x = torch.cat([proto_x, grids], dim=1)

            proto_out = self.proto_net(proto_x)
            proto_out = cfg.mask_proto_prototype_activation(proto_out)

            if cfg.mask_proto_prototypes_as_features:
                # Clone here because we don't want to permute this, though idk if contiguous makes this unnecessary
                proto_downsampled = proto_out.clone()

                if cfg.mask_proto_prototypes_as_features_no_grad:
                    proto_downsampled = proto_out.detach()

            # Move the features last so the multiplication is easy
            proto_out = proto_out.permute(0, 2, 3, 1).contiguous()

            if cfg.mask_proto_bias:
                bias_shape = [x for x in proto_out.size()]
                bias_shape[-1] = 1
                proto_out = torch.cat([proto_out, torch.ones(*bias_shape)], -1)

    with timer.env('pred_heads'):
        pred_outs = {'loc': [], 'conf': [], 'mask': [], 'priors': []}

        if cfg.use_instance_coeff:
            pred_outs['inst'] = []

        for idx, pred_layer in zip(self.selected_layers, self.prediction_layers):
            pred_x = outs[idx]

            if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_prototypes_as_features:
                # Scale the prototypes down to the current prediction layer's size and add it as inputs
                proto_downsampled = F.interpolate(proto_downsampled, size=outs[idx].size()[2:],
                                                  mode='bilinear', align_corners=False)
                pred_x = torch.cat([pred_x, proto_downsampled], dim=1)

            # A hack for the way dataparallel works
            if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]:
                pred_layer.parent = [self.prediction_layers[0]]

            p = pred_layer(pred_x)

            for k, v in p.items():
                pred_outs[k].append(v)

        for k, v in pred_outs.items():
            pred_outs[k] = torch.cat(v, -2)

    if proto_out is not None:
        pred_outs['proto'] = proto_out

    if self.training:
        # For the extra loss functions
        if cfg.use_class_existence_loss:
            pred_outs['classes'] = self.class_existence_fc(outs[-1].mean(dim=(2, 3)))

        if cfg.use_semantic_segmentation_loss:
            pred_outs['segm'] = self.semantic_seg_conv(outs[0])

        return pred_outs
    else:
        if cfg.use_sigmoid_focal_loss:
            # Note: even though conf[0] exists, this mode doesn't train it so don't use it
            pred_outs['conf'] = torch.sigmoid(pred_outs['conf'])
        elif cfg.use_objectness_score:
            # See focal_loss_sigmoid in multibox_loss.py for details
            objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])
            pred_outs['conf'][:, :, 1:] = objectness[:, :, None] * F.softmax(pred_outs['conf'][:, :, 1:], -1)
            pred_outs['conf'][:, :, 0] = 1 - objectness
        else:
            pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)

        return self.detect(pred_outs)
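# Note on the torch.cat(v, -2) above: each prediction head emits per-layer tensors
# shaped (batch, priors_in_that_layer, k), and concatenating along dim -2 stacks
# them into one (batch, total_priors, k) tensor. A toy illustration with
# hypothetical prior counts (3 anchors per cell on 69x69, 34x34, 17x17 grids):

import torch

loc_p3 = torch.randn(2, 69 * 69 * 3, 4)  # (2, 14283, 4)
loc_p4 = torch.randn(2, 34 * 34 * 3, 4)  # (2, 3468, 4)
loc_p5 = torch.randn(2, 17 * 17 * 3, 4)  # (2, 867, 4)

loc = torch.cat([loc_p3, loc_p4, loc_p5], dim=-2)
print(loc.shape)  # torch.Size([2, 18618, 4])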
def forward(self, x):
    """ The input should be of size [batch_size, 3, img_h, img_w] """
    # plt.imshow(x.permute(0, 2, 3, 1)[0, :, :, :].detach().cpu().numpy())
    # plt.savefig('visual_test/input.png')
    # plt.cla()
    with timer.env('backbone'):
        outs = self.backbone(x)

    if cfg.fpn is not None:
        with timer.env('fpn'):
            # Use backbone.selected_layers because we overwrote self.selected_layers
            outs = [outs[i] for i in cfg.backbone.selected_layers]
            outs = self.fpn(outs)

    proto_out = None

    if cfg.fpn_fusion is True:
        # Fuse the feature maps from all levels into a single map
        fusion_maps = self.fusion_module(outs[:self.fusion_layers])

    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        with timer.env('proto'):
            proto_x = x if self.proto_src is None else outs[self.proto_src]

            # FPN fusion
            if cfg.proto_src_fusion is True:
                proto_x = fusion_maps

            if cfg.cross_attention_fusion is True:
                P_query = outs[0]
                proto_x = P_query
                for layer in range(self.fusion_layers):
                    z = self.CALayer(x_query=P_query, x_key=outs[layer]) - P_query
                    proto_x = proto_x + z

            if self.num_grids > 0:
                grids = self.grid.repeat(proto_x.size(0), 1, 1, 1)
                proto_x = torch.cat([proto_x, grids], dim=1)

            if cfg.proto_coordconv:
                proto_x = self.addcoords(proto_x)

            proto_out = self.proto_net(proto_x)
            proto_out = cfg.mask_proto_prototype_activation(proto_out)

            if cfg.mask_proto_prototypes_as_features:
                # Clone here because we don't want to permute this, though idk if contiguous makes this unnecessary
                proto_downsampled = proto_out.clone()

                if cfg.mask_proto_prototypes_as_features_no_grad:
                    proto_downsampled = proto_out.detach()

            # Move the features last so the multiplication is easy
            proto_out = proto_out.permute(0, 2, 3, 1).contiguous()

            if cfg.mask_proto_bias:
                bias_shape = [x for x in proto_out.size()]
                bias_shape[-1] = 1
                proto_out = torch.cat([proto_out, torch.ones(*bias_shape)], -1)

    with timer.env('pred_heads'):
        pred_outs = {'loc': [], 'conf': [], 'mask': [], 'priors': []}

        if cfg.use_instance_coeff:
            pred_outs['inst'] = []

        for idx, pred_layer in zip(self.selected_layers, self.prediction_layers):
            pred_x = outs[idx]

            if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_prototypes_as_features:
                # Scale the prototypes down to the current prediction layer's size and add it as inputs
                proto_downsampled = F.interpolate(proto_downsampled, size=outs[idx].size()[2:],
                                                  mode='bilinear', align_corners=False)
                pred_x = torch.cat([pred_x, proto_downsampled], dim=1)

            # A hack for the way dataparallel works
            if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]:
                pred_layer.parent = [self.prediction_layers[0]]

            if cfg.ins_coordconv:
                pred_x = self.addcoords(pred_x)

            p = pred_layer(pred_x)

            for k, v in p.items():
                pred_outs[k].append(v)

        # ===revised===
        num_priors = []
        for k, v in pred_outs.items():
            if k == 'loc':
                for _v in v:
                    num_priors.append(_v.size(1))
            pred_outs[k] = torch.cat(v, -2)
        pred_outs['layer'] = num_priors

    if proto_out is not None:
        pred_outs['proto'] = proto_out

    if self.training:
        # For the extra loss functions
        if cfg.use_class_existence_loss:
            pred_outs['classes'] = self.class_existence_fc(outs[-1].mean(dim=(2, 3)))

        with timer.env('segm'):
            if cfg.use_semantic_segmentation_loss:
                sem_in = None
                if cfg.sem_src_fusion is True:
                    sem_in = fusion_maps
                elif cfg.sem_lincomb is True:
                    sem_in = outs[-1]
                if cfg.sem_coordconv:
                    sem_in = self.addcoords(sem_in)
                pred_outs['segm'] = self.semantic_seg_conv(sem_in)
                # pred_outs['segm'] = self.semantic_seg_conv(outs[-1])  # lincomb version

        return pred_outs
    else:
        if cfg.use_sigmoid_focal_loss:
            # Note: even though conf[0] exists, this mode doesn't train it so don't use it
            pred_outs['conf'] = torch.sigmoid(pred_outs['conf'])
        elif cfg.use_objectness_score:
            # See focal_loss_sigmoid in multibox_loss.py for details
            objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])
            pred_outs['conf'][:, :, 1:] = objectness[:, :, None] * F.softmax(pred_outs['conf'][:, :, 1:], -1)
            pred_outs['conf'][:, :, 0] = 1 - objectness
        else:
            pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)

        if cfg.use_sem_output is True:
            sem_in = None
            if cfg.sem_src_fusion is True:
                sem_in = fusion_maps
            elif cfg.sem_lincomb is True:
                sem_in = outs[-1]
            if cfg.sem_coordconv:
                sem_in = self.addcoords(sem_in)
            pred_outs['segm'] = self.semantic_seg_conv(sem_in)

        return self.detect(pred_outs)
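# The addcoords calls above follow the CoordConv idea (Liu et al., 2018): append
# normalized x/y coordinate channels so later convolutions can condition on
# position. A minimal sketch of such a layer, assuming the standard two-channel
# formulation (the repo's actual AddCoords module may differ, e.g. by adding an
# extra radius channel):

import torch
import torch.nn as nn

class AddCoords(nn.Module):
    """Append normalized x/y coordinate channels to a feature map."""

    def forward(self, x):
        n, _, h, w = x.size()
        # Row and column coordinates normalized to [-1, 1]
        ys = torch.linspace(-1, 1, h, device=x.device).view(1, 1, h, 1).expand(n, 1, h, w)
        xs = torch.linspace(-1, 1, w, device=x.device).view(1, 1, 1, w).expand(n, 1, h, w)
        return torch.cat([x, xs, ys], dim=1)  # (n, c + 2, h, w)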