def detect_loss(self, cls_score, rois_label, bbox_pred, rois_target,
                rois_inside_ws, rois_outside_ws):
    # classification loss
    RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

    # bounding box regression L1 loss
    RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                     rois_inside_ws, rois_outside_ws)

    return RCNN_loss_cls, RCNN_loss_bbox
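# _smooth_l1_loss is called throughout this section but never defined in it.
# A minimal sketch follows, modeled on the helper found in
# jwyang/faster-rcnn.pytorch-style codebases; treat the exact weighting and
# reduction as an assumption rather than the source's verbatim implementation.
import torch

def _smooth_l1_loss(bbox_pred, bbox_targets, bbox_inside_weights,
                    bbox_outside_weights, sigma=1.0, dim=[1]):
    sigma_2 = sigma ** 2
    box_diff = bbox_pred - bbox_targets
    in_box_diff = bbox_inside_weights * box_diff        # mask out non-fg rois
    abs_in_box_diff = torch.abs(in_box_diff)
    smoothL1_sign = (abs_in_box_diff < 1. / sigma_2).detach().float()
    in_loss_box = torch.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \
        + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign)
    out_loss_box = bbox_outside_weights * in_loss_box   # normalization weights
    loss_box = out_loss_box
    for i in sorted(dim, reverse=True):
        loss_box = loss_box.sum(i)
    return loss_box.mean()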
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    batch_size = base_feat.size(0)

    # return feature map after conv+relu layer
    rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True)

    # get rpn classification score
    rpn_cls_score = self.RPN_cls_score(rpn_conv1)
    rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
    rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

    # get rpn offsets to the anchor boxes
    rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)

    # proposal layer
    cfg_key = 'TRAIN' if self.training else 'TEST'
    rois = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data,
                              im_info, cfg_key))

    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0

    # generate training labels and build the rpn loss
    if self.training:
        assert gt_boxes is not None
        rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes,
                                           im_info, num_boxes))

        # compute classification loss
        rpn_cls_score = rpn_cls_score_reshape.permute(
            0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
        rpn_label = rpn_data[0].view(batch_size, -1)

        rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
        rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep)
        rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
        rpn_label = Variable(rpn_label.long())
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
        fg_cnt = torch.sum(rpn_label.data.ne(0))

        rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]

        # compute bbox regression loss
        rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights)
        rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights)
        rpn_bbox_targets = Variable(rpn_bbox_targets)

        self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets,
                                            rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights,
                                            sigma=3, dim=[1, 2, 3])

    return rois, self.rpn_loss_cls, self.rpn_loss_box
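# self.reshape is assumed to be the usual RPN static helper that regroups the
# channel dimension so softmax can run over the 2 (bg/fg) scores per anchor.
# A minimal sketch of that class-body fragment under this assumption:
@staticmethod
def reshape(x, d):
    input_shape = x.size()
    # (N, C, H, W) -> (N, d, C*H/d, W) without copying data
    x = x.view(input_shape[0], int(d),
               int(float(input_shape[1] * input_shape[2]) / float(d)),
               input_shape[3])
    return x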
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    batch_size = im_data.size(0)

    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(im_data)

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info,
                                                      gt_boxes, num_boxes)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1],
                               grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, RCNN_loss_bbox, rois_label
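# The torch.gather above picks, for each roi, the 4 regression outputs of its
# ground-truth class. A tiny standalone check of that indexing (sizes and names
# here are illustrative, not from the source):
import torch

n_rois, n_classes = 3, 5
bbox_pred = torch.arange(n_rois * n_classes * 4, dtype=torch.float32).view(n_rois, -1)
rois_label = torch.tensor([2, 0, 4])

view = bbox_pred.view(n_rois, n_classes, 4)
idx = rois_label.view(n_rois, 1, 1).expand(n_rois, 1, 4)
selected = torch.gather(view, 1, idx).squeeze(1)      # (n_rois, 4)
assert torch.equal(selected[0], view[0, 2])           # roi 0 -> class 2's box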
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    batch_size = im_data.size(0)

    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(im_data)

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        base_feat, im_info, gt_boxes, num_boxes)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1],
                               grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            # pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
            # add different sample methods
            pooled_feat = F.max_pool2d(pooled_feat, cfg.ALIGN_SAMPLE_NUM,
                                       cfg.ALIGN_SAMPLE_NUM)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, RCNN_loss_bbox, rois_label
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    batch_size = base_feat.size(0)

    # return feature map after conv+relu layer
    rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True)

    # get rpn classification score
    rpn_cls_score = self.RPN_cls_score(rpn_conv1)
    rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1)
    rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

    # get rpn offsets to the anchor boxes
    rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)

    # proposal layer
    cfg_key = 'TRAIN' if self.training else 'TEST'
    rois = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data,
                              im_info, cfg_key))

    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0

    # generate training labels and build the rpn loss
    if self.training:
        assert gt_boxes is not None
        rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes,
                                           im_info, num_boxes))

        # compute classification loss
        rpn_cls_score = rpn_cls_score_reshape.permute(
            0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
        rpn_label = rpn_data[0].view(batch_size, -1)

        rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
        rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep)
        rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
        rpn_label = Variable(rpn_label.long())
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
        fg_cnt = torch.sum(rpn_label.data.ne(0))

        rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]

        # compute bbox regression loss
        rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights)
        rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights)
        rpn_bbox_targets = Variable(rpn_bbox_targets)

        self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets,
                                            rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights,
                                            sigma=3, dim=[1, 2, 3])

    return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, im_data, im_info, gt_boxes, num_boxes, domain=None, l=0,
            loss_start=False):
    batch_size = im_data.size(0)

    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(im_data)

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        base_feat, im_info, gt_boxes, num_boxes)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes,
                                             domain, self.transfer)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws, domain_label = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1],
                               grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # ----------------------- transfer learning ---------------------------- #
    # print(domain)
    dom_loss = 0
    valid_index = torch.zeros(0)
    # baseline: transfer == False
    if self.training and self.transfer:
        if self.grl:
            dom_input = ReverseLayerF.apply(pooled_feat, l)
        else:
            dom_input = pooled_feat
        dom_pred = self._domain_classify(dom_input)
        domain_label = Variable(domain_label.cpu().cuda().view(-1).long())

        # ############ process transfer loss weight ############
        if loss_start:
            p_target = F.softmax(dom_pred * self.transfer_gamma, 1)[:, 0]
            domain_label.data = domain_label.data.type(torch.FloatTensor).cuda()
            l_target = domain_label
            self.weight = p_target ** l_target
            # drop zero weights
            valid_index = torch.nonzero(self.weight.data).cuda()
            if len(valid_index.size()) == 0:
                valid_index = torch.zeros(1, 1).type(torch.LongTensor).cuda()
            valid_index = valid_index.squeeze(1)
        # ######################################################

        # ############ domain loss selection ############
        else:
            ids = torch.LongTensor(1).cuda()
            if self.transfer_select == 'RANDOM':
                # randomly select an eighth of the rois
                perm = torch.randperm(rois.size(1))
                ids = perm[:rois.size(1) // 8].cuda()
            elif self.transfer_select == 'CONDITION':
                # select positive samples and predicted positive samples
                ids = torch.arange(0, rois.size(1) // 8)
                ids = torch.Tensor.long(ids).cuda()
            elif self.transfer_select == 'POSITIVE':
                # select all positive samples
                ids = torch.nonzero(rois_label.data)
                ids = torch.squeeze(ids).cuda()
            elif self.transfer_select == 'BALANCE':
                # select all positive samples plus as many negatives
                ids_p = torch.nonzero(rois_label.data)
                ids_p = torch.squeeze(ids_p).cuda()
                ids_n = (rois_label.data == 0).nonzero()
                ids_n = torch.squeeze(ids_n).cuda()
                ids_n = ids_n[:ids_p.size(0)]
                ids = torch.cat((ids_p, ids_n), 0).cuda()

            if self.transfer_select == 'ALL':
                # select all samples
                dom_pred_loss = dom_pred
                dom_label_loss = domain_label
            else:
                dom_pred_loss = dom_pred[ids]
                dom_label_loss = domain_label[ids]
            # ########## domain loss selection done ##########

            dom_loss = F.cross_entropy(dom_pred_loss, dom_label_loss)
            dom_loss = dom_loss * (self.transfer_weight.expand_as(dom_loss))
    # --------------------- transfer learning done ------------------------- #

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        if self.transfer and loss_start:
            rois_label_loss = torch.eye(self.n_classes)[rois_label.data.cpu()].type(torch.FloatTensor)
            rois_label_loss = Variable(rois_label_loss.cuda())
            rois_label_loss = rois_label_loss[valid_index]
            weight_cls_loss = self.weight.view(rois_label.size(0), 1).repeat(1, self.n_classes)
            weight_cls_loss = weight_cls_loss[valid_index]
            cls_score_loss = cls_score[valid_index]
            RCNN_loss_cls = F.binary_cross_entropy_with_logits(
                cls_score_loss, rois_label_loss, weight_cls_loss)

            # bounding box regression L1 loss
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                             rois_inside_ws, rois_outside_ws,
                                             True, True, self.weight, valid_index)
        else:
            RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

            # bounding box regression L1 loss
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                             rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, RCNN_loss_bbox, rois_label, dom_loss
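# ReverseLayerF is referenced above but not defined in this section. It is
# conventionally the DANN-style gradient reversal layer: identity on the
# forward pass, gradient flipped and scaled by the coefficient on the backward
# pass. A minimal sketch under that assumption:
import torch

class ReverseLayerF(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)          # identity on the forward pass

    @staticmethod
    def backward(ctx, grad_output):
        # flip and scale gradients flowing back into the feature extractor
        return grad_output.neg() * ctx.alpha, None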
def forward(self, im_data, im_info, gt_boxes, num_boxes,
            tgt_im_data, tgt_im_info, tgt_gt_boxes, tgt_num_boxes):
    batch_size = im_data.size(0)

    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data
    tgt_im_info = tgt_im_info.data
    tgt_gt_boxes = tgt_gt_boxes.data
    tgt_num_boxes = tgt_num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(im_data)
    tgt_base_feat = self.RCNN_base(tgt_im_data)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        # feed base feature map to RPN to obtain rois
        rois, rpn_loss_cls, rpn_loss_bbox, rpn_cls_prob, rois_select = \
            self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)
        tgt_rois, tgt_rpn_loss_cls, tgt_rpn_loss_bbox, tgt_rpn_cls_prob, tgt_rois_select = \
            self.RCNN_rpn(tgt_base_feat, tgt_im_info, tgt_gt_boxes, tgt_num_boxes)

        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))

        tgt_roi_data = self.RCNN_proposal_target(tgt_rois, tgt_gt_boxes, tgt_num_boxes)
        tgt_rois, tgt_rois_label, tgt_rois_target, tgt_rois_inside_ws, tgt_rois_outside_ws = tgt_roi_data
        tgt_rois_label = Variable(tgt_rois_label.view(-1).long())
        tgt_rois_target = Variable(tgt_rois_target.view(-1, tgt_rois_target.size(2)))
        tgt_rois_inside_ws = Variable(tgt_rois_inside_ws.view(-1, tgt_rois_inside_ws.size(2)))
        tgt_rois_outside_ws = Variable(tgt_rois_outside_ws.view(-1, tgt_rois_outside_ws.size(2)))
    else:
        # feed base feature map to RPN to obtain rois
        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
            base_feat, im_info, gt_boxes, num_boxes)
        tgt_rois, tgt_rpn_loss_cls, tgt_rpn_loss_bbox = self.RCNN_rpn(
            tgt_base_feat, tgt_im_info, tgt_gt_boxes, tgt_num_boxes)

        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
        tgt_rois_label = None
        tgt_rois_target = None
        tgt_rois_inside_ws = None
        tgt_rois_outside_ws = None
        tgt_rpn_loss_cls = 0
        tgt_rpn_loss_bbox = 0

    rois = Variable(rois)
    tgt_rois = Variable(tgt_rois)

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # for RCNN
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1],
                               grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)

        tgt_grid_xy = _affine_grid_gen(tgt_rois.view(-1, 5),
                                       tgt_base_feat.size()[2:], self.grid_size)
        tgt_grid_yx = torch.stack([tgt_grid_xy.data[:, :, :, 1],
                                   tgt_grid_xy.data[:, :, :, 0]], 3).contiguous()
        tgt_pooled_feat = self.RCNN_roi_crop(tgt_base_feat,
                                             Variable(tgt_grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            tgt_pooled_feat = F.max_pool2d(tgt_pooled_feat, 2, 2)

        # for RPN adaptive loss
        if self.training:
            grid_xy_ = _affine_grid_gen(rois_select, base_feat.size()[2:],
                                        self.grid_size)
            grid_yx_ = torch.stack([grid_xy_.data[:, :, :, 1],
                                    grid_xy_.data[:, :, :, 0]], 3).contiguous()
            pooled_feat_ = self.RCNN_roi_crop(base_feat, Variable(grid_yx_).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                pooled_feat_ = F.max_pool2d(pooled_feat_, 2, 2)

            tgt_grid_xy_ = _affine_grid_gen(tgt_rois_select,
                                            tgt_base_feat.size()[2:], self.grid_size)
            tgt_grid_yx_ = torch.stack([tgt_grid_xy_.data[:, :, :, 1],
                                        tgt_grid_xy_.data[:, :, :, 0]], 3).contiguous()
            tgt_pooled_feat_ = self.RCNN_roi_crop(tgt_base_feat,
                                                  Variable(tgt_grid_yx_).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                tgt_pooled_feat_ = F.max_pool2d(tgt_pooled_feat_, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        # for RCNN
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
        tgt_pooled_feat = self.RCNN_roi_align(tgt_base_feat, tgt_rois.view(-1, 5))
        # for RPN adaptive loss
        if self.training:
            pooled_feat_ = self.RCNN_roi_align(base_feat, rois_select)
            tgt_pooled_feat_ = self.RCNN_roi_align(tgt_base_feat, tgt_rois_select)
    elif cfg.POOLING_MODE == 'pool':
        # for RCNN
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))
        tgt_pooled_feat = self.RCNN_roi_pool(tgt_base_feat, tgt_rois.view(-1, 5))
        # for RPN adaptive loss
        if self.training:
            pooled_feat_ = self.RCNN_roi_pool(base_feat, rois_select)
            tgt_pooled_feat_ = self.RCNN_roi_pool(tgt_base_feat, tgt_rois_select)

    # get the adaptive feature for RPN
    if self.training:
        rpn_adapt_feat = self.rpn_adapt_feat(
            pooled_feat_.view(pooled_feat_.size(0), -1))
        tgt_rpn_adapt_feat = self.rpn_adapt_feat(
            tgt_pooled_feat_.view(tgt_pooled_feat_.size(0), -1))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)
    tgt_pooled_feat = self._head_to_tail(tgt_pooled_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    tgt_bbox_pred = self.RCNN_bbox_pred(tgt_pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

        tgt_bbox_pred_view = tgt_bbox_pred.view(
            tgt_bbox_pred.size(0), int(tgt_bbox_pred.size(1) / 4), 4)
        tgt_bbox_pred_select = torch.gather(
            tgt_bbox_pred_view, 1,
            tgt_rois_label.view(tgt_rois_label.size(0), 1, 1).expand(tgt_rois_label.size(0), 1, 4))
        tgt_bbox_pred = tgt_bbox_pred_select.squeeze(1)

    # compute object classification probability
    adapt_feat = self.RCNN_adapt_feat(pooled_feat)
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)
    tgt_adapt_feat = self.RCNN_adapt_feat(tgt_pooled_feat)
    tgt_cls_score = self.RCNN_cls_score(tgt_pooled_feat)
    tgt_cls_prob = F.softmax(tgt_cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    tgt_RCNN_loss_cls = 0
    tgt_RCNN_loss_bbox = 0
    RCNN_loss_intra = 0
    RCNN_loss_inter = 0
    RPN_loss_intra = 0
    RPN_loss_inter = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        tgt_RCNN_loss_cls = F.cross_entropy(tgt_cls_score, tgt_rois_label)

        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)
        tgt_RCNN_loss_bbox = _smooth_l1_loss(tgt_bbox_pred, tgt_rois_target,
                                             tgt_rois_inside_ws, tgt_rois_outside_ws)

        # intra-class and inter-class adaptation loss:
        # pull same classes of the source and target domains together and
        # push different classes apart
        if self.mode == 'adapt':
            RCNN_loss_intra, RCNN_loss_inter = self.adaptive_loss(
                adapt_feat, cls_prob, tgt_adapt_feat, tgt_cls_prob, batch_size)
        # use gcn to cluster the representation of every class
        elif self.mode == 'gcn_adapt':
            RCNN_loss_intra, RCNN_loss_inter = self.gcn_adaptive_loss(
                adapt_feat, cls_prob, rois, tgt_adapt_feat, tgt_cls_prob,
                tgt_rois, batch_size)

        # intra-class and inter-class losses for RPN:
        # pull same classes of the source and target domains together and
        # push different classes apart
        if self.rpn_mode == 'adapt':
            RPN_loss_intra, RPN_loss_inter = self.adaptive_loss_rpn(
                rpn_adapt_feat, rpn_cls_prob, tgt_rpn_adapt_feat,
                tgt_rpn_cls_prob, batch_size)
        # use gcn to cluster the representation of every class
        elif self.rpn_mode == 'gcn_adapt':
            RPN_loss_intra, RPN_loss_inter = self.gcn_adaptive_loss(
                rpn_adapt_feat, rpn_cls_prob, rois, tgt_rpn_adapt_feat,
                tgt_rpn_cls_prob, tgt_rois, batch_size)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)
    tgt_cls_prob = tgt_cls_prob.view(batch_size, tgt_rois.size(1), -1)
    tgt_bbox_pred = tgt_bbox_pred.view(batch_size, tgt_rois.size(1), -1)

    if self.training:
        return rois, tgt_rois, cls_prob, tgt_cls_prob, bbox_pred, tgt_bbox_pred, \
            rpn_loss_cls.view(-1), tgt_rpn_loss_cls.view(-1), \
            rpn_loss_bbox.view(-1), tgt_rpn_loss_bbox.view(-1), \
            RCNN_loss_cls.view(-1), tgt_RCNN_loss_cls.view(-1), \
            RCNN_loss_bbox.view(-1), tgt_RCNN_loss_bbox.view(-1), \
            RCNN_loss_intra.view(-1), RCNN_loss_inter.view(-1), \
            rois_label, tgt_rois_label, \
            RPN_loss_intra.view(-1), RPN_loss_inter.view(-1)
    else:
        return rois, tgt_rois, cls_prob, tgt_cls_prob, bbox_pred, tgt_bbox_pred, \
            rpn_loss_cls, tgt_rpn_loss_cls, rpn_loss_bbox, tgt_rpn_loss_bbox, \
            RCNN_loss_cls, tgt_RCNN_loss_cls, RCNN_loss_bbox, tgt_RCNN_loss_bbox, \
            RCNN_loss_intra, RCNN_loss_inter, rois_label, tgt_rois_label, \
            RPN_loss_intra, RPN_loss_inter
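# adaptive_loss / adaptive_loss_rpn are not shown in this section. The comments
# above describe pulling same-class features of source and target together and
# pushing different classes apart; a hypothetical prototype-based sketch of
# that idea follows. Every name and the margin value are assumptions for
# illustration, not the source implementation.
import torch
import torch.nn.functional as F

def adaptive_loss_sketch(feat, cls_prob, tgt_feat, tgt_cls_prob, margin=1.0):
    # soft class prototypes: probability-weighted mean feature per class
    proto = cls_prob.t() @ feat / (cls_prob.sum(0, keepdim=True).t() + 1e-6)
    tgt_proto = tgt_cls_prob.t() @ tgt_feat / (tgt_cls_prob.sum(0, keepdim=True).t() + 1e-6)

    # intra-class: same class across domains should coincide
    loss_intra = (proto - tgt_proto).pow(2).sum(1).mean()

    # inter-class: different classes should stay at least `margin` apart
    dist = torch.cdist(proto, tgt_proto)                 # (C, C) pairwise distances
    off_diag = ~torch.eye(dist.size(0), dtype=torch.bool, device=dist.device)
    loss_inter = F.relu(margin - dist[off_diag]).pow(2).mean()
    return loss_intra, loss_inter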
def forward(self, im_data, query, im_info, gt_boxes, num_boxes, alpha):
    batch_size = im_data.size(0)

    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    detect_feat = self.RCNN_base(im_data)
    query_feat = self.RCNN_base_sketch(query)

    # default to None so the return statement is valid for every model type
    c_weight = None
    attention_map = None

    if self.model_type == "match_net":
        rpn_feat, act_feat, act_aim, c_weight = self.match_net(detect_feat, query_feat)
        c_weight = None
    if self.model_type == "attention":
        act_feat, act_aim, attention_map = self.attention_net(detect_feat, query_feat)
        act_feat = torch.cat([act_feat, detect_feat], dim=1)
        act_feat = self.projection(act_feat)
    if self.model_type == "basic":
        act_feat = detect_feat
        act_aim = query_feat

    if self.model_type in ["basic", "attention"]:
        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
            act_feat, im_info, gt_boxes, num_boxes)
    if self.model_type == "match_net":
        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
            rpn_feat, im_info, gt_boxes, num_boxes)
        attention_map = None

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        # if True:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        margin_loss = 0
        rpn_loss_bbox = 0
        score_label = None

    rois = Variable(rois)

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(act_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(act_feat, rois.view(-1, 5))

    pooled_feat = self._head_to_tail(pooled_feat)
    query_feat = self._head_to_tail(act_aim)

    batch_size = query_feat.shape[0]

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)

    # pair every roi feature with the query feature
    pooled_feat = pooled_feat.view(batch_size, rois.size(1), -1)
    query_feat = query_feat.unsqueeze(1).repeat(1, rois.size(1), 1)
    pooled_feat = torch.cat((pooled_feat.expand_as(query_feat), query_feat),
                            dim=2).view(-1, 4096)

    # compute object classification probability
    score = self.RCNN_cls_score(pooled_feat)
    score_prob = F.softmax(score, 1)[:, 1]

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        score_label = rois_label.view(batch_size, -1).float()
        gt_map = torch.abs(score_label.unsqueeze(1) - score_label.unsqueeze(-1))
        score_prob = score_prob.view(batch_size, -1)
        pr_map = torch.abs(score_prob.unsqueeze(1) - score_prob.unsqueeze(-1))
        # map pairwise label gaps {0, 1} to ranking targets {-1, +1}
        target = -((gt_map - 1) ** 2) + gt_map

        RCNN_loss_cls = F.cross_entropy(score, rois_label)
        margin_loss = 3 * self.triplet_loss(pr_map, gt_map, target)
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

    cls_prob = score_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, margin_loss, RCNN_loss_bbox, rois_label, c_weight, attention_map
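# triplet_loss above is undefined here; its call signature (input1, input2,
# target) matches nn.MarginRankingLoss, and the mapping 0 -> -1, 1 -> +1 is
# exactly the target convention that loss expects. A small self-contained demo
# under that assumption (the margin value is hypothetical):
import torch
import torch.nn as nn

loss_fn = nn.MarginRankingLoss(margin=0.3)
score_label = torch.tensor([[1., 0., 1.]])            # (B, N) binary roi labels
score_prob = torch.rand(1, 3)                         # (B, N) predicted fg probs
gt_map = torch.abs(score_label.unsqueeze(1) - score_label.unsqueeze(-1))
pr_map = torch.abs(score_prob.unsqueeze(1) - score_prob.unsqueeze(-1))
target = -((gt_map - 1) ** 2) + gt_map                # 0 -> -1, 1 -> +1
margin_loss = loss_fn(pr_map, gt_map, target)         # pairwise ranking penalty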
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    batch_size = im_data.size(0)

    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(im_data)

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info,
                                                      gt_boxes, num_boxes)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # update 20191026: get the index of nodes in the graph for rois
    # (default: batch_size = 1). if we want to change batch_size, we should
    # consider changing roi2gt_assignment[0], roi_part_match[0],
    # roi_part_match_overlap[0] and so on.
    # part_threshold = 0.25
    #
    # # first, calculate the overlaps among rois and gt, get the max roi for each gt (node_cls)
    overlaps = bbox_overlaps_batch(rois, rois)[0]
    N_node, _ = overlaps.shape
    node_list = [i for i in range(N_node)]
    # binarize the overlaps and zero the diagonal
    for j in range(N_node):
        for k in range(N_node):
            if overlaps[j][k] != 0:
                overlaps[j][k] = 1
            if k == j:
                overlaps[j][k] = 0
    idx_subgraph, vertex_subgraph = subgraph_split(overlaps)
    # max_overlaps_rois2gt, roi2gt_assignment = torch.max(overlaps, 1)
    #
    # # second, calculate the overlaps among rois and rois_select,
    # # using a threshold to select a roi for each rois_select (node_part)
    #
    # rois_cls_tmp = rois[:, roi2gt_assignment[0], :]
    # rois_cls_num = np.argwhere(gt_boxes[:, :, 4].cpu().data.numpy()[0] != 0).shape[0]
    # rois_cls_tmp = rois_cls_tmp[:, :rois_cls_num, :]
    # rois_cls = rois_cls_tmp.new(rois_cls_tmp.size(0), rois_cls_tmp.size(1), 5).zero_()
    # rois_cls[:, :, :4] = rois_cls_tmp[:, :, 1:5]
    # rois_cls[:, :, 4] = rois_cls_tmp[:, :, 0]
    #
    # # roi_cls_idx_list is the idx related from rois_cls to rois
    # roi_cls_idx_list = roi2gt_assignment[0][:rois_cls_num]
    #
    # overlaps = bbox_overlaps_batch(rois, rois_cls)
    # max_overlaps_rois2cls, roi2cls_assignment = torch.max(overlaps, 2)
    #
    # roi_part_match_overlap = max_overlaps_rois2cls.cpu().data.numpy()
    # roi_part_match = roi2cls_assignment.cpu().data.numpy()
    #
    # # roi_part_idx_list is the idx related from rois_part to rois
    # roi_part_idx_list = []
    # roi_part_match_idx = np.unique(roi_part_match[0])
    # for roi_cls_idx in roi_part_match_idx:
    #     match_idx_tmp = np.transpose(np.argwhere(roi_part_match[0] == roi_cls_idx))[0]
    #     match_overlap_tmp = roi_part_match_overlap[0][match_idx_tmp]
    #     # use threshold to select rois_part
    #     match_idx_tmp_select = np.transpose(np.argwhere(match_overlap_tmp > part_threshold))[0]
    #     match_idx_tmp = match_idx_tmp[match_idx_tmp_select]
    #     roi_part_idx_list.append(torch.from_numpy(match_idx_tmp))

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1],
                               grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)

    # # update 20191027: build graph for rois based on index (default: batch_size = 1)
    # adj_jud = np.zeros((0))
    # adj_rois = torch.zeros(0).cuda().long()
    # for i in range(roi_cls_idx_list.shape[0]):
    #     adj_jud = np.concatenate((adj_jud, [1]))
    #     adj_rois = torch.cat((adj_rois, roi_cls_idx_list[i:i+1]))
    #     try:
    #         adj_jud = np.concatenate((adj_jud, np.zeros((roi_part_idx_list[i].shape[0]))))
    #         adj_rois = torch.cat((adj_rois, roi_part_idx_list[i].cuda()))
    #     except IndexError:
    #         print('IndexError happened, continue')
    #         continue
    #
    # node_cls_idx = np.transpose(np.argwhere(adj_jud == 1))[0]
    #
    # adj_matrix_bin = np.zeros((len(adj_jud), len(adj_jud)))
    #
    # # link edges from node_cls to node_cls
    # for k in range(len(node_cls_idx)-1):
    #     idx_node_cls_1 = node_cls_idx[k]
    #     idx_node_cls_2 = node_cls_idx[k + 1]
    #     adj_matrix_bin[idx_node_cls_1, idx_node_cls_2] = 1
    #     adj_matrix_bin[idx_node_cls_2, idx_node_cls_1] = 1
    #
    # # link edges from node_cls to its related node_part
    # for k in range(len(node_cls_idx)-1):
    #     idx_start = node_cls_idx[k]
    #     idx_end = node_cls_idx[k + 1]
    #     for s in range(idx_start, idx_end):
    #         for t in range(idx_start, idx_end):
    #             if s == t:
    #                 adj_matrix_bin[s, t] = 0
    #             else:
    #                 adj_matrix_bin[s, t] = 1
    #
    # # calculate adj_matrix from adj_matrix_bin; the weights on edges are
    # # the cosine similarity between nodes
    # adj_matrix = np.zeros((len(adj_jud), len(adj_jud)))
    #
    # cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
    #
    # for s in range(len(adj_jud)):
    #     for t in range(len(adj_jud)):
    #         if adj_matrix_bin[s, t] == 1:
    #             node_feat_s = pooled_feat[adj_rois[s], :]
    #             node_feat_t = pooled_feat[adj_rois[t], :]
    #             adj_matrix[s, t] = cos(node_feat_s, node_feat_t)
    #         else:
    #             adj_matrix[s, t] = 0
    #
    # adj_matrix = torch.from_numpy(adj_matrix).float().cuda()
    #
    # pooled_feat[adj_rois, :] = F.relu(self.gcn1(pooled_feat[adj_rois, :], adj_matrix))
    # pooled_feat[adj_rois, :] = F.relu(self.gcn2(pooled_feat[adj_rois, :], adj_matrix))

    # adj_jud = np.zeros((N_node, N_node))
    adj_matrix = np.zeros((N_node, N_node))
    # for k in range(idx_subgraph):
    #     idx_k = np.transpose(np.argwhere(vertex_subgraph == k))[0]
    #     for s in range(idx_k.shape[0]):
    #         for t in range(idx_k.shape[0]):
    #             if s == t:
    #                 adj_jud[s, t] = 0
    #             else:
    #                 adj_jud[s, t] = 1

    # weight edges with the cosine similarity between node features
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
    for s in range(N_node):
        for t in range(N_node):
            # if adj_jud[s, t] != 0:
            if s != t:
                node_feat_s = pooled_feat[s, :]
                node_feat_t = pooled_feat[t, :]
                adj_matrix[s, t] = cos(node_feat_s, node_feat_t).item()
    adj_matrix = torch.from_numpy(adj_matrix).float().cuda()

    pooled_feat = F.relu(self.gcn1(pooled_feat, adj_matrix))
    pooled_feat = F.relu(self.gcn2(pooled_feat, adj_matrix))

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    # update 2019-6-17: fix the bug for dimension specified as 0
    if self.training:
        rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0)
        rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0)
        RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0)
        RCNN_loss_bbox = torch.unsqueeze(RCNN_loss_bbox, 0)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, RCNN_loss_bbox, rois_label
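# gcn1/gcn2 above are referenced but not defined in this section. A minimal
# Kipf-style graph convolution consistent with the call gcn(features, adj);
# the layer sizes, missing bias, and initialization are assumptions:
import torch
import torch.nn as nn

class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features):
        super(GraphConvolution, self).__init__()
        self.weight = nn.Parameter(torch.empty(in_features, out_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight)   # project node features
        return torch.mm(adj, support)        # aggregate over weighted neighbors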
def forward(self, im_data_list, im_info_list, gt_boxes_list, num_boxes_list,
            average_shot=None, mean_class_attentions=None):
    # return attentions for testing
    if average_shot:
        prn_data = im_data_list[0]  # len(metaclass)*4*224*224
        attentions = self.prn_network(prn_data)
        return attentions

    # extract attentions for training
    if self.meta_train and self.training:
        prn_data = im_data_list[0]  # len(metaclass)*4*224*224
        # feed prn data to prn_network
        attentions = self.prn_network(prn_data)
        prn_cls = im_info_list[0]  # len(metaclass)

    im_data = im_data_list[-1]
    im_info = im_info_list[-1]
    gt_boxes = gt_boxes_list[-1]
    num_boxes = num_boxes_list[-1]

    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(self.rcnn_conv1(im_data))

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info,
                                                      gt_boxes, num_boxes)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1],
                               grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))  # (b*128)*1024*7*7
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)  # (b*128)*2048

    # meta training phase
    if self.meta_train:
        rcnn_loss_cls = []
        rcnn_loss_bbox = []
        # pooled feature maps need channel-wise multiplication with the
        # corresponding class's attentions for every roi of the image
        for b in range(batch_size):
            zero = Variable(torch.FloatTensor([0]).cuda())
            proposal_labels = rois_label[b * 128:(b + 1) * 128].data.cpu().numpy()[0]
            # the unique roi labels of the input image
            unique_labels = list(np.unique(proposal_labels))

            for i in range(attentions.size(0)):  # attentions: len(attentions)*2048
                if prn_cls[i].numpy()[0] + 1 not in unique_labels:
                    rcnn_loss_cls.append(zero)
                    rcnn_loss_bbox.append(zero)
                    continue

                roi_feat = pooled_feat[b * cfg.TRAIN.BATCH_SIZE:(b + 1) * cfg.TRAIN.BATCH_SIZE, :]  # 128*2048
                cls_feat = attentions[i].view(1, -1, 1, 1)  # 1*2048*1*1
                diff_feat = roi_feat - cls_feat.squeeze()
                # subtraction + correlation: [bs, 2048]
                corr_feat = F.conv2d(roi_feat.unsqueeze(-1).unsqueeze(-1),
                                     cls_feat.permute(1, 0, 2, 3),
                                     groups=2048).squeeze()
                channel_wise_feat = torch.cat((self.corr_fc(corr_feat),
                                               self.diff_fc(diff_feat)), dim=1)
                # combined with the roi feature: [bs, 2048 * 2]
                channel_wise_feat = torch.cat((channel_wise_feat, roi_feat), dim=1)

                # compute object bounding box regression
                bbox_pred = self.RCNN_bbox_pred(channel_wise_feat)  # 128*4
                if self.training and not self.class_agnostic:
                    # select the corresponding columns according to roi labels
                    bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                                    int(bbox_pred.size(1) / 4), 4)
                    batch_rois_label = rois_label[b * cfg.TRAIN.BATCH_SIZE:(b + 1) * cfg.TRAIN.BATCH_SIZE]
                    bbox_pred_select = torch.gather(
                        bbox_pred_view, 1,
                        batch_rois_label.view(batch_rois_label.size(0), 1, 1).expand(batch_rois_label.size(0), 1, 4))
                    bbox_pred = bbox_pred_select.squeeze(1)

                # compute object classification probability
                cls_score = self.RCNN_cls_score(channel_wise_feat)

                if self.training:
                    # classification loss
                    RCNN_loss_cls = F.cross_entropy(cls_score,
                                                    rois_label[b * 128:(b + 1) * 128])
                    rcnn_loss_cls.append(RCNN_loss_cls)

                    # bounding box regression L1 loss
                    RCNN_loss_bbox = _smooth_l1_loss(
                        bbox_pred,
                        rois_target[b * 128:(b + 1) * 128],
                        rois_inside_ws[b * 128:(b + 1) * 128],
                        rois_outside_ws[b * 128:(b + 1) * 128])
                    rcnn_loss_bbox.append(RCNN_loss_bbox)

        # meta attentions loss
        if self.meta_loss:
            attentions_score = self.Meta_cls_score(attentions)
            meta_loss = F.cross_entropy(attentions_score,
                                        Variable(torch.cat(prn_cls, dim=0).cuda()))
        else:
            meta_loss = 0

        return rois, rpn_loss_cls, rpn_loss_bbox, rcnn_loss_cls, rcnn_loss_bbox, \
            rois_label, 0, 0, meta_loss

    # meta testing phase
    elif self.meta_test:
        cls_prob_list = []
        bbox_pred_list = []
        for i in range(len(mean_class_attentions)):
            mean_attentions = mean_class_attentions[i]
            cls_feat = mean_attentions.view(1, -1, 1, 1)  # 1*2048*1*1
            diff_feat = pooled_feat - cls_feat.squeeze()
            # subtraction + correlation: [bs, 2048]
            corr_feat = F.conv2d(pooled_feat.unsqueeze(-1).unsqueeze(-1),
                                 cls_feat.permute(1, 0, 2, 3),
                                 groups=2048).squeeze()
            channel_wise_feat = torch.cat((self.corr_fc(corr_feat),
                                           self.diff_fc(diff_feat)), dim=1)
            # combined with the roi feature: [bs, 2048 * 2]
            channel_wise_feat = torch.cat((channel_wise_feat, pooled_feat), dim=1)

            # compute bbox offset
            bbox_pred = self.RCNN_bbox_pred(channel_wise_feat)
            if self.training and not self.class_agnostic:
                # select the corresponding columns according to roi labels
                bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                                int(bbox_pred.size(1) / 4), 4)
                bbox_pred_select = torch.gather(
                    bbox_pred_view, 1,
                    rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
                bbox_pred = bbox_pred_select.squeeze(1)

            # compute object classification probability
            cls_score = self.RCNN_cls_score(channel_wise_feat)
            cls_prob = F.softmax(cls_score, 1)

            RCNN_loss_cls = 0
            RCNN_loss_bbox = 0

            cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
            bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)
            cls_prob_list.append(cls_prob)
            bbox_pred_list.append(bbox_pred)

        return rois, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, \
            rois_label, cls_prob_list, bbox_pred_list, 0

    # original faster r-cnn implementation
    else:
        bbox_pred = self.RCNN_bbox_pred(pooled_feat)
        if self.training and not self.class_agnostic:
            # select the corresponding columns according to roi labels
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                            int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = torch.gather(
                bbox_pred_view, 1,
                rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
            bbox_pred = bbox_pred_select.squeeze(1)

        # compute object classification probability
        cls_score = self.RCNN_cls_score(pooled_feat)  # 128 * 1001
        cls_prob = F.softmax(cls_score, 1)

        RCNN_loss_cls = 0
        RCNN_loss_bbox = 0

        if self.training:
            # classification loss
            RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

            # bounding box regression L1 loss
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                             rois_inside_ws, rois_outside_ws)

        cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
        bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

        return rois, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, \
            rois_label, cls_prob, bbox_pred, 0
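# The corr_feat lines above implement channel-wise correlation: a grouped 1x1
# conv with groups equal to the channel count reduces, on 1x1 spatial maps, to
# an element-wise product between each roi feature and the class attention
# vector. A small self-contained check of that equivalence (sizes are
# illustrative):
import torch
import torch.nn.functional as F

roi_feat = torch.rand(128, 2048)                 # (rois, channels)
cls_feat = torch.rand(1, 2048, 1, 1)             # class attention vector
corr = F.conv2d(roi_feat.unsqueeze(-1).unsqueeze(-1),
                cls_feat.permute(1, 0, 2, 3), groups=2048).squeeze()
assert torch.allclose(corr, roi_feat * cls_feat.view(1, -1))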
def forward(self, rpn_feature_maps, im_info, gt_boxes, num_boxes):
    n_feat_maps = len(rpn_feature_maps)

    rpn_cls_scores = []
    rpn_cls_probs = []
    rpn_bbox_preds = []
    rpn_shapes = []

    for i in range(n_feat_maps):
        feat_map = rpn_feature_maps[i]
        batch_size = feat_map.size(0)

        # return feature map after conv+relu layer
        rpn_conv1 = F.relu(self.RPN_Conv(feat_map), inplace=True)

        # get rpn classification score
        rpn_cls_score = self.RPN_cls_score(rpn_conv1)
        rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
        rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
        rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

        # get rpn offsets to the anchor boxes
        rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)

        rpn_shapes.append([rpn_cls_score.size()[2], rpn_cls_score.size()[3]])
        rpn_cls_scores.append(
            rpn_cls_score.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2))
        rpn_cls_probs.append(
            rpn_cls_prob.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2))
        rpn_bbox_preds.append(
            rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4))

    rpn_cls_score_alls = torch.cat(rpn_cls_scores, 1)
    rpn_cls_prob_alls = torch.cat(rpn_cls_probs, 1)
    rpn_bbox_pred_alls = torch.cat(rpn_bbox_preds, 1)

    n_rpn_pred = rpn_cls_score_alls.size(1)

    # proposal layer
    cfg_key = 'TRAIN' if self.training else 'TEST'
    rois = self.RPN_proposal((rpn_cls_prob_alls.data, rpn_bbox_pred_alls.data,
                              im_info, cfg_key, rpn_shapes))

    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0

    # generate training labels and build the rpn loss
    if self.training:
        assert gt_boxes is not None
        rpn_data = self.RPN_anchor_target((rpn_cls_score_alls.data, gt_boxes,
                                           im_info, num_boxes, rpn_shapes))

        # compute classification loss
        rpn_label = rpn_data[0].view(batch_size, -1)
        rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
        rpn_cls_score = torch.index_select(rpn_cls_score_alls.view(-1, 2), 0, rpn_keep)
        rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
        rpn_label = Variable(rpn_label.long())
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
        fg_cnt = torch.sum(rpn_label.data.ne(0))

        rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]

        # compute bbox regression loss
        rpn_bbox_inside_weights = Variable(
            rpn_bbox_inside_weights.unsqueeze(2)
            .expand(batch_size, rpn_bbox_inside_weights.size(1), 4))
        rpn_bbox_outside_weights = Variable(
            rpn_bbox_outside_weights.unsqueeze(2)
            .expand(batch_size, rpn_bbox_outside_weights.size(1), 4))
        rpn_bbox_targets = Variable(rpn_bbox_targets)

        self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred_alls, rpn_bbox_targets,
                                            rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights, sigma=3)

    return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, im_data, im_info, gt_boxes, num_boxes, dl_data):
    # batch_size = im_data.size(0)
    batch_size, c, h, w = im_data.size()
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data
    start_tic = time.time()

    # Start add by Jie: use mobilenetV2 as the backbone network for feature extraction.
    if self.dlb:
        # pad H and W up to multiples of 32
        if h % 32 != 0:  # e.g. 720*1280 --> 736*1280
            m, n = divmod(h, 32)
            ph = int(((m + 1) * 32 - h) / 2)
            im_data = F.pad(im_data, (0, 0, ph, ph), "constant", 0)
        if w % 32 != 0:
            m, n = divmod(w, 32)
            pw = int(((m + 1) * 32 - w) / 2)
            # (padLeft, padRight, padTop, padBottom)
            im_data = F.pad(im_data, (pw, pw, 0, 0), "constant", 0)
        # print('im_data', im_data.size())

        low_level_features = self.RCNN_low_base(im_data)              # 1/4
        # print('low_level_features', low_level_features.size())
        mid_level_features = self.RCNN_mid_base(low_level_features)  # 1/8
        base_feat = self.RCNN_base(mid_level_features)               # 1/16
        # print('base_feat', base_feat.size())
        base_toc = time.time()

        # ----- do segmentation
        seg_feat = self.RCNN_top(base_feat)
        # print('seg_feat', seg_feat.size())
        # the previous implementation:
        # drive_line = self.SegDecoder(seg_feat, low_level_features)
        # TODO: here we need to pass all the features into the decoder
        drive_line = self.SegDecoder(low_level_features, mid_level_features, base_feat)
        # print("drive line size", drive_line.size())

        # crop the padding back off
        if h % 32 != 0:
            drive_line = drive_line[:, :, ph:h + ph, :]
        if w % 32 != 0:
            drive_line = drive_line[:, :, :, pw:w + pw]
        drive_toc = time.time()
        # End add
    else:
        low_level_features = self.RCNN_low_base(im_data)
        # feed image data to base model to obtain base feature map
        base_feat = self.RCNN_base(low_level_features)
        # print('base_feat.size()', base_feat.size())
        drive_line = 0  # no segmentation branch in this path

    # ------ No Detection
    """
    rois_label = None
    rois_target = None
    rois_inside_ws = None
    rois_outside_ws = None
    rpn_loss_cls = 0
    rpn_loss_bbox = 0
    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    drive_line_loss = 0
    rois = 0
    cls_prob = 0
    bbox_pred = 0
    """
    # ------ End: No Detection

    # ------ With Detection
    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info,
                                                      gt_boxes, num_boxes)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1],
                               grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    det_toc = time.time()
    # print('base_time {:.3f}s driveline {:.3f}s detection {:.3f}s\r'
    #       .format(base_toc - start_tic, drive_toc - base_toc, det_toc - drive_toc))

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    drive_line_loss = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

        # Add by Jie. TODO: add resampling.
        # print('Calc drive line segmentation loss')
        # print('faster rcnn: forward, drive_line.shape, dl_data.shape', drive_line.shape, dl_data.shape)
        neg_rate = 5
        resample = True if neg_rate < 100 else False
        # earlier torch-based draft of the resampling, kept for reference:
        # if resample:  # TODO, use torch instead of numpy
        #     target = dl_data
        #     bs, h, w = target.shape
        #     y_true = target.reshape(-1)
        #     y_true_0_dix = torch.where(y_true == 0)  # ???
        #     num_neg = torch.sum(y_true == 0)
        #     num_pos = torch.sum(y_true == 1)
        #     num_ign = min(max(int(num_neg - neg_rate * num_pos), 0), int((num_neg + num_pos) * 0.95))
        #     inds = torch.multinomial(y_true_0_dix[0], num_ign, replacement=False, out=None)
        #     # inds = np.random.choice(y_true_0_dix[0], num_ign, replace=False)
        #     y_true[inds] = 255  # ignore
        #     y_true = y_true.reshape(bs, h, w)
        #     y_true = torch.from_numpy(y_true).long().cuda()
        # else:
        #     y_true = dl_data
        if resample:
            target = dl_data.cpu().numpy()
            # print('target.shape', target.shape, np.amax(target))
            bs, h, w = target.shape
            y_true = target.reshape(-1)
            y_true_0_dix = np.where(y_true == 0)
            # ---
            num_neg = np.sum(np.array(y_true == 0))
            num_pos = np.sum(np.array(y_true == 1))
            # count = np.bincount(y_true)
            # num_neg = count[0]
            # num_pos = count[1]  # when there are only negative samples, count[1] is out of range
            # ---
            num_ign = min(max(int(num_neg - neg_rate * num_pos), 0),
                          int((num_neg + num_pos) * 0.95))
            inds = np.random.choice(y_true_0_dix[0], num_ign, replace=False)
            y_true[inds] = 255  # ignore
            y_true = y_true.reshape(bs, h, w)
            y_true = torch.from_numpy(y_true).long().cuda()
        else:
            y_true = dl_data

        # use the resampled labels and skip the ignored pixels
        # drive_line_loss = F.cross_entropy(drive_line, dl_data)
        drive_line_loss = F.cross_entropy(drive_line, y_true, ignore_index=255)

        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    if self.training:  # for python 2.7
        rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0)
        rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0)
        RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0)
        RCNN_loss_bbox = torch.unsqueeze(RCNN_loss_bbox, 0)
        drive_line_loss = torch.unsqueeze(drive_line_loss, 0)

    # drive line segmentation
    # print('torch.max(drive_line)', torch.max(drive_line), drive_line.size())
    # ------ END: With Detection
    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, RCNN_loss_bbox, rois_label, drive_line, drive_line_loss
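# The padding above grows H and W to multiples of 32 before the backbone and
# crops the segmentation output back afterwards. A quick round-trip check of
# F.pad's (left, right, top, bottom) convention used there:
import torch
import torch.nn.functional as F

x = torch.rand(1, 3, 720, 1280)
ph = ((32 - x.size(2) % 32) % 32) // 2               # 720 -> 736 needs ph = 8
padded = F.pad(x, (0, 0, ph, ph), "constant", 0)     # pad top/bottom only
assert padded.shape[2] % 32 == 0
assert torch.equal(padded[:, :, ph:720 + ph, :], x)  # crop recovers the input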
def forward(self, im_data, im_info, gt_boxes, num_boxes, support_ims,
            all_cls_gt_boxes=None):
    if self.training:
        self.num_of_rois = cfg.TRAIN.BATCH_SIZE
    else:
        self.num_of_rois = cfg.TEST.RPN_POST_NMS_TOP_N

    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data
    all_cls_gt_boxes = all_cls_gt_boxes.data

    # feature extraction
    base_feat = self.RCNN_base(im_data)
    if self.training:
        support_ims = support_ims.view(-1, support_ims.size(2),
                                       support_ims.size(3), support_ims.size(4))
        support_feats = self.prn_network(support_ims)
        support_feats = support_feats.view(-1, self.n_way * self.n_shot,
                                           support_feats.size(1))
        pos_support_feat = support_feats[:, :self.n_shot, :].mean(1)
        neg_support_feat = support_feats[:, self.n_shot:self.n_way * self.n_shot, :].mean(1)
    else:
        support_ims = support_ims.view(-1, support_ims.size(2),
                                       support_ims.size(3), support_ims.size(4))
        support_feats = self.prn_network(support_ims)
        support_feats = support_feats.view(-1, self.n_shot, support_feats.size(1))
        pos_support_feat = support_feats[:, :self.n_shot, :].mean(1)

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        base_feat, im_info, all_cls_gt_boxes, num_boxes)

    # if it is the training phase, use ground-truth bboxes for refining
    if self.training:
        ## rois        [B, rois_per_image(128), 5], where 5 is [batch_num, x1, y1, x2, y2]
        ## rois_label  [B, 128]
        ## rois_target [B, 128, 4]
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # do roi pooling based on predicted rois, pooled_feat = [B*128, 1024, 7, 7]
    if cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)  # [B*128, 2048]

    # rcnn head
    if self.training:
        bbox_pred, cls_prob, cls_score_all = self.rcnn_head(pooled_feat, pos_support_feat)
        _, neg_cls_prob, neg_cls_score_all = self.rcnn_head(pooled_feat, neg_support_feat)
        cls_prob = torch.cat([cls_prob, neg_cls_prob], dim=0)
        cls_score_all = torch.cat([cls_score_all, neg_cls_score_all], dim=0)
        neg_rois_label = torch.zeros_like(rois_label)
        rois_label = torch.cat([rois_label, neg_rois_label], dim=0)
    else:
        bbox_pred, cls_prob, cls_score_all = self.rcnn_head(pooled_feat, pos_support_feat)

    # losses
    if self.training:
        ## bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

        ## classification loss, 2-way, fg : bg(pos support) : bg(neg support) = 1:2:1
        fg_inds = (rois_label == 1).nonzero().squeeze(-1)
        bg_inds = (rois_label == 0).nonzero().squeeze(-1)
        cls_score_softmax = torch.nn.functional.softmax(cls_score_all, dim=1)
        bg_cls_score_softmax = cls_score_softmax[bg_inds, :]
        bg_num_0 = max(1, min(fg_inds.shape[0] * 2, int(rois_label.shape[0] * 0.25)))
        bg_num_1 = max(1, min(fg_inds.shape[0], bg_num_0))
        # sort the real_bg_inds by their foreground score (hardest first)
        _sorted, sorted_bg_inds = torch.sort(bg_cls_score_softmax[:, 1],
                                             descending=True)
        real_bg_inds = bg_inds[sorted_bg_inds]
        # backgrounds scored against the positive support
        real_bg_topk_inds_0 = real_bg_inds[real_bg_inds < int(rois_label.shape[0] * 0.5)][:bg_num_0]
        # backgrounds scored against the negative support
        real_bg_topk_inds_1 = real_bg_inds[real_bg_inds >= int(rois_label.shape[0] * 0.5)][:bg_num_1]
        topk_inds = torch.cat([fg_inds, real_bg_topk_inds_0, real_bg_topk_inds_1], dim=0)

        RCNN_loss_cls = F.cross_entropy(cls_score_all[topk_inds], rois_label[topk_inds])
    else:
        RCNN_loss_cls = 0
        RCNN_loss_bbox = 0

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, RCNN_loss_bbox, rois_label
def forward(self, im_data, im_info, meta_data, gt_boxes, num_boxes, run_partial=False): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data meta_data = meta_data.data # feed image data to base model to obtain base feature map base_feat = self.RCNN_base(im_data) avg_feat = self.spatial_pool( base_feat, [base_feat.size()[2], base_feat.size()[3]], 14) weather_label = Variable(meta_data[:, 0].view(-1).long()) altitude_label = Variable(meta_data[:, 1].view(-1).long()) angle_label = Variable(meta_data[:, 2].view(-1).long()) softmax = nn.Softmax(dim=1) altitude_score = self.RCNN_altitude_score( self.RCNN_altitude(avg_feat).mean(-1).mean(-1)) RCNN_loss_altitude = F.cross_entropy(altitude_score, altitude_label) # RCNN_loss_altitude_adv = torch.mean(torch.sum(- altitude_score.new_full(altitude_score.size(), 1 / 3.0) * torch.log(torch.clamp(softmax(altitude_score), min=1e-10, max=1.0)), 1)) RCNN_loss_altitude_adv = torch.mean( torch.sum( softmax(altitude_score) * torch.log( torch.clamp(softmax(altitude_score), min=1e-10, max=1.0)), 1)) correct = altitude_score.max(1)[1].type_as(altitude_label).eq( altitude_label) correct = correct.sum().type(torch.FloatTensor).cuda() RCNN_acc_altitude = correct / altitude_label.size(0) if run_partial: if self.training: RCNN_loss_altitude = torch.unsqueeze(RCNN_loss_altitude, 0) RCNN_loss_altitude_adv = torch.unsqueeze( RCNN_loss_altitude_adv, 0) RCNN_acc_altitude = torch.unsqueeze(RCNN_acc_altitude, 0) return RCNN_loss_altitude, RCNN_loss_altitude_adv, RCNN_acc_altitude # feed base feature map tp RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes) # if it is training phrase, then use ground trubut bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 rois = Variable(rois) # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'crop': grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size) grid_yx = torch.stack( [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous() pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat = F.max_pool2d(pooled_feat, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather( bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred = bbox_pred_select.squeeze(1) # compute object classification probability cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = 
F.softmax(cls_score, 1) RCNN_loss_cls = 0 RCNN_loss_bbox = 0 if self.training: # classification loss RCNN_loss_cls = F.cross_entropy(cls_score, rois_label) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) if self.training: rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0) rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0) RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0) RCNN_loss_bbox = torch.unsqueeze(RCNN_loss_bbox, 0) RCNN_loss_altitude = torch.unsqueeze(RCNN_loss_altitude, 0) RCNN_loss_altitude_adv = torch.unsqueeze(RCNN_loss_altitude_adv, 0) RCNN_acc_altitude = torch.unsqueeze(RCNN_acc_altitude, 0) return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, \ RCNN_loss_altitude, RCNN_loss_altitude_adv, RCNN_acc_altitude, \ rois_label return rois, cls_prob, bbox_pred, RCNN_acc_altitude
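# --- Hedged sketch (illustrative, not the repo's API) of the adversarial term used
# for the altitude head above: the mean of sum(p * log p) is the negative entropy of
# the softmax, so minimizing it pushes the predictions toward uniform and strips
# altitude information from the shared features.
import torch
import torch.nn.functional as F

def negative_entropy(logits, eps=1e-10):
    p = F.softmax(logits, dim=1)
    return torch.mean(torch.sum(p * torch.log(p.clamp(min=eps, max=1.0)), dim=1))

adv_loss = negative_entropy(torch.randn(8, 3))  # 3 altitude bins, as above; most negative at uniform p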
def forward(self, im_data, im_info, gt_boxes, num_boxes): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data # feed image data to base model to obtain base feature map base_feat = self.RCNN_base(im_data) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes) # if it is the training phase, then use ground truth bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data # rois_label is the sub-class label rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 # return roi_data rois = Variable(rois) # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'crop': # pdb.set_trace() # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5)) grid_xy = _affine_grid_gen( rois.view(-1, 5), base_feat.size()[2:], self.grid_size) grid_yx = torch.stack( [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous() pooled_feat = self.RCNN_roi_crop( base_feat, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat = F.max_pool2d(pooled_feat, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pspool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # (256,2048,7,7) # nongt_dim = cfg.TRAIN.RPN_POST_NMS_TOP_N if self.training else cfg.TEST.RPN_POST_NMS_TOP_N nongt_dim = 300 if self.training else cfg.TEST.RPN_POST_NMS_TOP_N position_matrix = self.extract_position_matrix( rois.view(-1, 5)[:, :4].clone(), nongt_dim=nongt_dim) position_embedding = self.extract_position_embedding( position_matrix, feat_dim=64) pooled_feat = self.fc1(pooled_feat) attention_feat_1 = self.attention_1(pooled_feat, position_embedding) pooled_feat = pooled_feat + attention_feat_1 pooled_feat = self.fc2(pooled_feat) attention_feat_2 = self.attention_2(pooled_feat, position_embedding) pooled_feat = pooled_feat + attention_feat_2 # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view = bbox_pred.view( bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.view( rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred = bbox_pred_select.squeeze(1) # compute object classification probability cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = F.softmax(cls_score, 1) RCNN_loss_cls = 0 RCNN_loss_bbox = 0 if self.training: # classification loss RCNN_loss_cls = F.cross_entropy(cls_score, rois_label) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss( bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) if self.training:
rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0) rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0) RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0) RCNN_loss_bbox = torch.unsqueeze(RCNN_loss_bbox, 0) return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
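# --- Sketch, under stated assumptions, of the pairwise-geometry embedding consumed by
# extract_position_matrix / extract_position_embedding above. Following Relation
# Networks (Hu et al., 2018): log-scale relative offsets between boxes, encoded with
# sin/cos at several frequencies. Box layout (x1, y1, x2, y2) and wave_len=1000 are
# assumptions; the repo's exact implementation may differ.
import torch

def position_embedding(boxes, feat_dim=64, wave_len=1000.0):
    cx = (boxes[:, 0] + boxes[:, 2]) * 0.5
    cy = (boxes[:, 1] + boxes[:, 3]) * 0.5
    w = (boxes[:, 2] - boxes[:, 0]).clamp(min=1e-3)
    h = (boxes[:, 3] - boxes[:, 1]).clamp(min=1e-3)
    dx = torch.log((cx[:, None] - cx[None, :]).abs() / w[:, None] + 1e-3)
    dy = torch.log((cy[:, None] - cy[None, :]).abs() / h[:, None] + 1e-3)
    dw = torch.log(w[:, None] / w[None, :])
    dh = torch.log(h[:, None] / h[None, :])
    pos = torch.stack([dx, dy, dw, dh], dim=2)                   # (N, N, 4)
    dim_t = wave_len ** (8.0 / feat_dim * torch.arange(feat_dim // 8).float())
    pos = pos[..., None] * 100.0 / dim_t                         # (N, N, 4, feat_dim//8)
    return torch.cat([pos.sin(), pos.cos()], dim=3).flatten(2)   # (N, N, feat_dim)

boxes = torch.tensor([[0., 0., 10., 20.], [5., 5., 30., 40.], [8., 2., 12., 9.]])
emb = position_embedding(boxes)  # (3, 3, 64)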
def forward(self, base_feat, im_info, gt_boxes, num_boxes, crowdsourced_classes=None, alpha_con=None): batch_size = base_feat.size(0) # return feature map after convrelu layer # rpn_conv1 torch.Size([1, 512, 50, 37]) rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True) # get rpn classification score # rpn_cls_score torch.Size([1, 18, 50, 37]) rpn_cls_score = self.RPN_cls_score(rpn_conv1) rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2) # rpn_cls_prob_reshape torch.Size([1, 2, 450, 37]) rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1) # rpn_cls_prob torch.Size([1, 18, 50, 37]) rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out) # get rpn offsets to the anchor boxes # rpn_bbox_pred torch.Size([1, 36, 50, 37]) rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1) # proposal layer cfg_key = 'TRAIN' if self.training else 'TEST' rois = self.RPN_proposal( (rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key)) self.rpn_loss_cls = 0 self.rpn_loss_box = 0 # generating training labels and build the rpn loss if self.training: assert gt_boxes is not None # rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes, im_info, num_boxes)) # rpn_label = rpn_data[0].view(batch_size, -1) # rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1)) # rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data) # rpn_label = Variable(rpn_label.long()) # print('before rpn_label: ', rpn_label); # Aggregation Layer if self.label_source == 2: gt_boxes = self.RPN_aggregation( (rpn_cls_prob.data, gt_boxes, num_boxes, im_info, crowdsourced_classes, alpha_con)) # generate anchor labels (rpn_cls_score only provides the size reference) rpn_data = self.RPN_anchor_target( (rpn_cls_score.data, gt_boxes, im_info, num_boxes)) # compute classification loss rpn_cls_score = rpn_cls_score_reshape.permute( 0, 2, 3, 1).contiguous().view(batch_size, -1, 2) rpn_label = rpn_data[0].view(batch_size, -1) # skip the classification cross-entropy loss for anchors labeled -1 rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1)) rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep) rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data) rpn_label = Variable(rpn_label.long()) if DEBUG: print('after rpn_label: ', rpn_label) self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label) fg_cnt = torch.sum(rpn_label.data.ne(0)) rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:] # compute bbox regression loss rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights) rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights) # torch.Size([1, 36, 50, 37]) rpn_bbox_targets = Variable(rpn_bbox_targets) self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights, sigma=3, dim=[1, 2, 3]) return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, im_data, im_info, gt_boxes, num_boxes): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data # feed image data to base model to obtain base feature map base_feat = self.RCNN_base(im_data) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox, num_proposal = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes) # if it is the training phase, then use ground truth bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 rois = Variable(rois) # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'crop': # pdb.set_trace() # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5)) grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size) grid_yx = torch.stack( [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous() pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat = F.max_pool2d(pooled_feat, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # update 20191026: get the index of nodes in graph for rois (default: batch_size = 1) # if we want to change batch_size, we should consider changing roi2gt_assignment[0], # roi_part_match[0], roi_part_match_overlap[0] and so on iou_threshold = 0.7 dis_threshold = 0.01 # part_size = 10 # relation_size = 5 iou_size = 6 edge_size = 4 child_size = 4 batch = 0 if True: if not self.training: rois = rois[:, :num_proposal, :] pooled_feat = pooled_feat[:num_proposal, :] # first, calculate the overlaps among rois; edges between nodes with iou > 0.7 get weight 1 overlaps = bbox_overlaps_batch(rois, rois) # overlaps_bin = overlaps.cpu().data.numpy().copy() _, N_node, _ = overlaps.shape # second, calculate the distances among rois; edges between nodes with iou = 0 and distance < dis_threshold are kept distances = bbox_distances_batch(rois, rois) # update 20191115: build graph for rois based on index (default: batch_size = 1) # feature cosine similarity # similarity in PGCN dot_product_mat = torch.mm(pooled_feat, torch.transpose(pooled_feat, 0, 1)) len_vec = torch.unsqueeze(torch.sqrt( torch.sum(pooled_feat * pooled_feat, dim=1)), dim=0) len_mat = torch.mm(torch.transpose(len_vec, 0, 1), len_vec) pooled_feat_sim_mat = dot_product_mat / len_mat # cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6) # calculate the adj_mat based on iou and distance; the weights on edges are the cosine similarity between nodes mask = torch.eye(N_node, N_node).cuda() for s in range(N_node): overlap_node_index = (overlaps[batch][s] >= iou_threshold).nonzero() overlap_node_size = iou_size if overlap_node_index.shape[0] > iou_size else overlap_node_index.shape[0] overlap_node_random = torch.randperm( overlap_node_index.shape[0])[0:overlap_node_size] overlap_node_index_select
= overlap_node_index[ overlap_node_random] # TODO(junjie) remove the iou box in distance box. distance_node_index = (distances[batch][s] < dis_threshold).nonzero() distance_node_size = iou_size if distance_node_index.shape[ 0] > iou_size else distance_node_index.shape[0] distance_node_random = torch.randperm( distance_node_index.shape[0])[0:distance_node_size] distance_node_index_select = distance_node_index[ distance_node_random] _node_index_select = torch.cat( (overlap_node_index_select, distance_node_index_select), dim=0) if _node_index_select.shape[0] == 0: continue else: _node_index_select = _node_index_select.squeeze(dim=1) _node_size = child_size if _node_index_select.shape[ 0] > child_size else _node_index_select.shape[0] _node_index_select_random = torch.randperm( _node_index_select.shape[0])[0:_node_size] node_index_select = _node_index_select[ _node_index_select_random] mask[s, node_index_select] = 1 # print("test ") adj_matrix = torch.mul(mask, pooled_feat_sim_mat) pooled_feat = F.relu(self.gcn1(pooled_feat, adj_matrix)) pooled_feat = F.relu(self.gcn2(pooled_feat, adj_matrix)) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather( bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred = bbox_pred_select.squeeze(1) # compute object classification probability cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = F.softmax(cls_score, 1) RCNN_loss_cls = 0 RCNN_loss_bbox = 0 if self.training: # classification loss RCNN_loss_cls = F.cross_entropy(cls_score, rois_label) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) # update 2019-6-17:fix the bug for dimension specified as 0... if self.training: rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0) rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0) RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0) RCNN_loss_bbox = torch.unsqueeze(RCNN_loss_bbox, 0) return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
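# --- Minimal sketch of the masked cosine-similarity adjacency assembled above:
# pairwise cosine similarity between pooled ROI features, kept only where the
# structural mask (IoU / distance neighbours plus self-loops) allows an edge.
import torch

def masked_cosine_adjacency(feats, mask, eps=1e-6):
    sim = torch.mm(feats, feats.t())                 # raw dot products
    norm = feats.norm(dim=1, keepdim=True).clamp(min=eps)
    return mask * (sim / (norm * norm.t()))          # cosine similarity, masked

adj = masked_cosine_adjacency(torch.randn(6, 16), torch.eye(6))  # self-loops only, for the demo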
def forward(self, im_data, im_info, gt_boxes, num_boxes): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data c1 = self.rcnn_layer0(im_data) c2 = self.rcnn_layer1(c1) c3 = self.rcnn_layer2(c2) c4 = self.rcnn_layer3(c3) c5 = self.rcnn_layer4(c4) # p4 = self.pyramid(p5, c4) # p3 = self.pyramid(p4, c3) # p2 = self.pyramid(p3, c2) # ===================================================== p5 = self.rcnn_toplayer(c5) p4 = self.merge(p5, self.rcnn_latlayer1(c4)) p4 = self.rcnn_smooth1(p4) p3 = self.merge(p4, self.rcnn_latlayer2(c3)) p3 = self.rcnn_smooth2(p3) p2 = self.merge(p3, self.rcnn_latlayer3(c2)) p2 = self.rcnn_smooth3(p2) p6 = self.maxpool2d(p5) # ========================================================== # c6 = self.rcnn_layer5(c5) # p6 = self.rcnn_toplayer(c6) # p5 = self.rcnn_latlayer1(c5) + p6 # p4 = self.rcnn_latlayer2(c4) + p5 # p3 = self.merge(p4, self.rcnn_latlayer3(c3)) # p3 = self.rcnn_smooth1(p3) # p2 = self.merge(p3, self.rcnn_latlayer4(c2)) # p2 = self.rcnn_smooth2(p2) # ============================================================= rpn_feature_maps = [p2, p3, p4, p5, p6] mrcnn_feature_maps = [p2, p3, p4, p5] rois, rpn_loss_cls, rpn_loss_bbox = self.rcnn_rpn(rpn_feature_maps, im_info, gt_boxes, num_boxes) if self.training: roi_data = self.rcnn_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois = rois.view(-1, 5) rois_label = rois_label.view(-1).long() gt_assign = gt_assign.view(-1).long() pos_id = rois_label.nonzero().squeeze() gt_assign_pos = gt_assign[pos_id] rois_label_pos = rois_label[pos_id] rois_label_pos_ids = pos_id rois_pos = Variable(rois[pos_id]) rois = Variable(rois) rois_label = Variable(rois_label) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = None, None, None, None, None rpn_loss_cls = 0 rpn_loss_bbox = 0 rois = rois.view(-1, 5) pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long() rois_label_pos_ids = pos_id rois_pos = Variable(rois[pos_id]) rois = Variable(rois) roi_pool_feat = self.pyramid_roi(mrcnn_feature_maps, rois, im_info) pooled_feat = self._head_to_tail(roi_pool_feat) bbox_pred = self.rcnn_bbox_pred(pooled_feat) if self.training: bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred = bbox_pred_select.squeeze(1) cls_score = self.rcnn_cls_score(pooled_feat) objectness = F.softmax(cls_score, dim=1) rcnn_loss_cls = 0 rcnn_loss_bbox = 0 if self.training: rcnn_loss_cls = F.cross_entropy(cls_score, rois_label) rcnn_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) rois = rois.view(batch_size, -1, rois.size(1)) objectness = objectness.view(batch_size, -1, objectness.size(1)) # bp() bbox_pred = bbox_pred.view(batch_size, -1, bbox_pred.size(1)) if self.training: rois_label = rois_label.view(batch_size, -1) loss = rpn_loss_cls + rpn_loss_bbox + rcnn_loss_cls + rcnn_loss_bbox return rois, objectness, bbox_pred, rois_label, loss
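# --- Hedged sketch of the top-down `merge` used above: upsample the coarser pyramid
# level to the lateral map's resolution and add the 1x1-projected lateral feature;
# the rcnn_smooth* convs are assumed 3x3. This mirrors standard FPN rather than
# guaranteeing this repo's exact layer definitions.
import torch
import torch.nn.functional as F

def merge(top, lateral):
    # nearest-neighbour upsampling to the lateral spatial size, then element-wise add
    return F.interpolate(top, size=lateral.shape[2:], mode='nearest') + lateral

p4 = merge(torch.randn(1, 256, 13, 13), torch.randn(1, 256, 25, 25))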
def forward(self, im_data, im_info, gt_boxes, num_boxes): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data # feed image data to base model to obtain base feature map base_feat = self.RCNN_base(im_data) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes) # if it is the training phase, then use ground truth bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 # ========= Union Box ========== whole_box = union_box_layer(rois, im_info) whole_box = whole_box.reshape(whole_box.shape[0], 1, 5) whole = torch.from_numpy(whole_box) whole = whole.type(torch.cuda.FloatTensor) # whole = whole_box.view([-1, 5]) # edges = edge_box_layer(rois, im_info) # edges = torch.from_numpy(edges) # edge = edges.view([-1, 12]) edges_all = edge_whole_layer(rois, im_info) edges_all = torch.from_numpy(edges_all) # whole_rois = torch.cat((whole, rois), 1) rois = Variable(rois) # print rois.size() # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'crop': # pdb.set_trace() # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5)) grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size) grid_yx = torch.stack( [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous() pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat = F.max_pool2d(pooled_feat, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) whole_pool_feat = self.RCNN_roi_align_whole( base_feat, whole.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) whole_pool_feat = self.RCNN_roi_pool(base_feat, whole.view(-1, 5)) # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) whole_pool_feat = self._head_to_tail(whole_pool_feat) ########## structure_inference_spmm ########## # pooled_feat = structure_inference_spmm(pooled_feat, whole_pool_feat, edges, rois.size()[1]) pooled_feat = self.Structure_inference(edges_all, pooled_feat, whole_pool_feat, rois.size()[1]) # print 'pooled_feat.shape: ', pooled_feat.shape # print 'rois.shape: ', rois.shape # print 'edges.shape: ', edges.shape # coordinate = self.coor_fc(rois[:, :, 1:].reshape(rois.shape[1], 4)) # pooled_feat = torch.cat((coordinate, pooled_feat), 1) # pooled_feat = torch.add(coordinate, pooled_feat) # ######### external_dim ########### # external_feature = rois[:, :, 3:].view([128, 2]) # pooled_feat = self.External(pooled_feat, external_feature) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather( bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1,
4)) bbox_pred = bbox_pred_select.squeeze(1) # compute object classification probability cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = F.softmax(cls_score, 1) RCNN_loss_cls = 0 RCNN_loss_bbox = 0 if self.training: # classification loss RCNN_loss_cls = F.cross_entropy(cls_score, rois_label) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
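# --- Sketch (an assumption about union_box_layer above, which the repo implements
# in numpy) of the union "whole" box: one box per image covering all ROIs, in the
# (batch_idx, x1, y1, x2, y2) layout the pooling layers expect.
import torch

def union_box(rois):                       # rois: (B, N, 5)
    whole = rois.new_zeros(rois.size(0), 1, 5)
    whole[:, 0, 0] = rois[:, 0, 0]         # batch index
    whole[:, 0, 1] = rois[:, :, 1].min(dim=1).values
    whole[:, 0, 2] = rois[:, :, 2].min(dim=1).values
    whole[:, 0, 3] = rois[:, :, 3].max(dim=1).values
    whole[:, 0, 4] = rois[:, :, 4].max(dim=1).values
    return whole

whole = union_box(torch.cat([torch.zeros(1, 4, 1), torch.rand(1, 4, 4) * 100], dim=2))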
def forward(self, base_feat, im_info, gt_boxes, num_boxes): batch_size = base_feat.size(0) # return feature map after convrelu layer rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True) # get rpn classification score, i.e. the foreground/background score of each anchor rpn_cls_score = self.RPN_cls_score(rpn_conv1) rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2) rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1) rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out) # get rpn offsets to the anchor boxes, i.e. the predicted offset of each anchor rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1) # proposal layer cfg_key = 'TRAIN' if self.training else 'TEST' # rois has shape (batch, post_top_n, 5): the post_top_n anchors kept after sorting and NMS # (the original anchors corrected by the network-predicted deltas); these anchors are mapped # back to the MxN image and clipped so they never exceed the image boundary; each roi consists # of one batch-index slot plus the 4 coordinates x1, y1, x2, y2 rois = self.RPN_proposal( (rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key)) self.rpn_loss_cls = 0 self.rpn_loss_box = 0 # generating training labels and build the rpn loss if self.training: assert gt_boxes is not None rpn_data = self.RPN_anchor_target( (rpn_cls_score.data, gt_boxes, im_info, num_boxes)) # compute classification loss rpn_cls_score = rpn_cls_score_reshape.permute( 0, 2, 3, 1).contiguous().view(batch_size, -1, 2) rpn_label = rpn_data[0].view(batch_size, -1) rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1)) rpn_cls_score = torch.index_select( rpn_cls_score.view(-1, 2), 0, rpn_keep) # keep scores on the same device as rpn_label (a stray .cpu() here broke cross_entropy on GPU) rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data) rpn_label = Variable(rpn_label.long()) self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label) fg_cnt = torch.sum(rpn_label.data.ne(0)) rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:] # compute bbox regression loss rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights) rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights) rpn_bbox_targets = Variable(rpn_bbox_targets) self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights, sigma=3, dim=[1, 2, 3]) return rois, self.rpn_loss_cls, self.rpn_loss_box
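# --- Sketch of the RPN `reshape` trick used above: fold the 2*A-channel score map
# into a 2-channel map so softmax over dim=1 is a clean bg/fg softmax per anchor
# location, then fold it back. Shapes match the comments in the code above (A = 9).
import torch
import torch.nn.functional as F

def reshape(x, d):
    b, c, h, w = x.shape
    return x.view(b, d, c * h // d, w)

rpn_cls_score = torch.randn(1, 18, 50, 37)           # 2 scores x 9 anchors
prob2 = F.softmax(reshape(rpn_cls_score, 2), dim=1)  # (1, 2, 450, 37)
rpn_cls_prob = reshape(prob2, 18)                    # back to (1, 18, 50, 37)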
def forward(self, im_data1, im_data2, im_info, gt_boxes, num_boxes): batch_size = im_data1.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data if self.fusion_mode == "early": im_data = torch.cat((im_data1, im_data2), dim=1) im_data = self.NIN(im_data) base_feat = self.RCNN_base(im_data) if self.fusion_mode == "half": base_feat1 = self.RCNN_base_half(im_data1) base_feat2 = self.RCNN_base_half(im_data2) base_feat = torch.cat((base_feat1, base_feat2), dim=1) base_feat = torch.unsqueeze(base_feat, 1) base_feat = self.NIN(base_feat) base_feat = torch.squeeze(base_feat, 1) base_feat = self.RCNN_base_fusion(base_feat) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes) # if it is the training phase, then use ground truth bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 rois = Variable(rois) # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred = bbox_pred_select.squeeze(1) # compute object classification probability cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = F.softmax(cls_score, 1) RCNN_loss_cls = 0 RCNN_loss_bbox = 0 if self.training: # classification loss RCNN_loss_cls = F.cross_entropy(cls_score, rois_label) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
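# --- Hedged sketch of the "early" fusion branch above: the two inputs are
# concatenated channel-wise and reduced back with a 1x1 conv, which is the role the
# NIN module is assumed to play here (the "half" branch does the same on mid-level
# features instead of raw inputs).
import torch
import torch.nn as nn

class EarlyFusion(nn.Module):
    def __init__(self, in_ch=3):
        super().__init__()
        self.nin = nn.Conv2d(2 * in_ch, in_ch, kernel_size=1)  # channel reduction

    def forward(self, x1, x2):
        return self.nin(torch.cat((x1, x2), dim=1))

out = EarlyFusion()(torch.randn(1, 3, 64, 64), torch.randn(1, 3, 64, 64))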
def forward(self, im_data, im_info, gt_boxes, num_boxes): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data # feed image data to base model to obtain base feature map base_feat = self.RCNN_base(im_data) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes) # if it is the training phase, then use ground truth bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1)) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 rois = Variable(rois) # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'crop': # pdb.set_trace() # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5)) grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size) grid_yx = torch.stack( [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous() pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat = F.max_pool2d(pooled_feat, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) # get the rpn loss rpn_loss = rpn_loss_cls + rpn_loss_bbox # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) # compute object classification probability cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = F.softmax(cls_score, 1) self.RCNN_loss_cls = 0 self.RCNN_loss_bbox = 0 rcnn_loss = 0 # make sure rcnn_loss is also defined at test time if self.training: # classification loss label = rois_label.long() self.fg_cnt = torch.sum(label.data.ne(0)) self.bg_cnt = label.data.numel() - self.fg_cnt self.RCNN_loss_cls = F.cross_entropy(cls_score, label) # bounding box regression L1 loss self.RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) rcnn_loss = self.RCNN_loss_cls + self.RCNN_loss_bbox cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) return rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss
def forward(self, im_data, im_info, gt_boxes, num_boxes): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data # feed image data to base model to obtain base feature map base_feat = self.RCNN_base(im_data) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes) # if it is the training phase, then use ground truth bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1).long()) # TODO rois_main_label = Variable(rois_label.view(-1).long()) rois_sub_class = list(map( lambda x: self.sub_classes[x], rois_main_label)) rois_main_class = list( map(lambda x: sub2main_dict[x], rois_sub_class)) rois_main_label = list(map( lambda x: self.main_classes.index(x), rois_main_class)) rois_main_label = torch.cuda.LongTensor(rois_main_label) rois_main_label = Variable(rois_main_label) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_main_label = None rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 # return roi_data rois = Variable(rois) # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'crop': # pdb.set_trace() # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5)) grid_xy = _affine_grid_gen( rois.view(-1, 5), base_feat.size()[2:], self.grid_size) grid_yx = torch.stack( [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous() pooled_feat = self.RCNN_roi_crop( base_feat, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat = F.max_pool2d(pooled_feat, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pspool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) # main Rcnn branch # feed pooled features to top model pooled_feat_main = self._head_to_tail_main(pooled_feat) # compute bbox offset bbox_pred_main = self.RCNN_bbox_pred_main(pooled_feat_main) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view_main = bbox_pred_main.view( bbox_pred_main.size(0), int(bbox_pred_main.size(1) / 4), 4) bbox_pred_select_main = torch.gather(bbox_pred_view_main, 1, rois_main_label.view( rois_main_label.size(0), 1, 1).expand(rois_main_label.size(0), 1, 4)) bbox_pred_main = bbox_pred_select_main.squeeze(1) # compute object classification probability cls_score_main = self.RCNN_cls_score_main(pooled_feat_main) cls_prob_main = F.softmax(cls_score_main, 1) # sub Rcnn branch pooled_feat_sub = self._head_to_tail_sub(pooled_feat) bbox_pred_sub = self.RCNN_bbox_pred_sub(pooled_feat_sub) if self.training and not self.class_agnostic: bbox_pred_view_sub = bbox_pred_sub.view( bbox_pred_sub.size(0), int(bbox_pred_sub.size(1) / 4), 4) bbox_pred_select_sub = torch.gather(bbox_pred_view_sub, 1, rois_label.view( rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred_sub = bbox_pred_select_sub.squeeze(1) cls_score_sub = self.RCNN_cls_score_sub(pooled_feat_sub) #pdb.set_trace()
# process the weight of main classes into the sub score if 'score' in self.casecade_type: main_cls_weight = torch.cuda.FloatTensor( cls_score_main.size()[0], len(self.sub_classes)) for key, val in self.main2sub_idx_dict.items(): for column_idx in val: main_cls_weight[:, column_idx] = cls_score_main[:, key] if self.casecade_type == 'add_score': cls_score_sub += main_cls_weight elif self.casecade_type == 'mul_score': cls_score_sub *= main_cls_weight cls_prob_sub = F.softmax(cls_score_sub, 1) # process the weight of main classes into the sub prob if 'prob' in self.casecade_type: main_cls_weight = torch.cuda.FloatTensor( cls_prob_main.size()[0], len(self.sub_classes)) for key, val in self.main2sub_idx_dict.items(): for column_idx in val: main_cls_weight[:, column_idx] = cls_prob_main[:, key] if self.casecade_type == 'add_prob': # TODO normalized cls_prob_sub = cls_prob_sub * self.alpha + (1 - self.alpha) * main_cls_weight RCNN_loss_cls_main = 0 RCNN_loss_bbox_main = 0 RCNN_loss_cls_sub = 0 RCNN_loss_bbox_sub = 0 if self.training: # classification loss RCNN_loss_cls_main = F.cross_entropy( cls_score_main, rois_main_label) # TODO roi_label should RCNN_loss_cls_sub = F.cross_entropy(cls_score_sub, rois_label) # bounding box regression L1 loss RCNN_loss_bbox_main = _smooth_l1_loss( bbox_pred_main, rois_target, rois_inside_ws, rois_outside_ws) # fixed: the sub branch must regress its own prediction (this used bbox_pred_main before) RCNN_loss_bbox_sub = _smooth_l1_loss( bbox_pred_sub, rois_target, rois_inside_ws, rois_outside_ws) cls_prob_main = cls_prob_main.view(batch_size, rois.size(1), -1) bbox_pred_main = bbox_pred_main.view(batch_size, rois.size(1), -1) cls_prob_sub = cls_prob_sub.view(batch_size, rois.size(1), -1) bbox_pred_sub = bbox_pred_sub.view(batch_size, rois.size(1), -1) if self.training: rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0) rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0) RCNN_loss_cls_main = torch.unsqueeze(RCNN_loss_cls_main, 0) RCNN_loss_bbox_main = torch.unsqueeze(RCNN_loss_bbox_main, 0) RCNN_loss_cls_sub = torch.unsqueeze(RCNN_loss_cls_sub, 0) RCNN_loss_bbox_sub = torch.unsqueeze(RCNN_loss_bbox_sub, 0) return rois, cls_prob_main, bbox_pred_main, cls_prob_sub, bbox_pred_sub, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls_sub, RCNN_loss_bbox_sub, RCNN_loss_cls_main, RCNN_loss_bbox_main, rois_label
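# --- Sketch of the cascade re-weighting loops above: each main-class score (or prob)
# is broadcast to the columns of its sub-classes through a {main_idx: [sub_idx, ...]}
# map, then added to or multiplied into the sub-class scores. The dict values here
# are illustrative, not this dataset's real hierarchy.
import torch

def expand_main_to_sub(main_scores, main2sub_idx, n_sub):
    w = main_scores.new_zeros(main_scores.size(0), n_sub)
    for main_idx, sub_cols in main2sub_idx.items():
        w[:, sub_cols] = main_scores[:, main_idx:main_idx + 1]
    return w

weights = expand_main_to_sub(torch.randn(4, 3), {0: [0, 1], 1: [2], 2: [3, 4]}, n_sub=5)
sub_scores = torch.randn(4, 5) + weights     # the 'add_score' variant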
def forward(self, rpn_feature_maps, im_info, gt_boxes, num_boxes): n_feat_maps = len(rpn_feature_maps) rpn_cls_scores = [] rpn_cls_probs = [] rpn_bbox_preds = [] rpn_shapes = [] rpn_rank_inds = [] level_ids = [] # proposal layer cfg_key = 'TRAIN' if self.training else 'TEST' nms_pre = cfg[cfg_key].RPN_PRE_NMS_TOP_N batch_size = rpn_feature_maps[0].size(0) for i in range(n_feat_maps): feat_map = rpn_feature_maps[i] # batch_size = feat_map.size(0) # return feature map after convrelu layer rpn_conv1 = F.relu(self.RPN_Conv(feat_map), inplace=True) # get rpn classification score rpn_cls_score = self.RPN_cls_score(rpn_conv1) rpn_cls_prob = rpn_cls_score.sigmoid() # get rpn offsets to the anchor boxes rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1) rpn_shapes.append( [rpn_cls_score.size()[2], rpn_cls_score.size()[3]]) rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1).contiguous().view( batch_size, -1, 1) rpn_cls_prob = rpn_cls_prob.permute(0, 2, 3, 1).contiguous().view( batch_size, -1, 1) rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view( batch_size, -1, 4) ranked_scores, rank_inds = rpn_cls_prob.sort(dim=1, descending=True) rank_inds = rank_inds.view(-1) if rpn_cls_score.shape[1] > nms_pre: rank_inds = rank_inds[:nms_pre] rpn_cls_score = rpn_cls_score[:, rank_inds, :] rpn_bbox_pred = rpn_bbox_pred[:, rank_inds, :] rpn_cls_prob = rpn_cls_prob[:, rank_inds, :] rpn_rank_inds.append(rank_inds) rpn_cls_scores.append(rpn_cls_score) rpn_cls_probs.append(rpn_cls_prob) rpn_bbox_preds.append(rpn_bbox_pred) level_ids.append((rpn_cls_score[0].view(-1)).new_full( ((rpn_cls_score[0].view(-1)).size(0), ), i, dtype=torch.long)) rpn_cls_score_alls = torch.cat(rpn_cls_scores, 1) rpn_cls_prob_alls = torch.cat(rpn_cls_probs, 1) rpn_bbox_pred_alls = torch.cat(rpn_bbox_preds, 1) ids = torch.cat(level_ids) n_rpn_pred = rpn_cls_score_alls.size(1) rois = self.RPN_proposal( (rpn_cls_prob_alls.data, rpn_bbox_pred_alls.data, im_info, cfg_key, rpn_shapes, rpn_rank_inds, ids)) self.rpn_loss_cls = torch.zeros(1).cuda() self.rpn_loss_cls_neg = torch.zeros(1).cuda() self.rpn_loss_box = torch.zeros(1).cuda() # generating training labels and build the rpn loss if self.training: assert gt_boxes is not None BCE = nn.BCEWithLogitsLoss() rpn_data = self.RPN_anchor_target( (rpn_cls_score_alls.data, gt_boxes, im_info, num_boxes, rpn_shapes, rpn_rank_inds)) # compute classification loss rpn_label = rpn_data[0].view(batch_size, -1) rpn_keep = rpn_label.view(-1).ne(-1).nonzero().view(-1) rpn_cls_score = torch.index_select(rpn_cls_score_alls.view(-1), 0, rpn_keep) rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data) self.rpn_loss_cls = BCE(rpn_cls_score, rpn_label) # print('rpn_loss_cls', self.rpn_loss_cls) # rpn_label = rpn_label.view(batch_size,-1) # rpn_cls_score = rpn_cls_score.view(batch_size,-1) # for i in range(batch_size): # rpn_label_t = rpn_label[i] # rpn_cls_score_t = rpn_cls_score[i] # rpn_loss_cls_t = BCE(rpn_cls_score_t, rpn_label_t) # print('rpn_loss_cls_t',rpn_loss_cls_t) # self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label) fg_cnt = torch.sum(rpn_label.data.ne(0)) rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[ 1:] # compute bbox regression loss rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights.unsqueeze(2) \ .expand(batch_size, rpn_bbox_inside_weights.size(1), 4)) rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights.unsqueeze(2) \ .expand(batch_size, rpn_bbox_outside_weights.size(1), 4)) rpn_bbox_targets = Variable(rpn_bbox_targets) 
self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred_alls, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights, sigma=3) return rois, self.rpn_loss_cls, self.rpn_loss_box, self.rpn_loss_cls_neg
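# --- Sketch of the per-level pre-NMS top-k used above: on each pyramid level, rank
# anchors by their sigmoid objectness and keep at most nms_pre before the levels are
# concatenated for the proposal layer. Batch size 1 is assumed, matching the
# rank_inds.view(-1) in the code above.
import torch

def topk_per_level(scores, deltas, nms_pre):
    # scores: (1, N, 1), deltas: (1, N, 4)
    keep = scores.view(-1).sort(descending=True).indices[:nms_pre]
    return scores[:, keep, :], deltas[:, keep, :], keep

s, d, keep = topk_per_level(torch.rand(1, 5000, 1), torch.randn(1, 5000, 4), 2000)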
def forward(self, im_data, im_info, gt_boxes, gt_boxes_sens, num_boxes): batch_size = im_data[0].size(0) im_info = im_info.data gt_boxes = gt_boxes.data gt_boxes_sens = gt_boxes_sens.data num_boxes = num_boxes.data # feed image data to base model to obtain base feature map base_feat_c = self.RCNN_base_c(im_data[0]) base_feat_t = self.RCNN_base_t(im_data[1]) base_feat_fused = 0.5 * (base_feat_c + base_feat_t) base_feat_fused = self.RCNN_base_fused(base_feat_fused) conv5_c = self.RCNN_base_f1(base_feat_c) conv5_t = self.RCNN_base_f2(base_feat_t) # feed fused base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat_fused, im_info, gt_boxes, num_boxes) # if it is training phase, then use ground truth bboxes for refining if self.training: # 50% jitter probability if np.random.rand(1)[0]>0.5: jitter = (torch.randn(1,256,4)/20).cuda() else: jitter = (torch.zeros(1,256,4)).cuda() # feed jitter to obtain rois_align_target roi_data = self.RCNN_proposal_target(rois, gt_boxes, gt_boxes_sens, num_boxes, jitter, im_info) rois, rois_jittered, rois_label, rois_target, rois_align_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_align_target = Variable(rois_align_target.view(-1, rois_align_target.size(2))) rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_jittered = copy.deepcopy(rois) rois_label = None rois_target = None rois_align_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 # Region Feature Alignment module ctx_rois = bbox_contextual_batch(rois) clip_boxes(ctx_rois[:,:,1:], im_info, batch_size) ctx_rois = Variable(ctx_rois) ctx_rois_jittered = bbox_contextual_batch(rois_jittered) clip_boxes(ctx_rois_jittered[:,:,1:], im_info, batch_size) ctx_rois_jittered = Variable(ctx_rois_jittered) if cfg.POOLING_MODE == 'crop': grid_xy = _affine_grid_gen(ctx_rois.view(-1, 5), conv5_c.size()[2:], self.grid_size) grid_yx = torch.stack([grid_xy.data[:,:,:,1], grid_xy.data[:,:,:,0]], 3).contiguous() pooled_feat_c = self.RCNN_roi_crop(conv5_c, Variable(grid_yx).detach()) grid_xy = _affine_grid_gen(ctx_rois_jittered.view(-1, 5), conv5_t.size()[2:], self.grid_size) grid_yx = torch.stack([grid_xy.data[:,:,:,1], grid_xy.data[:,:,:,0]], 3).contiguous() pooled_feat_t = self.RCNN_roi_crop(conv5_t, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat_c = F.max_pool2d(pooled_feat_c, 2, 2) pooled_feat_t = F.max_pool2d(pooled_feat_t, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat_c = self.RCNN_roi_align(conv5_c, ctx_rois.view(-1, 5)) pooled_feat_t = self.RCNN_roi_align(conv5_t, ctx_rois_jittered.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat_c = self.RCNN_roi_pool(conv5_c, ctx_rois.view(-1,5)) pooled_feat_t = self.RCNN_roi_pool(conv5_t, ctx_rois_jittered.view(-1,5)) pooled_feat_res = pooled_feat_t - pooled_feat_c # feed pooled features to top model pooled_feat_res = self._head_to_tail_align(pooled_feat_res) bbox_align_pred = self.RCNN_bbox_align_pred(pooled_feat_res) RCNN_loss_bbox_align = 0 # Apply bounding-box regression deltas box_deltas = bbox_align_pred.data box_deltas_zeros = torch.zeros(box_deltas.shape).cuda() box_deltas = torch.cat((box_deltas, box_deltas_zeros), 1) # Optionally normalize targets by a precomputed mean and stdev # The roi alignment process is 
class_agnostic box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() box_deltas = box_deltas.view(batch_size, -1, 4) rois_sens = rois_jittered.new(rois_jittered.size()).zero_() rois_sens[:,:,1:5] = bbox_transform_inv(rois_jittered[:,:,1:5], box_deltas, batch_size) clip_boxes(rois_sens[:,:,1:5], im_info, batch_size) rois = Variable(rois) rois_sens = Variable(rois_sens) if cfg.POOLING_MODE == 'crop': grid_xy = _affine_grid_gen(rois.view(-1, 5), conv5_c.size()[2:], self.grid_size) grid_yx = torch.stack([grid_xy.data[:,:,:,1], grid_xy.data[:,:,:,0]], 3).contiguous() pooled_feat_c = self.RCNN_roi_crop(conv5_c, Variable(grid_yx).detach()) grid_xy = _affine_grid_gen(rois_sens.view(-1, 5), conv5_t.size()[2:], self.grid_size) grid_yx = torch.stack([grid_xy.data[:,:,:,1], grid_xy.data[:,:,:,0]], 3).contiguous() pooled_feat_t = self.RCNN_roi_crop(conv5_t, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat_c = F.max_pool2d(pooled_feat_c, 2, 2) pooled_feat_t = F.max_pool2d(pooled_feat_t, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat_c = self.RCNN_roi_align(conv5_c, rois.view(-1, 5)) pooled_feat_t = self.RCNN_roi_align(conv5_t, rois_sens.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat_c = self.RCNN_roi_pool(conv5_c, rois.view(-1, 5)) pooled_feat_t = self.RCNN_roi_pool(conv5_t, rois_sens.view(-1, 5)) cls_score_ref = self.confidence_ref(self.RCNN_top_ref(pooled_feat_c.view(pooled_feat_c.size(0), -1))) cls_score_sens = self.confidence_sens(self.RCNN_top_sens(pooled_feat_t.view(pooled_feat_t.size(0), -1))) cls_prob_ref = F.softmax(cls_score_ref, 1) cls_prob_sens = F.softmax(cls_score_sens, 1) confidence_ref = torch.abs(cls_prob_ref[:,1]-cls_prob_ref[:,0]) confidence_sens = torch.abs(cls_prob_sens[:,1]-cls_prob_sens[:,0]) confidence_ref = confidence_ref.unsqueeze(1).unsqueeze(2).unsqueeze(3) confidence_sens = confidence_sens.unsqueeze(1).unsqueeze(2).unsqueeze(3) pooled_feat_c = confidence_ref * pooled_feat_c pooled_feat_t = confidence_sens * pooled_feat_t pooled_feat = pooled_feat_c + pooled_feat_t # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred = bbox_pred_select.squeeze(1) # compute object classification probability cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = F.softmax(cls_score, 1) RCNN_loss_cls = 0 RCNN_loss_cls_ref = 0 RCNN_loss_cls_sens = 0 RCNN_loss_bbox = 0 if self.training: # classification loss RCNN_loss_cls = F.cross_entropy(cls_score, rois_label) RCNN_loss_cls_ref = F.cross_entropy(cls_score_ref, rois_label) RCNN_loss_cls_sens = F.cross_entropy(cls_score_sens, rois_label) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) RCNN_loss_bbox_align = _smooth_l1_loss(bbox_align_pred, rois_align_target[:,:2], rois_inside_ws[:,:2], rois_outside_ws[:,:2]) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) return rois, rois_sens, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, 
RCNN_loss_cls_ref, RCNN_loss_cls_sens, RCNN_loss_bbox, RCNN_loss_bbox_align, rois_label
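# --- Sketch of the confidence-weighted fusion above: each modality's per-ROI feature
# map is scaled by |p(fg) - p(bg)| from its own small classifier, then the two
# streams are summed before the shared head. Shapes follow the code above.
import torch
import torch.nn.functional as F

def confidence_fuse(feat_c, feat_t, logits_c, logits_t):
    p_c, p_t = F.softmax(logits_c, 1), F.softmax(logits_t, 1)
    w_c = (p_c[:, 1] - p_c[:, 0]).abs().view(-1, 1, 1, 1)
    w_t = (p_t[:, 1] - p_t[:, 0]).abs().view(-1, 1, 1, 1)
    return w_c * feat_c + w_t * feat_t

fused = confidence_fuse(torch.randn(4, 8, 7, 7), torch.randn(4, 8, 7, 7),
                        torch.randn(4, 2), torch.randn(4, 2))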
def forward(self, im_data, im_info, gt_boxes, num_boxes): batch_size = im_data.size(0) im_info = im_info.data gt_boxes = gt_boxes.data num_boxes = num_boxes.data # feed image data to base model to obtain base feature map base_feat = self.RCNN_base(im_data) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes) # if it is the training phase, then use ground truth bboxes for refining if self.training: roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) else: rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 rois = Variable(rois) # update 20191026: get the index of nodes in graph for rois (default: batch_size = 1) # if we want to change batch_size, we should consider changing roi2gt_assignment[0], # roi_part_match[0], roi_part_match_overlap[0] and so on if self.training: part_threshold = 0.5 # the shape of rois is (1, 300, 5); there are not always 300 proposals left after nms, so the tail of rois is all zeros rois_none_idx = 300 for i in range(rois.shape[1]): if rois[:, i, :].sum() <= 0: rois_none_idx = i break # first, calculate the overlaps among rois and gt, get the max roi for each gt (node_cls) overlaps = bbox_overlaps_batch(rois[:, :rois_none_idx, :], rois[:, :rois_none_idx, :])[0] N_node, _ = overlaps.shape overlaps_bin = overlaps.cpu().data.numpy().copy() for j in range(N_node): for k in range(N_node): if overlaps_bin[j][k] >= part_threshold: overlaps_bin[j][k] = 1 else: overlaps_bin[j][k] = 0 if k == j: overlaps_bin[j][k] = 0 idx_subgraph, vertex_subgraph = subgraph_split(overlaps_bin) # do roi pooling based on predicted rois if cfg.POOLING_MODE == 'crop': # pdb.set_trace() # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5)) grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size) grid_yx = torch.stack( [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous() pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach()) if cfg.CROP_RESIZE_WITH_MAX_POOL: pooled_feat = F.max_pool2d(pooled_feat, 2, 2) elif cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5)) # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # update 20191105: build graph for rois based on index (default: batch_size = 1) roi_all_idx_list = [] roi_cls_idx_list = [] roi_part_idx_list = [] adj_jud = np.zeros((0)) adj_rois = torch.zeros(0).cuda().long() for k in range(idx_subgraph): idx_k = np.transpose(np.argwhere(vertex_subgraph == k))[0] roi_all_idx_list.append(idx_k) overlaps = overlaps.cpu().data.numpy() # pick the node with the largest degree as node_cls for i in range(len(roi_all_idx_list)): rois_idx = roi_all_idx_list[i] # only consider a rois_select with at least 5 members; such a rois_select is probably an object if rois_idx.shape[0] < 5: continue overlaps_once = overlaps[rois_idx][:, rois_idx] overlaps_once_bin = overlaps_bin[rois_idx][:, rois_idx] N_node_once, _ = overlaps_once.shape ########## update 20191104: select IoU > 
threshold # for j in range(N_node_once): # for k in range(N_node_once): # if overlaps_once[j][k] >= part_threshold: # overlaps_once[j][k] = 1 # else: # overlaps_once[j][k] = 0 # if k == j: # overlaps_once[j][k] = 0 # overlaps_once = np.sum(overlaps_once, axis=1) # # rois_once_max_idx = np.argmax(overlaps_once) # roi_cls_idx_list.append(rois_idx[rois_once_max_idx]) # # roi_part_tmp = [] # for k in range(rois_idx.shape[0]): # if overlaps[rois_idx[rois_once_max_idx]][k] == 0: # continue # roi_part_tmp.append(rois_idx[k]) # roi_part_tmp = torch.from_numpy(np.array(roi_part_tmp)) # roi_part_idx_list.append(roi_part_tmp) ########## update 20191107: all proposal overlaps_once_bin = np.sum(overlaps_once_bin, axis=1) rois_once_max_idx = np.argmax(overlaps_once_bin) roi_cls_idx_list.append(rois_idx[rois_once_max_idx]) roi_part_tmp = [] roi_iou = overlaps_once[rois_once_max_idx] roi_part_num_threshold = 10 if roi_iou.shape[0] >= roi_part_num_threshold: roi_order = np.argsort(roi_iou)[::-1] for ii in range(roi_part_num_threshold): roi_part_tmp.append(rois_idx[roi_order[ii]]) else: for k in range(rois_idx.shape[0]): if overlaps[rois_idx[rois_once_max_idx]][k] == 0: continue roi_part_tmp.append(rois_idx[k]) roi_part_tmp = torch.from_numpy(np.array(roi_part_tmp)) roi_part_idx_list.append(roi_part_tmp) roi_cls_idx_list = torch.from_numpy(np.array(roi_cls_idx_list)).cuda() for i in range(roi_cls_idx_list.shape[0]): adj_jud = np.concatenate((adj_jud, [1])) adj_rois = torch.cat((adj_rois, roi_cls_idx_list[i:i + 1])) try: if roi_part_idx_list[i].shape[0] != 0: adj_jud = np.concatenate( (adj_jud, np.zeros((roi_part_idx_list[i].shape[0])))) adj_rois = torch.cat( (adj_rois, roi_part_idx_list[i].cuda())) except IndexError: print('IndexError happen, continue') continue node_cls_idx = np.transpose(np.argwhere(adj_jud == 1))[0] adj_matrix_bin = np.zeros((len(adj_jud), len(adj_jud))) # link edges for node_cls to node_cls for k in range(len(node_cls_idx) - 1): idx_node_cls_1 = node_cls_idx[k] idx_node_cls_2 = node_cls_idx[k + 1] adj_matrix_bin[idx_node_cls_1, idx_node_cls_2] = 1 adj_matrix_bin[idx_node_cls_2, idx_node_cls_1] = 1 # link edges for node_cls to related node_part for k in range(len(node_cls_idx) - 1): idx_start = node_cls_idx[k] idx_end = node_cls_idx[k + 1] for s in range(idx_start, idx_end): for t in range(idx_start, idx_end): if s == t: adj_matrix_bin[s, t] = 0 else: adj_matrix_bin[s, t] = 1 cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6) adj_matrix = np.zeros((len(adj_jud), len(adj_jud))) for s in range(len(adj_jud)): for t in range(len(adj_jud)): if adj_matrix_bin[s, t] == 1: node_feat_s = pooled_feat[adj_rois[s], :] node_feat_t = pooled_feat[adj_rois[t], :] adj_matrix[s, t] = cos(node_feat_s, node_feat_t) else: adj_matrix[s, t] = 0 adj_matrix = torch.from_numpy(adj_matrix).float().cuda() try: pooled_feat[adj_rois, :] = F.relu( self.gcn1(pooled_feat[adj_rois, :], adj_matrix)) pooled_feat[adj_rois, :] = F.relu( self.gcn2(pooled_feat[adj_rois, :], adj_matrix)) except RuntimeError: print(pooled_feat[adj_rois, :].size()) print(adj_matrix.size()) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) if self.training and not self.class_agnostic: # select the corresponding columns according to roi labels bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4) bbox_pred_select = torch.gather( bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4)) bbox_pred = bbox_pred_select.squeeze(1) # compute object classification probability 
cls_score = self.RCNN_cls_score(pooled_feat) cls_prob = F.softmax(cls_score, 1) RCNN_loss_cls = 0 RCNN_loss_bbox = 0 if self.training: # classification loss RCNN_loss_cls = F.cross_entropy(cls_score, rois_label) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) # update 2019-6-17:fix the bug for dimension specified as 0... if self.training: rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0) rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0) RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0) RCNN_loss_bbox = torch.unsqueeze(RCNN_loss_bbox, 0) return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
def forward(self, feat, gt_boxes, num_boxes, stage): # feat: acoustic features (we use STFT) [batch_size, seq_len, feat_dim], default [8, 1000, 257] # gt_boxes: ground truth speech segments, the last dimension is (start_frame, end_frame, speaker index) # [batch_size, padded_len, 3], default [8, 20, 3] # num_boxes: number of speech segments in each audio [batch_size], default [8] # stage: specify the stage (can be train, dev or test) batch_size, seq_len, feat_dim = feat.size(0), feat.size(1), feat.size(2) feat = torch.unsqueeze(feat, 1) feat = torch.transpose(feat, 2, 3) im_info = torch.from_numpy(np.array([[feat_dim, seq_len]])) im_info = im_info.expand(batch_size, im_info.size(1)) gt_boxes = gt_boxes.data num_boxes = num_boxes.data # feed the input features to the base model to obtain the base feature map # base_feat: deep features after backbone (ResNet101) # [batch_size, num_channels, h, w], default [8, 1024, 16, 63] base_feat = self.RCNN_base(feat) # feed base feature map to RPN to obtain rois rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn( base_feat, im_info, gt_boxes, num_boxes, stage) # rois: regions of interest (ROIs), the selected speech segments # The last dimension is (batch_idx, start_t, end_t) # [batch_size, number of rois, 3] default: [8, 100, 3] # if it is the training phase, then use ground truth bboxes for refining if stage == "train" or stage == "dev": roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data # rois: selected ROIs to compute loss, the last dimension is (batch_idx, start_t, end_t) # [batch_size, number of rois, 3], default [8, 64, 3] rois_label = Variable(rois_label.view(-1).long()) rois_target = Variable(rois_target.view(-1, rois_target.size(2))) rois_inside_ws = Variable( rois_inside_ws.view(-1, rois_inside_ws.size(2))) rois_outside_ws = Variable( rois_outside_ws.view(-1, rois_outside_ws.size(2))) elif stage == "test": rois_label = None rois_target = None rois_inside_ws = None rois_outside_ws = None rpn_loss_cls = 0 rpn_loss_bbox = 0 else: raise ValueError("Condition not defined.") rois = Variable(rois) # do roi pooling based on predicted rois rois_tmp = rois.new(rois.size(0), rois.size(1), 5).zero_() rois_tmp[:, :, np.array([0, 1, 3]).astype(int)] = rois rois_tmp[:, :, 4] = feat_dim - 1 # default is 'align' if cfg.POOLING_MODE == 'align': pooled_feat = self.RCNN_roi_align(base_feat, rois_tmp.view(-1, 5)) elif cfg.POOLING_MODE == 'pool': pooled_feat = self.RCNN_roi_pool(base_feat, rois_tmp.view(-1, 5)) else: raise ValueError("Pooling mode not supported.") # pooled_feat: the pooled feature for speech segments # [batch_size * number of rois, number of channels, 7, 7], default [512, 1024, 7, 7] # feed pooled features to top model pooled_feat = self._head_to_tail(pooled_feat) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(pooled_feat) # compute object classification probability bg_cls_score = self.RCNN_bg_cls_score(pooled_feat) bg_cls_prob = F.softmax(bg_cls_score, 1) seg_embed = self.RCNN_embed(pooled_feat) cls_score = self.RCNN_cls_score(F.relu(seg_embed)) cls_prob = F.softmax(cls_score, 1) RCNN_loss_bg_cls = 0 RCNN_loss_cls = 0 RCNN_loss_bbox = 0 RCNN_loss_cls_spk = 0 if stage == "train" or stage == "dev": # RCNN_loss_cls is the loss to classify fg/bg rois_bg_label = (rois_label > 0).long() RCNN_loss_cls = F.cross_entropy(bg_cls_score, rois_bg_label) cls_score_nonzero, rois_label_nonzero = cls_score[ rois_label != 0, :], rois_label[rois_label != 0] # RCNN_loss_cls_spk is the loss to
classify different speakers RCNN_loss_cls_spk = F.cross_entropy(cls_score_nonzero, rois_label_nonzero) # bounding box regression L1 loss RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) bg_cls_prob = bg_cls_prob.view(batch_size, rois.size(1), -1) return rois, bg_cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_cls_spk, RCNN_loss_bbox, rois_label, seg_embed
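# A minimal standalone sketch of the rois_tmp construction above: the 1-D
# (batch_idx, start_t, end_t) segments are lifted to 5-column 2-D boxes for
# RoIAlign, with the time span as the x-extent and the full frequency range
# [0, feat_dim - 1] as the y-extent. segments_to_boxes is a hypothetical
# helper name, not from the source.
import torch

def segments_to_boxes(rois, feat_dim):
    # rois: [batch_size, num_rois, 3] holding (batch_idx, start_t, end_t)
    boxes = rois.new_zeros(rois.size(0), rois.size(1), 5)
    boxes[:, :, 0] = rois[:, :, 0]   # batch index
    boxes[:, :, 1] = rois[:, :, 1]   # x1 = start frame
    boxes[:, :, 2] = 0               # y1 = lowest frequency bin
    boxes[:, :, 3] = rois[:, :, 2]   # x2 = end frame
    boxes[:, :, 4] = feat_dim - 1    # y2 = highest frequency bin
    return boxes

# e.g. one segment from frame 12 to 57 over a 257-bin spectrogram:
print(segments_to_boxes(torch.tensor([[[0., 12., 57.]]]), 257))
# tensor([[[  0.,  12.,   0.,  57., 256.]]])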
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    # base_feat size: [nBatch, nChan, H, W], usually nChan = 1024
    batch_size = base_feat.size(0)

    if self.K > 1:
        assert batch_size == self.K
        # stack channels from all images, making nBatch = 1
        bs, nc, h, w = base_feat.shape
        base_feat = base_feat.view(1, bs * nc, h, w)

    # return feature map after convrelu layer
    rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True)  # size: [nBatch, nChan, H, W], usually nChan = 512
    # get rpn classification score
    rpn_cls_score = self.RPN_cls_score(rpn_conv1)  # size: [nBatch, 2 x nAnchors, H, W]

    if self.K > 1:
        # we predict the same anchor score for all stacked images, so we duplicate the score, taking nBatch back to K
        rpn_cls_score = rpn_cls_score.repeat(self.K, 1, 1, 1)

    # reshape to perform softmax on bg/fg by sending bg/fg to dim=1
    rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)  # size [nBatch, 2, nAnchors x H, W]
    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)  # same size as above
    rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)  # back to [nBatch, 2 x nAnchors, H, W]

    # get rpn offsets to the anchor boxes
    rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)  # size [nBatch, 4 x nAnchors, H, W]

    if self.K > 1:
        # get the deviation for each stacked image, taking nBatch back to K
        rpn_bbox_pred = rpn_bbox_pred.view(self.K, self.nc_bbox_out // self.K, h, w)

    # proposal layer
    cfg_key = 'TRAIN' if self.training else 'TEST'
    # rois size: [nBatch, numTopProps, 1+4] (last dim: batch_id + 4 coords.), usually numTopProps = 2000
    # NOTE for K > 1: ensure that if filtering, NMS, sorting, etc. in RPN_proposal select a proposal
    # coming from a given anchor in one image, it is also selected in the other images.
    rois = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key))

    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0

    # generating training labels and building the rpn loss
    if self.training:
        assert gt_boxes is not None
        # gt_boxes size: [nBatch, maxGT, 4+class], usually maxGT = 20
        rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes, im_info, num_boxes))

        # compute classification loss
        rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
        rpn_label = rpn_data[0].view(batch_size, -1)  # size [nBatch, nAnchors x H x W]

        if self.K > 1:
            for k in range(self.K - 1):
                assert not rpn_label[0].ne(rpn_label[k]).any()

        # get the indices we keep for classification: rpn_keep size [nBatch x rpnBatch] (this is a vector)
        rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
        # select the proposals we keep
        rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep)  # size [nBatch x rpnBatch, 2]
        rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
        rpn_label = Variable(rpn_label.long())  # size [nBatch x rpnBatch]
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
        fg_cnt = torch.sum(rpn_label.data.ne(0))

        # targets and associated weights have sizes: [nBatch, 4 x nAnchors, H, W]
        rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]

        # compute bbox regression loss
        rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights)
        rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights)
        rpn_bbox_targets = Variable(rpn_bbox_targets)

        self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights, sigma=3, dim=[1, 2, 3])

    return rois, self.rpn_loss_cls, self.rpn_loss_box
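# A minimal sketch (assumed, not from the source) of the reshape trick used by
# self.reshape above: the [N, 2*A, H, W] scores are folded so that dim=1 has
# size 2, softmax runs over bg/fg, and the result is unfolded back to
# [N, 2*A, H, W]. reshape_scores is a hypothetical stand-in for self.reshape.
import torch
import torch.nn.functional as F

def reshape_scores(x, d):
    # [N, C, H, W] -> [N, d, (C * H) // d, W]
    n, c, h, w = x.shape
    return x.view(n, d, (c * h) // d, w)

A = 9  # anchors per location
scores = torch.randn(1, 2 * A, 16, 16)
prob = reshape_scores(F.softmax(reshape_scores(scores, 2), dim=1), 2 * A)
# channels i and i + A now hold bg/fg probabilities that sum to one
assert torch.allclose(prob[:, :A] + prob[:, A:], torch.ones(1, A, 16, 16))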
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    # Bottom-up
    c1 = self.RCNN_layer0(im_data)
    c2 = self.RCNN_layer1(c1)
    c3 = self.RCNN_layer2(c2)
    c4 = self.RCNN_layer3(c3)
    c5 = self.RCNN_layer4(c4)
    # Top-down
    p5 = self.RCNN_toplayer(c5)  # 1x1 convolution on c5 gives the M5 feature map
    p4 = self._upsample_add(p5, self.RCNN_latlayer1(c4))
    p4 = self.RCNN_smooth1(p4)
    p3 = self._upsample_add(p4, self.RCNN_latlayer2(c3))
    p3 = self.RCNN_smooth2(p3)
    p2 = self._upsample_add(p3, self.RCNN_latlayer3(c2))
    p2 = self.RCNN_smooth3(p2)
    p6 = self.maxpool2d(p5)  # at this point all of the pyramid feature maps (p2-p6) have been obtained

    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]

    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(rpn_feature_maps, im_info, gt_boxes, num_boxes)

    # if it is the training phase, use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        ## NOTE: additionally, normalize proposals to range [0, 1],
        # this is necessary so that the following roi pooling
        # is correct on different feature maps
        # rois[:, :, 1::2] /= im_info[0][1]
        # rois[:, :, 2::2] /= im_info[0][0]

        rois = rois.view(-1, 5)
        rois_label = rois_label.view(-1).long()
        gt_assign = gt_assign.view(-1).long()
        pos_id = rois_label.nonzero().squeeze()
        gt_assign_pos = gt_assign[pos_id]
        rois_label_pos = rois_label[pos_id]
        rois_label_pos_ids = pos_id

        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
        rois_label = Variable(rois_label)

        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        ## NOTE: additionally, normalize proposals to range [0, 1],
        # this is necessary so that the following roi pooling
        # is correct on different feature maps
        # rois[:, :, 1::2] /= im_info[0][1]
        # rois[:, :, 2::2] /= im_info[0][0]

        rois_label = None
        gt_assign = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
        rois = rois.view(-1, 5)
        pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)

    # pooling features based on rois, output a 14x14 map
    roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(roi_pool_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # loss (cross entropy) for object classification
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # loss (l1-norm) for bounding box regression
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    rois = rois.view(batch_size, -1, rois.size(1))
    cls_prob = cls_prob.view(batch_size, -1, cls_prob.size(1))
    bbox_pred = bbox_pred.view(batch_size, -1, bbox_pred.size(1))

    if self.training:
        rois_label = rois_label.view(batch_size, -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
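# A minimal sketch (assumed, not from the source) of the top-down merge step
# behind self._upsample_add: the coarser map is upsampled to the lateral map's
# spatial size and the two are summed element-wise. upsample_add is a
# hypothetical stand-in for the model's helper.
import torch
import torch.nn.functional as F

def upsample_add(top, lateral):
    # top: [N, C, H, W] coarser level; lateral: [N, C, 2H, 2W] 1x1-conv'd bottom-up map
    _, _, h, w = lateral.shape
    return F.interpolate(top, size=(h, w), mode='bilinear', align_corners=False) + lateral

p5 = torch.randn(1, 256, 8, 8)
lat4 = torch.randn(1, 256, 16, 16)
p4 = upsample_add(p5, lat4)  # [1, 256, 16, 16]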
def forward(self, im_data, im_info, gt_boxes, num_boxes, target=False, eta=1.0, is_sup=False):
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat1 = self.RCNN_base1(im_data)
    if self.lc:
        d_pixel, _ = self.netD_pixel(grad_reverse(base_feat1, lambd=eta))
        if not target:
            _, feat_pixel = self.netD_pixel(base_feat1.detach())
    else:
        d_pixel = self.netD_pixel(grad_reverse(base_feat1, lambd=eta))
    base_feat = self.RCNN_base2(base_feat1)
    if self.gc:
        domain_p, _ = self.netD(grad_reverse(base_feat, lambd=eta))
        if target:
            return d_pixel, domain_p  # , diff
        _, feat = self.netD(base_feat.detach())
    else:
        domain_p = self.netD(grad_reverse(base_feat, lambd=eta))
        if target:
            return d_pixel, domain_p  # , diff

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox, mask_batch = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes, is_sup)

    # if it is the training phase, use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)
    # do roi pooling based on predicted rois
    # if cfg.POOLING_MODE == 'align':
    #     pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    # elif cfg.POOLING_MODE == 'pool':
    #     pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))
    pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)
    if self.lc:
        feat_pixel = feat_pixel.view(1, -1).repeat(pooled_feat.size(0), 1)
        pooled_feat = torch.cat((feat_pixel, pooled_feat), 1)
    if self.gc:
        feat = feat.view(1, -1).repeat(pooled_feat.size(0), 1)
        pooled_feat = torch.cat((feat, pooled_feat), 1)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label, d_pixel, domain_p  # , diff
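# A minimal sketch (assumed, not from the source) of the gradient reversal layer
# behind grad_reverse: the forward pass is the identity, while the backward pass
# multiplies the gradient by -lambd, so the backbone is trained adversarially
# against the domain discriminators netD / netD_pixel.
import torch

class GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # identity forward, reversed and scaled gradient backward
        return -ctx.lambd * grad_output, None

def grad_reverse(x, lambd=1.0):
    return GradReverse.apply(x, lambd)

x = torch.randn(2, 3, requires_grad=True)
grad_reverse(x, lambd=0.5).sum().backward()
print(x.grad)  # every entry is -0.5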
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    # Bottom-up
    c1 = self.RCNN_layer0(im_data)
    c2 = self.RCNN_layer1(c1)
    c3 = self.RCNN_layer2(c2)
    c4 = self.RCNN_layer3(c3)
    c5 = self.RCNN_layer4(c4)
    c6 = self.RCNN_layer5(c5)
    # Top-down
    p6 = self.RCNN_toplayer(c6)
    p5 = self.RCNN_latlayer1(c5) + p6
    p4 = self.RCNN_latlayer2(c4) + p5
    p3 = self._upsample_add(p4, self.RCNN_latlayer3(c3))
    p3 = self.RCNN_smooth1(p3)
    p2 = self._upsample_add(p3, self.RCNN_latlayer4(c2))
    p2 = self.RCNN_smooth2(p2)

    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]

    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(rpn_feature_maps, im_info, gt_boxes, num_boxes)

    # if it is the training phase, use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        ## NOTE: additionally, normalize proposals to range [0, 1],
        # this is necessary so that the following roi pooling
        # is correct on different feature maps
        # rois[:, :, 1::2] /= im_info[0][1]
        # rois[:, :, 2::2] /= im_info[0][0]

        rois = rois.view(-1, 5)
        rois_label = rois_label.view(-1).long()
        gt_assign = gt_assign.view(-1).long()
        pos_id = rois_label.nonzero().squeeze()
        gt_assign_pos = gt_assign[pos_id]
        rois_label_pos = rois_label[pos_id]
        rois_label_pos_ids = pos_id

        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
        rois_label = Variable(rois_label)

        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        ## NOTE: additionally, normalize proposals to range [0, 1],
        # this is necessary so that the following roi pooling
        # is correct on different feature maps
        # rois[:, :, 1::2] /= im_info[0][1]
        # rois[:, :, 2::2] /= im_info[0][0]

        rois_label = None
        gt_assign = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
        rois = rois.view(-1, 5)
        pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)

    # print('before pooling, cfg', cfg.POOLING_MODE)
    # print('before pooling, get_cfg', get_cfg().POOLING_MODE)

    # pooling features based on rois, output a 14x14 map
    roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(roi_pool_feat)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # loss (cross entropy) for object classification
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # loss (l1-norm) for bounding box regression
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    rois = rois.view(batch_size, -1, rois.size(1))
    cls_prob = cls_prob.view(batch_size, -1, cls_prob.size(1))
    bbox_pred = bbox_pred.view(batch_size, -1, bbox_pred.size(1))

    if self.training:
        rois_label = rois_label.view(batch_size, -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
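# A minimal sketch (assumed, not from the source) of how a _PyramidRoI_Feat-style
# helper typically assigns each RoI to a pyramid level, following the FPN paper's
# rule k = floor(k0 + log2(sqrt(w * h) / 224)), clamped to the available levels.
# assign_pyramid_level is a hypothetical helper name.
import torch

def assign_pyramid_level(rois, k0=4, k_min=2, k_max=5):
    # rois: [N, 5] as (batch_idx, x1, y1, x2, y2)
    w = rois[:, 3] - rois[:, 1] + 1
    h = rois[:, 4] - rois[:, 2] + 1
    k = torch.floor(k0 + torch.log2(torch.sqrt(w * h) / 224))
    return k.clamp(min=k_min, max=k_max).long()

rois = torch.tensor([[0., 0., 0., 63., 63.], [0., 0., 0., 447., 447.]])
print(assign_pyramid_level(rois))  # small box -> level 2, large box -> level 5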
def forward(self, im_data, im_info, gt_boxes, num_boxes, target=False, eta=1.0):
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # feed image data to base model to obtain base feature map
    base_feat1 = self.RCNN_base1(im_data)
    if self.lc:
        d_pixel, _ = self.netD_pixel(grad_reverse(base_feat1, lambd=eta))
        # print(d_pixel.mean())
        if not target:
            _, feat_pixel = self.netD_pixel(base_feat1.detach())
    else:
        d_pixel = self.netD_pixel(grad_reverse(base_feat1, lambd=eta))
    base_feat = self.RCNN_base2(base_feat1)
    if self.gc:
        domain_p, _ = self.netD(grad_reverse(base_feat, lambd=eta))
        if target:
            return d_pixel, domain_p  # , diff
        _, feat = self.netD(base_feat.detach())
    else:
        domain_p = self.netD(grad_reverse(base_feat, lambd=eta))
        if target:
            return d_pixel, domain_p  # , diff

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)

    # if it is the training phase, use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)
    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)
    # feat_pixel = torch.zeros(feat_pixel.size()).cuda()
    if self.lc:
        feat_pixel = feat_pixel.view(1, -1).repeat(pooled_feat.size(0), 1)
        pooled_feat = torch.cat((feat_pixel, pooled_feat), 1)
    if self.gc:
        feat = feat.view(1, -1).repeat(pooled_feat.size(0), 1)
        pooled_feat = torch.cat((feat, pooled_feat), 1)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label, d_pixel, domain_p  # , diff
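# A minimal standalone sketch of the torch.gather step that recurs in these
# forward passes: from the [N, num_classes * 4] regression outputs, keep only
# the 4 offsets belonging to each RoI's ground-truth class. The toy sizes are
# illustrative, not from the source.
import torch

n_rois, n_classes = 3, 5
bbox_pred = torch.arange(n_rois * n_classes * 4, dtype=torch.float32).view(n_rois, n_classes * 4)
rois_label = torch.tensor([1, 0, 4])

view = bbox_pred.view(n_rois, n_classes, 4)
idx = rois_label.view(n_rois, 1, 1).expand(n_rois, 1, 4)
selected = torch.gather(view, 1, idx).squeeze(1)  # [N, 4], one 4-vector per RoI
print(selected)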
def forward(self, im_data, im_info, gt_boxes, num_boxes, target=False, eta=1.0):
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # build the multi-label class vector from the ground truth
    if self.training and target:
        cls_label_ind = torch.unique(gt_boxes[:, :, 4].cpu())
        cls_label = torch.zeros(self.n_classes)
        cls_label[cls_label_ind.long()] = 1
        # assume the background category is always present
        cls_label[0] = 1
        cls_label = cls_label.cuda()
        cls_label.requires_grad = False

    # feed image data to base model to obtain base feature map
    base_feat1 = self.RCNN_base1(im_data)
    if self.lc:
        d_pixel, _ = self.netD_pixel_1(grad_reverse(base_feat1, lambd=eta))
        # print(d_pixel)
        if not target:
            _, feat_pixel = self.netD_pixel_1(base_feat1.detach())
    else:
        d_pixel = self.netD_pixel_1(grad_reverse(base_feat1, lambd=eta))

    base_feat2 = self.RCNN_base2(base_feat1)
    if self.lc:
        d_pixel_2, _ = self.netD_pixel_2(grad_reverse(base_feat2, lambd=eta))
    else:
        d_pixel_2 = self.netD_pixel_2(grad_reverse(base_feat2, lambd=eta))

    base_feat3 = self.RCNN_base3(base_feat2)
    if self.lc:
        d_pixel_3, _ = self.netD_pixel_3(grad_reverse(base_feat3, lambd=eta))
    else:
        d_pixel_3 = self.netD_pixel_3(grad_reverse(base_feat3, lambd=eta))
        # print(d_pixel_3.mean())

    base_feat4 = self.RCNN_base4(base_feat3)
    if self.gc:
        d_pixel_4, _ = self.netD_1(grad_reverse(base_feat4, lambd=eta))
    else:
        d_pixel_4 = self.netD_1(grad_reverse(base_feat4, lambd=eta))

    # something wrong
    base_feat = self.RCNN_base5(base_feat4)
    # for target domain training, we only need to return the domain outputs
    if self.gc:
        domain_p, _ = self.netD(grad_reverse(base_feat, lambd=eta))
        if target:
            return d_pixel, d_pixel_2, d_pixel_3, d_pixel_4, domain_p
        _, feat = self.netD(base_feat.detach())
    else:
        domain_p = self.netD(grad_reverse(base_feat, lambd=eta))
        if target:
            return d_pixel, d_pixel_2, d_pixel_3, d_pixel_4, domain_p

    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)

    # if it is the training phase, use ground truth bboxes for refining
    if self.training and not target:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)
    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)
    # feat_pixel = torch.zeros(feat_pixel.size()).cuda()
    if self.lc:
        feat_pixel = feat_pixel.view(1, -1).repeat(pooled_feat.size(0), 1)
        pooled_feat = torch.cat((feat_pixel, pooled_feat), 1)
    if self.gc:
        feat = feat.view(1, -1).repeat(pooled_feat.size(0), 1)
        pooled_feat = torch.cat((feat, pooled_feat), 1)

    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic and not target:
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    # compute the weakly-supervised image-level score (disabled)
    if False:
        # cls_prob_sum = torch.sum(cls_prob, 0)  # x = max(1, x)
        # cls_prob_sum = cls_prob_sum.repeat(2, 1)
        # cls_prob_sum = torch.min(cls_prob_sum, 0)[0]
        max_roi_cls_prob = torch.max(cls_prob, 0)[0]
        # assert (max_roi_cls_prob.data.cpu().numpy().all() >= 0. and max_roi_cls_prob.data.cpu().numpy().all() <= 1.)
        if not (max_roi_cls_prob.data.cpu().numpy().all() >= 0. and max_roi_cls_prob.data.cpu().numpy().all() <= 1.):
            pdb.set_trace()
        if not (cls_label.data.cpu().numpy().all() >= 0. and cls_label.data.cpu().numpy().all() <= 1.):
            pdb.set_trace()
        BCE_loss = F.binary_cross_entropy(max_roi_cls_prob, cls_label)
        return d_pixel, domain_p, BCE_loss

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    # for weakly-supervised detection, aggregate the cls_score and calculate the loss
    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    # return d_pixel, d_pixel_2, d_pixel_3, d_pixel_4, domain_p
    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label, d_pixel, d_pixel_2, d_pixel_3, d_pixel_4, domain_p  # , diff
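# A simplified sketch (assumed, not from the source) of the weighted smooth-L1
# loss used throughout these models. The real _smooth_l1_loss also takes sigma
# and dim arguments and reduces per-box before averaging; here the idea is only
# that inside weights mask which coordinates are regressed at all, and outside
# weights rescale each term before reduction.
import torch

def smooth_l1_loss(pred, target, inside_w, outside_w, sigma=1.0):
    sigma2 = sigma ** 2
    diff = inside_w * (pred - target)
    flag = (diff.abs() < 1.0 / sigma2).float()
    per_elem = flag * 0.5 * sigma2 * diff ** 2 + (1 - flag) * (diff.abs() - 0.5 / sigma2)
    return (outside_w * per_elem).mean()

pred = torch.tensor([[0.5, 2.0, 0.0, 0.0]])
target = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
inside_w = torch.tensor([[1.0, 1.0, 0.0, 0.0]])  # only the first two coords count
outside_w = torch.ones(1, 4)
print(smooth_l1_loss(pred, target, inside_w, outside_w))  # 0.40625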
def forward(self, im_data, gt):
    batch_size = im_data.size(0)
    gt_boxes = gt['boxes']
    # for the jacquard dataset, the bounding box labels are set to -1. For training, we set
    # them to 1, which does not affect the training process.
    if self.training:
        if gt_boxes[:, :, -1].sum().item() < 0:
            gt_boxes[:, :, -1] = -gt_boxes[:, :, -1]
    gt_grasps = gt['grasps']
    gt_grasp_inds = gt['grasp_inds']
    num_boxes = gt['num_boxes']
    num_grasps = gt['num_grasps']
    im_info = gt['im_info']

    for i in range(batch_size):
        if torch.sum(gt_grasp_inds[i]).item() == 0:
            gt_grasp_inds[i, :num_grasps[i].item()] = 1

    # features
    base_feat = self.base(im_data)

    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)

    # if it is the training phase, use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        rois_label = Variable(rois_label.view(-1).long())
    else:
        rois_label = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    if cfg.MGN.USE_FIXED_SIZE_ROI:
        _rois = rois.view(-1, 5)
        rois_cx = (_rois[:, 1:2] + _rois[:, 3:4]) / 2
        rois_cy = (_rois[:, 2:3] + _rois[:, 4:5]) / 2
        rois_xmin = torch.clamp(rois_cx - 100, min=1, max=600)
        rois_ymin = torch.clamp(rois_cy - 100, min=1, max=600)
        rois_xmax = rois_xmin + 200
        rois_ymax = rois_ymin + 200
        rois_for_grasp = torch.cat([_rois[:, :1], rois_xmin, rois_ymin, rois_xmax, rois_ymax], dim=1)
        if cfg.RCNN_COMMON.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            grid_xy = _affine_grid_gen(rois_for_grasp, base_feat.size()[2:], self.grid_size)
            grid_yx = torch.stack([grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous()
            pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
            if cfg.RCNN_COMMON.CROP_RESIZE_WITH_MAX_POOL:
                pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
        elif cfg.RCNN_COMMON.POOLING_MODE == 'align':
            pooled_feat = self.RCNN_roi_align(base_feat, rois_for_grasp)
        elif cfg.RCNN_COMMON.POOLING_MODE == 'pool':
            pooled_feat = self.RCNN_roi_pool(base_feat, rois_for_grasp)
    else:
        if cfg.RCNN_COMMON.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size)
            grid_yx = torch.stack([grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous()
            pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
            if cfg.RCNN_COMMON.CROP_RESIZE_WITH_MAX_POOL:
                pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
        elif cfg.RCNN_COMMON.POOLING_MODE == 'align':
            pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
        elif cfg.RCNN_COMMON.POOLING_MODE == 'pool':
            pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # feed pooled features to top model
    # grasp top
    if self.training:
        if self._ROIGN_USE_POOLED_FEATS:
            rois_overlaps = bbox_overlaps_batch(rois, gt_boxes)
            # bs x N_{rois}
            _, rois_inds = torch.max(rois_overlaps, dim=2)
            rois_inds += 1
            grasp_rois_mask = rois_label.view(-1) > 0
        else:
            raise NotImplementedError

    if self.training:
        if (grasp_rois_mask > 0).sum().item() > 0:
            grasp_feat = self._ROIGN_head_to_tail(pooled_feat[grasp_rois_mask])
        else:
            # when there are no positive rois:
            grasp_loc = Variable(torch.Tensor([]).type_as(gt_grasps))
            grasp_prob = Variable(torch.Tensor([]).type_as(gt_grasps))
            grasp_bbox_loss = Variable(torch.Tensor([0]).type_as(gt_grasps))
            grasp_cls_loss = Variable(torch.Tensor([0]).type_as(gt_grasps))
            grasp_conf_label = torch.Tensor([-1]).type_as(rois_label)
            grasp_all_anchors = torch.Tensor([]).type_as(gt_grasps)
            return rois, rpn_loss_cls, rpn_loss_bbox, rois_label, \
                grasp_loc, grasp_prob, grasp_bbox_loss, grasp_cls_loss, grasp_conf_label, grasp_all_anchors
    else:
        grasp_feat = self._ROIGN_head_to_tail(pooled_feat)

    grasp_pred = self.ROIGN_classifier(grasp_feat)
    # bs*N x K*A x 5, bs*N x K*A x 2
    grasp_loc, grasp_conf = grasp_pred

    # generate anchors
    # bs*N x K*A x 5
    grasp_all_anchors = self._generate_anchors(grasp_conf.size(1), grasp_conf.size(2), rois)
    # filter out negative samples
    grasp_all_anchors = grasp_all_anchors.type_as(gt_grasps)
    if self.training:
        grasp_all_anchors = grasp_all_anchors[grasp_rois_mask]
        # bs*N x 1 x 1
        rois_w = (rois[:, :, 3] - rois[:, :, 1]).data.view(-1).unsqueeze(1).unsqueeze(2)
        rois_h = (rois[:, :, 4] - rois[:, :, 2]).data.view(-1).unsqueeze(1).unsqueeze(2)
        rois_w = rois_w[grasp_rois_mask]
        rois_h = rois_h[grasp_rois_mask]
        # bs*N x 1 x 1
        fsx = rois_w / grasp_conf.size(1)
        fsy = rois_h / grasp_conf.size(2)
        # bs*N x 1 x 1
        xleft = rois[:, :, 1].data.view(-1).unsqueeze(1).unsqueeze(2)
        ytop = rois[:, :, 2].data.view(-1).unsqueeze(1).unsqueeze(2)
        xleft = xleft[grasp_rois_mask]
        ytop = ytop[grasp_rois_mask]

    # reshape grasp_loc and grasp_conf
    grasp_loc = grasp_loc.contiguous().view(grasp_loc.size(0), -1, 5)
    grasp_conf = grasp_conf.contiguous().view(grasp_conf.size(0), -1, 2)
    grasp_batch_size = grasp_loc.size(0)

    # bs*N x K*A x 2
    grasp_prob = F.softmax(grasp_conf, 2)

    grasp_bbox_loss = 0
    grasp_cls_loss = 0
    grasp_conf_label = None
    if self.training:
        # inside weights indicate which bounding boxes should be regressed
        # outside weights indicate two things:
        # 1. which bounding boxes should contribute to the classification loss,
        # 2. how to balance the cls loss and the bbox loss
        grasp_gt_xywhc = points2labels(gt_grasps)
        # bs*N x N_{Gr_gt} x 5
        grasp_gt_xywhc = self._assign_rois_grasps(grasp_gt_xywhc, gt_grasp_inds, rois_inds)
        # filter out negative samples
        grasp_gt_xywhc = grasp_gt_xywhc[grasp_rois_mask]
        # absolute coords to relative coords
        grasp_gt_xywhc[:, :, 0:1] -= xleft
        grasp_gt_xywhc[:, :, 0:1] = torch.clamp(grasp_gt_xywhc[:, :, 0:1], min=0)
        grasp_gt_xywhc[:, :, 0:1] = torch.min(grasp_gt_xywhc[:, :, 0:1], rois_w)
        grasp_gt_xywhc[:, :, 1:2] -= ytop
        grasp_gt_xywhc[:, :, 1:2] = torch.clamp(grasp_gt_xywhc[:, :, 1:2], min=0)
        grasp_gt_xywhc[:, :, 1:2] = torch.min(grasp_gt_xywhc[:, :, 1:2], rois_h)

        # grasp training data
        grasp_loc_label, grasp_conf_label, grasp_iw, grasp_ow = self.ROIGN_proposal_target(
            grasp_conf, grasp_gt_xywhc, grasp_all_anchors, xthresh=fsx / 2, ythresh=fsy / 2)

        grasp_keep = Variable(grasp_conf_label.view(-1).ne(-1).nonzero().view(-1))
        grasp_conf = torch.index_select(grasp_conf.view(-1, 2), 0, grasp_keep.data)
        grasp_conf_label = torch.index_select(grasp_conf_label.view(-1), 0, grasp_keep.data)
        grasp_cls_loss = F.cross_entropy(grasp_conf, grasp_conf_label)

        grasp_iw = Variable(grasp_iw)
        grasp_ow = Variable(grasp_ow)
        grasp_loc_label = Variable(grasp_loc_label)
        grasp_bbox_loss = _smooth_l1_loss(grasp_loc, grasp_loc_label, grasp_iw, grasp_ow, dim=[2, 1])

    return rois, rpn_loss_cls, rpn_loss_bbox, rois_label, \
        grasp_loc, grasp_prob, grasp_bbox_loss, grasp_cls_loss, grasp_conf_label, grasp_all_anchors
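# A minimal sketch (assumed, not from the source) of the pairwise IoU computation
# behind a bbox_overlaps_batch-style helper, shown here for a single image: each
# RoI is then assigned to the ground-truth box with which it overlaps most, as in
# the torch.max over dim=2 above. bbox_overlaps is a hypothetical helper name.
import torch

def bbox_overlaps(boxes, gt):
    # boxes: [N, 4], gt: [M, 4], both as (x1, y1, x2, y2)
    area_a = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    area_b = (gt[:, 2] - gt[:, 0] + 1) * (gt[:, 3] - gt[:, 1] + 1)
    lt = torch.max(boxes[:, None, :2], gt[None, :, :2])  # [N, M, 2]
    rb = torch.min(boxes[:, None, 2:], gt[None, :, 2:])  # [N, M, 2]
    wh = (rb - lt + 1).clamp(min=0)
    inter = wh[:, :, 0] * wh[:, :, 1]
    return inter / (area_a[:, None] + area_b[None, :] - inter)

rois = torch.tensor([[0., 0., 9., 9.], [5., 5., 14., 14.]])
gts = torch.tensor([[0., 0., 9., 9.]])
print(bbox_overlaps(rois, gts))  # [[1.0000], [0.1429]]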