def sim_loss(self, bboxes_list, nongt_inds_list, rel_feats_list):
    """Compute an instance-wise contrastive (similarity) loss over a batch.

    For every image, non-GT boxes are matched to GT boxes by IoU/IoF, a
    cosine-similarity matrix is built for every feature set in
    ``rel_feats_list``, and ``self.contrastive_loss`` is evaluated once per
    (GT box, feature set) pair.

    Args:
        bboxes_list (list): Per-image boxes, forwarded to
            ``self._bboxes_preprocess``.
        nongt_inds_list (list): Per-image indices selecting the non-GT rows
            of each feature tensor.
        rel_feats_list (list): Feature sets; each entry is indexed by
            ``img_id`` to obtain that image's feature matrix.

    Returns:
        torch.Tensor: 1-element tensor holding the sum of all collected
        losses divided by ``(number of successful contrastive terms + 1e-3)``.
    """
    sim_losses = []
    num_img = len(bboxes_list)
    # Counts only the terms for which contrastive_loss actually succeeded.
    sim_avg_factor = 0.
    for img_id in range(num_img):
        # Split this image's boxes into GT / non-GT; skip the image (with a
        # zero loss placeholder) when the helper reports it as invalid.
        gt_bboxes, nongt_bboxes, valid = self._bboxes_preprocess(
            bboxes_list[img_id], nongt_inds_list[img_id])
        if not valid:
            sim_losses.append(self._zero_loss(rel_feats_list, img_id))
            continue
        num_gts = gt_bboxes.size(0)
        num_nongts = nongt_bboxes.size(0)
        # Positive non-GT boxes: either IoU >= 0.5 with their best GT, or
        # "partial" matches with 0.1 <= IoU < 0.5 and IoF >= self.min_iof.
        nongt_iou_mat = bbox_overlaps(nongt_bboxes, gt_bboxes)
        nongt_iof_mat = bbox_overlaps(nongt_bboxes, gt_bboxes, mode='iof')
        nongt_max_iou, nongt_argmax_iou = nongt_iou_mat.max(dim=1)
        # IoF of each non-GT box w.r.t. the GT it overlaps most (by IoU).
        nongt_iof = nongt_iof_mat[torch.arange(nongt_bboxes.size(0)),
                                  nongt_argmax_iou]
        nongt_pt_inds = (nongt_iof >= self.min_iof) & (
            nongt_max_iou >= 0.1) & (nongt_max_iou < 0.5)
        nongt_pos_inds = (nongt_max_iou >= 0.5) | nongt_pt_inds
        # Cosine similarity between every feature row and the non-GT rows,
        # one (1, n, k) matrix per feature set.
        sim_mat_list = []
        for rel_feat in rel_feats_list:
            rel_norm = F.normalize(rel_feat[img_id], dim=1)
            nongt_rel_norm = F.normalize(
                rel_feat[img_id][nongt_inds_list[img_id], :], dim=1)
            nongt_rel_norm = nongt_rel_norm.permute(1, 0).contiguous()
            sim_mat = torch.einsum(
                'nc,ck->nk', [rel_norm, nongt_rel_norm]).unsqueeze(dim=0)
            sim_mat_list.append(sim_mat)
        # Instance-wise contrastive loss.
        # NOTE(review): row ``gt_id`` of the similarity matrix is used as the
        # GT's row, which presumably relies on GT features being stored first
        # in each feature matrix -- confirm against the caller.
        for gt_id in range(num_gts):
            pos_inds, neg_inds, valid = self._get_pos_neg_inds(
                gt_id, num_nongts, nongt_pos_inds, nongt_argmax_iou)
            if not valid:
                sim_losses.append(self._zero_loss(rel_feats_list, img_id))
                continue
            for sim_mat in sim_mat_list:
                # Best-effort: any failure inside contrastive_loss is replaced
                # by a zero loss and does not count towards the average.
                try:
                    sim_loss = self.contrastive_loss(
                        sim_mat=sim_mat[:, gt_id, :],
                        pos_inds=pos_inds,
                        neg_inds=neg_inds)
                    sim_avg_factor += 1.
                    sim_losses.append(sim_loss)
                except Exception:
                    sim_losses.append(
                        self._zero_loss(rel_feats_list, img_id))
                    continue
    # The 1e-3 keeps the division finite when no term succeeded.
    return torch.stack(sim_losses).sum().reshape(-1) / (sim_avg_factor + 1e-3)
def iou_loss(pred, target, linear=False, mode='log', eps=1e-6):
    """Element-wise IoU loss between aligned predicted and target boxes.

    Args:
        pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2),
            shape (n, 4).
        target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
        linear (bool, optional): Deprecated. If True, forces
            ``mode='linear'`` and emits a warning. Default: False.
        mode (str): Loss scaling mode: "linear" (``1 - iou``), "square"
            (``1 - iou**2``) or "log" (``-log(iou)``). Default: 'log'.
        eps (float): Lower clamp applied to the IoU, which also keeps
            ``log`` away from 0.

    Return:
        torch.Tensor: Loss tensor.
    """
    assert mode in ['linear', 'square', 'log']
    if linear:
        # Legacy flag wins over whatever ``mode`` was passed.
        warnings.warn('DeprecationWarning: Setting "linear=True" in '
                      'iou_loss is deprecated, please use "mode=`linear`" '
                      'instead.')
        mode = 'linear'

    clamped_iou = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)

    if mode == 'linear':
        return 1 - clamped_iou
    if mode == 'square':
        return 1 - clamped_iou**2
    if mode == 'log':
        return -clamped_iou.log()
    # Only reachable when assertions are disabled.
    raise NotImplementedError
def test_forward(self, proposals, prop_bboxes, asso_probs, cfg,
                 vanish_frames, prev_ids):
    """Build the detection-to-tracklet affinity matrix at test time.

    Two affinities are computed with the same shape as ``asso_probs``:
    a thresholded softmax over the still-valid tracklet columns, and a
    thresholded IoU between ``proposals`` and the propagated boxes.
    ``self.affinity`` selects which one ('overlap', 'similarity') or their
    sum ('all') is returned; any other value yields ``None``.
    """
    # Tracklet k lives in column k + 1; column 0 is always kept and stands
    # for "new object".
    live_cols = torch.nonzero(
        vanish_frames < cfg.long_term_frames).squeeze(1) + 1
    new_obj_col = torch.tensor(
        [0], dtype=torch.long, device=live_cols.device)
    valid_cols = torch.cat((new_obj_col, live_cols))

    # Softmax similarity restricted to the valid columns, then thresholded.
    asso_scores = torch.zeros_like(asso_probs)
    asso_scores[:, valid_cols] = F.softmax(asso_probs[:, valid_cols], dim=1)
    asso_scores *= (asso_scores > cfg.asso_score_thre).float()

    # IoU with the confidently propagated boxes, scattered into the column
    # of the tracklet each propagated box belongs to.
    overlaps = torch.zeros_like(asso_scores)
    confident = prop_bboxes[:, -1] > cfg.prop_score_thre
    kept_boxes = prop_bboxes[confident, :]
    kept_ids = prev_ids[confident]
    overlaps[:, kept_ids + 1] = bbox_overlaps(proposals[:, :4],
                                              kept_boxes[:, :4])
    overlaps *= (overlaps > cfg.prop_overlap_thre).float()

    affinity = self.affinity
    if affinity == 'overlap':
        return overlaps
    if affinity == 'similarity':
        return asso_scores
    if affinity == 'all':
        return overlaps + asso_scores
    return None
def kmeans_anchors(self):
    """Cluster ``self.num_anchors`` anchor sizes with K-means.

    Cluster centres are initialised from randomly drawn boxes and refined
    with ``self.kmeans_expectation`` / ``self.kmeans_maximization`` for at
    most ``self.iters`` iterations.

    Returns:
        list: Anchor (w, h) arrays sorted by increasing area.
    """

    def _centers_to_anchors(centers):
        # Keep only (w, h) of each centre and order the anchors by area.
        wh = bbox_xyxy_to_cxcywh(centers)[:, 2:].cpu().numpy()
        return sorted(wh, key=lambda x: x[0] * x[1])

    self.logger.info(
        f'Start cluster {self.num_anchors} YOLO anchors with K-means...')
    bboxes = self.get_zero_center_bbox_tensor()
    num_boxes = bboxes.shape[0]
    init_idx = torch.randint(
        0, num_boxes, (self.num_anchors, )).to(self.device)
    assignments = torch.zeros((num_boxes, )).to(self.device)
    cluster_centers = bboxes[init_idx]

    # With a single anchor one maximization step is all that is needed.
    if self.num_anchors == 1:
        cluster_centers = self.kmeans_maximization(bboxes, assignments,
                                                   cluster_centers)
        return _centers_to_anchors(cluster_centers)

    prog_bar = mmcv.ProgressBar(self.iters)
    for step in range(self.iters):
        converged, assignments = self.kmeans_expectation(
            bboxes, assignments, cluster_centers)
        if converged:
            self.logger.info(f'K-means process has converged at iter {step}.')
            break
        cluster_centers = self.kmeans_maximization(bboxes, assignments,
                                                   cluster_centers)
        prog_bar.update()
    print('\n')

    best_iou_per_box = bbox_overlaps(bboxes, cluster_centers).max(1)[0]
    avg_iou = best_iou_per_box.mean().item()
    anchors = _centers_to_anchors(cluster_centers)
    self.logger.info(f'Anchor cluster finish. Average IOU: {avg_iou}')
    return anchors
def Diou_loss(pred, target, eps=1e-3):
    """Distance-IoU (DIoU) loss between aligned predicted and target boxes.

    ``loss = 1 - (IoU - d^2 / c^2)`` where ``d`` is the distance between the
    two box centres and ``c`` is the diagonal of the smallest box enclosing
    both.

    Args:
        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
            shape (n, 4).
        target (Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Lower clamp applied to the IoU term.

    Return:
        Tensor: Loss tensor, one value per box pair.
    """
    # Smallest enclosing box and its squared diagonal.
    outer_left_top = torch.min(pred[:, :2], target[:, :2])
    outer_right_down = torch.max(pred[:, 2:], target[:, 2:])
    outer = outer_right_down - outer_left_top
    outer_diagonal_line = outer[:, 0]**2 + outer[:, 1]**2

    # Squared centre distance. Both operands must be indexed per box
    # (``[:, 0]`` / ``[:, 1]``); indexing ``target_ctr[0]`` would pick the
    # first *row* and compare every prediction against a single target.
    pred_ctr = (pred[:, :2] + pred[:, 2:]) * 0.5
    target_ctr = (target[:, :2] + target[:, 2:]) * 0.5
    ctr_dis = (pred_ctr[:, 0] - target_ctr[:, 0])**2 + \
        (pred_ctr[:, 1] - target_ctr[:, 1])**2

    # NOTE(review): the enclosing diagonal is zero only when both boxes
    # collapse to the same point; that case is not guarded here.
    ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)
    dious = ious - ctr_dis / outer_diagonal_line
    loss = 1 - dious
    return loss
def merge_results(result1, result2, mode='inter'):
    """Select the boxes of ``result1`` that are confirmed by ``result2``.

    Args:
        result1 (array-like): First set of boxes.
        result2 (array-like): Second set of boxes.
        mode (str): Merge strategy. Only ``'inter'`` is implemented: keep
            the rows of ``result1`` whose best IoU with any row of
            ``result2`` exceeds 0.7.

    Returns:
        tuple: Output of ``np.where`` -- a 1-tuple holding the indices of
        the selected rows of ``result1``.

    Raises:
        ValueError: If ``mode`` is not ``'inter'``.
    """
    # Fail loudly instead of hitting an unbound local further down.
    if mode != 'inter':
        raise ValueError(
            "Unsupported merge mode {!r}; only 'inter' is implemented".format(
                mode))

    if not isinstance(result1, np.ndarray):
        result1 = np.array(result1)
    if not isinstance(result2, np.ndarray):
        result2 = np.array(result2)

    ious = bbox_overlaps(result1, result2)  # n, k
    max_iou = np.max(ious, axis=1)
    picks = np.where(max_iou > 0.7)
    return picks
def update_memo(self, ids, bboxes, embeds, labels, frame_id):
    """Refresh the tracklet and backdrop memories with this frame's results.

    Detections with ``ids > -1`` update (or create) their tracklet entry;
    detections with ``ids == -1`` are stored as backdrops unless they overlap
    an earlier-indexed detection too much. Stale tracklets and the oldest
    backdrop entry are dropped afterwards.
    """
    tracked = ids > -1
    momentum = self.memo_momentum

    for raw_id, bbox, embed, label in zip(ids[tracked], bboxes[tracked],
                                          embeds[tracked], labels[tracked]):
        track_id = int(raw_id)
        if track_id not in self.tracklets.keys():
            self.tracklets[track_id] = dict(
                bbox=bbox,
                embed=embed,
                label=label,
                last_frame=frame_id,
                velocity=torch.zeros_like(bbox),
                acc_frame=0)
            continue

        track = self.tracklets[track_id]
        # Velocity is measured against the previous state, so compute it
        # before bbox / last_frame are overwritten.
        velocity = (bbox - track['bbox']) / (frame_id - track['last_frame'])
        track['bbox'] = bbox
        track['embed'] = (1 - momentum) * track['embed'] + momentum * embed
        track['last_frame'] = frame_id
        track['label'] = label
        # Running mean of the velocity over the accumulated frames.
        track['velocity'] = (track['velocity'] * track['acc_frame'] +
                             velocity) / (track['acc_frame'] + 1)
        track['acc_frame'] += 1

    # Backdrops: untracked detections, suppressed when they overlap any
    # detection that comes before them in ``bboxes``.
    backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1)
    ious = bbox_overlaps(bboxes[backdrop_inds, :-1], bboxes[:, :-1])
    for row, det_idx in enumerate(backdrop_inds):
        if (ious[row, :det_idx] > self.nms_backdrop_iou_thr).any():
            backdrop_inds[row] = -1
    backdrop_inds = backdrop_inds[backdrop_inds > -1]

    self.backdrops.insert(
        0,
        dict(
            bboxes=bboxes[backdrop_inds],
            embeds=embeds[backdrop_inds],
            labels=labels[backdrop_inds]))

    # Forget tracklets that have not been seen for too long.
    stale_ids = [
        track_id for track_id, track in self.tracklets.items()
        if frame_id - track['last_frame'] >= self.memo_tracklet_frames
    ]
    for stale_id in stale_ids:
        self.tracklets.pop(stale_id)

    if len(self.backdrops) > self.memo_backdrop_frames:
        self.backdrops.pop()
def get_iou(pr, gt):
    """Return, for every predicted box, its best IoU with any gt box.

    The first column of ``gt`` is always dropped; the first column of ``pr``
    is dropped only when ``pr`` has 5 columns.
    (presumably that column is a label/score -- verify against the caller)
    """
    has_extra_col = pr.shape[1] == 5
    pred = t.FloatTensor(pr)
    if has_extra_col:
        pred = pred[:, 1:]
    target = t.FloatTensor(gt)[:, 1:]

    best_iou, _ = t.max(bbox_overlaps(pred, target), dim=1)
    return best_iou.view(-1)
def avg_iou_cost(anchor_params, bboxes):
    """Cost of an anchor set: ``1 - mean(best IoU of each box)``.

    Args:
        anchor_params (Sequence): Flat sequence ``[w0, h0, w1, h1, ...]``;
            its length must be even.
        bboxes (torch.Tensor): Boxes the anchors are evaluated against.

    Returns:
        float: The cost (lower is better).
    """
    assert len(anchor_params) % 2 == 0

    widths = anchor_params[::2]
    heights = anchor_params[1::2]
    anchor_whs = torch.tensor([[w, h] for w, h in zip(widths, heights)])
    anchor_whs = anchor_whs.to(bboxes.device, dtype=bboxes.dtype)

    # Anchors are centred at the origin: (cx, cy) = (0, 0).
    centered = torch.cat([torch.zeros_like(anchor_whs), anchor_whs], dim=1)
    anchor_boxes = bbox_cxcywh_to_xyxy(centered)

    best_iou_per_box, _ = bbox_overlaps(bboxes, anchor_boxes).max(1)
    return 1 - best_iou_per_box.mean().item()
def loss(self, rpn_rois, cls_score, bbox_pred, labels, label_weights,
         bbox_targets, bbox_weights, reduction_override=None):
    """Compute the classification and regression losses of the RPN head.

    The classification target of every positive sample (``labels > 0``) is
    replaced by the IoU between its decoded prediction and its decoded
    regression target.

    Returns:
        tuple: ``(losses, pred_bboxes)`` where ``losses`` is a dict with keys
        ``loss_siamese_rpn_cls`` and ``loss_siamese_rpn_bbox`` (zeros shaped
        like the cls loss when there is no positive sample) and
        ``pred_bboxes`` is the decoded boxes concatenated with ``cls_score``
        along the last dimension.
    """
    pos_mask = labels > 0
    rois_xyxy = rpn_rois[:, 1:]  # column 0 of the rois is skipped

    # Regression loss, only defined when there is at least one positive.
    pos_deltas = bbox_pred.view(bbox_pred.size(0), 4)[pos_mask]
    has_bbox_loss = len(pos_deltas) > 0
    if has_bbox_loss:
        loss_bbox = self.loss_bbox(
            pos_deltas,
            bbox_targets[pos_mask],
            bbox_weights[pos_mask],
            avg_factor=bbox_targets.size(0),
            reduction_override=reduction_override)

    cls_avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.)

    decoded_all = delta2bbox(rois_xyxy, bbox_pred, self.target_means,
                             self.target_stds, None)
    decoded_pos = decoded_all[pos_mask]

    # IoU-aware classification targets for the positives.
    soft_labels = labels.float()
    if len(decoded_pos) > 0:
        decoded_gt = delta2bbox(rois_xyxy, bbox_targets, self.target_means,
                                self.target_stds, None)[pos_mask]
        soft_labels[pos_mask] = bbox_overlaps(
            decoded_pos, decoded_gt, 'iou', is_aligned=True)

    loss_cls = self.loss_cls(
        cls_score,
        soft_labels.view(-1, 1),
        label_weights.view(-1, 1),
        avg_factor=cls_avg_factor,
        reduction_override=reduction_override)

    pred_bboxes = torch.cat([decoded_all, cls_score], dim=-1)

    if not has_bbox_loss:
        loss_bbox = loss_cls.new_zeros(loss_cls.shape)
    return dict(
        loss_siamese_rpn_cls=loss_cls,
        loss_siamese_rpn_bbox=loss_bbox), pred_bboxes
def assign_gt_single(det_bbox, gt_bbox, pos_iou_thr = 0.5):
    """Assign to each gt box the detection that overlaps it best.

    Args:
        det_bbox (Tensor): Detections; only the first 4 columns are used.
        gt_bbox (Tensor): Ground-truth boxes.
        pos_iou_thr (float): Minimum IoU for a match to be accepted.

    Returns:
        Tensor: Long tensor with one entry per gt box: the index of its
        best-overlapping detection, or -1 when that IoU is below
        ``pos_iou_thr``.
    """
    iou_gt_vs_det = bbox_overlaps(gt_bbox, det_bbox[:, :4])
    best_iou, best_det = iou_gt_vs_det.max(dim=1)

    unassigned = iou_gt_vs_det.new_full(
        (iou_gt_vs_det.size(0),), -1, dtype=torch.long)
    return torch.where(best_iou >= pos_iou_thr, best_det, unassigned)
def loss_single(self, cls_score, pts_pred_init, pts_pred_refine, labels,
                label_weights, bbox_gt_init, bbox_weights_init,
                bbox_gt_refine, bbox_weights_refine, stride,
                num_total_samples_init, num_total_samples_refine):
    """Compute the losses of a single feature level.

    Returns:
        tuple: ``(loss_cls, loss_pts_init, loss_pts_refine)``.
    """

    def _points_to_boxes(pts_pred):
        # Flatten the point sets and convert each one to a box.
        flat_pts = pts_pred.reshape(-1, 2 * self.num_points)
        return self.points2bbox(flat_pts, y_first=False)

    # Flatten the classification inputs.
    labels = labels.reshape(-1)
    label_weights = label_weights.reshape(-1)
    cls_score = cls_score.permute(0, 2, 3, 1)
    cls_score = cls_score.reshape(-1, self.cls_out_channels).contiguous()

    # Flatten the regression inputs of both stages.
    bbox_gt_init = bbox_gt_init.reshape(-1, 4)
    bbox_weights_init = bbox_weights_init.reshape(-1, 4)
    bbox_pred_init = _points_to_boxes(pts_pred_init)
    bbox_gt_refine = bbox_gt_refine.reshape(-1, 4)
    bbox_weights_refine = bbox_weights_refine.reshape(-1, 4)
    bbox_pred_refine = _points_to_boxes(pts_pred_refine)

    # Boxes are scale-normalised per level before the regression losses.
    normalize_term = self.point_base_scale * stride
    loss_pts_init = self.loss_bbox_init(
        bbox_pred_init / normalize_term,
        bbox_gt_init / normalize_term,
        bbox_weights_init,
        avg_factor=num_total_samples_init)
    loss_pts_refine = self.loss_bbox_refine(
        bbox_pred_refine / normalize_term,
        bbox_gt_refine / normalize_term,
        bbox_weights_refine,
        avg_factor=num_total_samples_refine)

    if not self.use_vfl:
        loss_cls = self.loss_cls(
            cls_score,
            labels,
            label_weights,
            avg_factor=num_total_samples_refine)
        return loss_cls, loss_pts_init, loss_pts_refine

    # IoU-aware targets: the target of a positive sample is the IoU of its
    # refined box with the gt, placed in the column of its class.
    is_pos = (labels >= 0) & (labels < self.num_classes)
    pos_inds = is_pos.nonzero().reshape(-1)
    refine_ious = bbox_overlaps(
        bbox_pred_refine.detach(), bbox_gt_refine.detach(), is_aligned=True)
    cls_iou_targets = torch.zeros_like(cls_score)
    cls_iou_targets[pos_inds, labels[pos_inds]] = refine_ious[pos_inds]
    loss_cls = self.loss_cls(
        cls_score,
        cls_iou_targets,
        label_weights.unsqueeze(1),
        avg_factor=num_total_samples_refine)
    return loss_cls, loss_pts_init, loss_pts_refine
def giou_loss(pred, target, eps=1e-7):
    r"""GIoU loss, ``1 - GIoU``, between aligned box pairs.

    See `Generalized Intersection over Union: A Metric and A Loss for
    Bounding Box Regression <https://arxiv.org/abs/1902.09630>`_.

    Args:
        pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2),
            shape (n, 4).
        target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Accepted for interface compatibility.
            NOTE(review): it is not forwarded to ``bbox_overlaps`` here --
            confirm whether that is intended.

    Return:
        Tensor: Loss tensor.
    """
    return 1 - bbox_overlaps(pred, target, mode='giou', is_aligned=True)
def box_voting(top_dets, all_dets, thresh, scoring_method='ID', beta=1.0):
    """Refine `top_dets` by bounding-box voting with `all_dets`.

    Each top detection's box is replaced by the score-weighted average of
    all boxes in `all_dets` whose overlap with it is at least `thresh`.
    See: https://arxiv.org/abs/1505.01749. Optionally the score is
    re-estimated as well (not part of the referenced paper), depending on
    `scoring_method`:

      - 'ID': keep the original score.
      - 'TEMP_AVG': average of temperature-smoothed probabilities.
      - 'AVG': plain mean of the voters' scores.
      - 'IOU_AVG': mean of the voters' scores weighted by their overlap.
      - 'GENERALIZED_AVG': generalized (power) mean with exponent `beta`.
      - 'QUASI_SUM': sum of the scores divided by ``len(scores) ** beta``.

    Both inputs are [N, 5] arrays whose rows are [x1, y1, x2, y2, score].
    A modified copy of `top_dets` is returned.
    """
    voted = top_dets.copy()
    cand_boxes = all_dets[:, :4]
    cand_scores = all_dets[:, 4]
    top_to_all_overlaps = bbox_overlaps(top_dets[:, :4], cand_boxes)

    for k in range(voted.shape[0]):
        voters = np.where(top_to_all_overlaps[k] >= thresh)[0]
        ws = cand_scores[voters]
        voted[k, :4] = np.average(cand_boxes[voters, :], axis=0, weights=ws)

        if scoring_method == 'ID':
            # Identity: the score is left untouched.
            continue

        if scoring_method == 'TEMP_AVG':
            # Treat each score as P(class) vs. P(not class), smooth both
            # with temperature `beta`, then average P(class).
            probs = np.vstack((ws, 1.0 - ws))
            log_ratio = np.log(probs / np.max(probs, axis=0))
            tempered = np.exp(log_ratio / beta)
            smoothed = tempered / np.sum(tempered, axis=0)
            new_score = smoothed[0].mean()
        elif scoring_method == 'AVG':
            new_score = ws.mean()
        elif scoring_method == 'IOU_AVG':
            new_score = np.average(
                ws, weights=top_to_all_overlaps[k, voters])
        elif scoring_method == 'GENERALIZED_AVG':
            new_score = np.mean(ws**beta)**(1.0 / beta)
        elif scoring_method == 'QUASI_SUM':
            new_score = ws.sum() / float(len(ws))**beta
        else:
            raise NotImplementedError(
                'Unknown scoring method {}'.format(scoring_method))
        voted[k, 4] = new_score

    return voted
def iou_loss(pred, target, eps=1e-6):
    """Negative-log IoU loss between aligned predicted and target boxes.

    Args:
        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
            shape (n, 4).
        target (Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Lower clamp applied to the IoU so ``log(0)`` is never
            evaluated.

    Return:
        Tensor: Loss tensor.
    """
    clamped_iou = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)
    return -clamped_iou.log()
def get_bbox_prob_and_overlap(self, points, bbox_preds, gt_bboxes):
    """Score every predicted box against its gt box.

    Returns:
        tuple: ``(bbox_prob, bbox_overlap)`` where ``bbox_prob`` is
        ``exp(-loss_bbox)`` evaluated without reduction and
        ``bbox_overlap`` is the aligned IoU between each gt box and the
        boxes decoded from ``bbox_preds``.
    """
    # One copy of each gt box per candidate point.
    gt_per_point = gt_bboxes[:, None, :].repeat(1, points.shape[1], 1)
    bbox_targets = bbox2distance(
        points, gt_per_point, norm=self.distance_norm)

    unreduced_loss = self.loss_bbox(
        bbox_preds, bbox_targets, reduction_override='none')
    bbox_prob = (-unreduced_loss).exp()

    pred_boxes = distance2bbox(points, bbox_preds, norm=self.distance_norm)
    gt_like_pred = gt_bboxes[:, None, :].expand_as(pred_boxes)
    bbox_overlap = bbox_overlaps(gt_like_pred, pred_boxes, is_aligned=True)
    return bbox_prob, bbox_overlap
def predict_weights(self, cls_score, bbox_pred, labels, label_weights,
                    bbox_targets, bbox_weights, anchors, loss_cls,
                    loss_bbox):
    """Re-weight the label/bbox weights with predicted uncertainties.

    ``self.uncertainty_predictor`` is fed the per-sample IoU, class score
    and (detached) losses and returns two columns: a classification and a
    regression uncertainty. Their exponentiated negatives scale the
    detached input weights.

    Returns:
        tuple: ``(label_weights, bbox_weights, losses)`` where ``losses``
        holds the uncertainty regularisation terms and a few monitoring
        statistics.
    """
    labels = labels.reshape(-1, )
    pos_inds = labels > 0
    num_samples = pos_inds.numel()

    # Score of the assigned class for each positive (labels are 1-based).
    pos_score = cls_score[pos_inds, labels[pos_inds] - 1].sigmoid()

    # IoU between the decoded prediction and the decoded target.
    pos_anchors = anchors[pos_inds]
    pos_bbox = delta2bbox(
        pos_anchors,
        bbox_pred[pos_inds],
        means=self.target_means,
        stds=self.target_stds)
    gt_bboxes = delta2bbox(
        pos_anchors,
        bbox_targets[pos_inds],
        means=self.target_means,
        stds=self.target_stds)
    ious = bbox_overlaps(gt_bboxes, pos_bbox, is_aligned=True).view(-1, )

    # Scatter the positives back to full length; negatives stay at 0.
    total_ious = ious.new_full((num_samples,), 0.0)
    total_ious[pos_inds] = ious
    total_scores = pos_score.new_full((num_samples,), 0.0)
    total_scores[pos_inds] = pos_score

    uncertainty = self.uncertainty_predictor(
        total_ious,
        total_scores,
        loss_cls.sum(dim=1).detach().data,
        loss_bbox.detach().data)

    unc_cls = torch.clamp(
        uncertainty[:, 0],
        min=self.cls_prediction_min,
        max=self.cls_prediction_max)
    unc_reg = torch.clamp(
        uncertainty[:, 1],
        min=self.reg_prediction_min,
        max=self.reg_prediction_max)
    # Every sample shares the batch-mean classification uncertainty.
    unc_cls = torch.ones_like(unc_cls) * unc_cls.mean()

    losses = dict()
    losses.update({
        "loss_uncertainty_cls":
        unc_cls.sum() / unc_cls.numel() * self.uncertainty_cls_weight
    })
    losses.update({
        "loss_uncertainty_reg":
        unc_reg[pos_inds].mean() * self.uncertainty_reg_weight
    })

    # Turn the uncertainties into multiplicative weights.
    reg_scale = torch.exp(-1. * unc_reg)
    cls_scale = torch.exp(-1. * unc_cls)
    losses.update({
        "cls_prediction_pos": cls_scale[pos_inds].mean(),
        "cls_prediction_neg": cls_scale[~pos_inds].mean(),
        "cls_prediction_reg": reg_scale[pos_inds].mean(),
    })

    bbox_weights = bbox_weights.detach().data * reg_scale.view(-1, 1)
    label_weights = label_weights.detach().data * cls_scale.view(-1, 1)
    return label_weights, bbox_weights, losses
def get_bbox_prob_and_overlap(self, anchors, bbox_preds, gt_bboxes):
    """Score every anchor's predicted box against its gt box.

    Returns:
        tuple: ``(bbox_prob, bbox_overlap)`` where ``bbox_prob`` is
        ``exp(-sum(loss_bbox, dim=-1))`` evaluated without reduction and
        ``bbox_overlap`` is the aligned IoU between each gt box and the
        boxes decoded from ``bbox_preds``.
    """
    means, stds = self.target_means, self.target_stds
    gt_per_anchor = gt_bboxes.unsqueeze(1)

    bbox_targets = bbox2delta(
        anchors, gt_per_anchor.expand_as(anchors), means, stds)
    unreduced_loss = self.loss_bbox(
        bbox_preds, bbox_targets, reduction_override='none')
    bbox_prob = (-unreduced_loss.sum(dim=-1)).exp()

    pred_boxes = delta2bbox(anchors, bbox_preds, means, stds)
    bbox_overlap = bbox_overlaps(
        gt_per_anchor.expand_as(pred_boxes), pred_boxes, is_aligned=True)
    return bbox_prob, bbox_overlap
def assign_ids(self,
               ids,
               det_bboxes,
               weight_iou_with_det_scores=False,
               match_iou_thr=0.5):
    """Match existing tracks to detections by IoU.

    Args:
        ids (list[int]): Tracking ids.
        det_bboxes (Tensor): of shape (N, 5)
        weight_iou_with_det_scores (bool, optional): Whether to multiply
            the IoU by the detection score before matching.
            Defaults to False.
        match_iou_thr (float, optional): Matching threshold.
            Defaults to 0.5.

    Returns:
        tuple: ``(row, col)`` as produced by ``lap.lapjv`` on the
        ``1 - IoU`` cost matrix, or two int32 arrays filled with -1 (of
        length ``len(ids)`` and ``len(det_bboxes)``) when that matrix is
        empty.
    """
    # Stack the first four state entries of every requested track.
    track_rows = [self.tracks[track_id].mean[:4][None] for track_id in ids]
    track_bboxes = np.concatenate([np.zeros((0, 4)), *track_rows], axis=0)
    track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes)
    track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes)

    # Cost = 1 - (optionally score-weighted) IoU.
    ious = bbox_overlaps(track_bboxes, det_bboxes[:, :4])
    if weight_iou_with_det_scores:
        ious *= det_bboxes[:, 4][None]
    dists = (1 - ious).cpu().numpy()

    if dists.size == 0:
        # Nothing to match on at least one side.
        row = np.full(len(ids), -1, dtype=np.int32)
        col = np.full(len(det_bboxes), -1, dtype=np.int32)
        return row, col

    _, row, col = lap.lapjv(
        dists, extend_cost=True, cost_limit=1 - match_iou_thr)
    return row, col
def box_filter(boxes, must_overlap=False):
    """Enumerate the ordered box pairs that may form a relation.

    Args:
        boxes (np.ndarray): Boxes, one per row.
        must_overlap (bool): If True, only pairs of distinct boxes that
            overlap are returned; when no such pair exists, fall back to
            all pairs of distinct boxes.

    Returns:
        np.ndarray: Integer array with one ``(i, j)`` pair per row, never
        containing ``i == j``.
    """
    num_boxes = boxes.shape[0]

    if must_overlap:
        boxes_t = torch.from_numpy(boxes)
        overlaps = (bbox_overlaps(boxes_t, boxes_t) > 0).data.numpy()
        np.fill_diagonal(overlaps, 0)
        possible_boxes = np.column_stack(np.where(overlaps))
        if possible_boxes.size > 0:
            return possible_boxes

    # Every ordered pair of distinct boxes. The builtin ``bool`` is used
    # because the ``np.bool`` alias no longer exists in recent NumPy.
    all_possib = np.ones((num_boxes, num_boxes), dtype=bool)
    np.fill_diagonal(all_possib, 0)
    return np.column_stack(np.where(all_possib))
def get_match_score(self, bboxes, labels, prev_bboxes, prev_labels,
                    similarity_logits):
    """Combine similarity, detection score, IoU and label agreement.

    Args:
        bboxes (torch.Tensor): of shape (num_current_bboxes, 5) in
            [tl_x, tl_y, br_x, br_y, score] format. Detections of the
            current frame.
        labels (torch.Tensor): of shape (num_current_bboxes, )
        prev_bboxes (torch.Tensor): of shape (num_previous_bboxes, 5) in
            [tl_x, tl_y, br_x, br_y, score] format. Detections of the
            previous frame.
        prev_labels (torch.Tensor): of shape (num_previous_bboxes, )
        similarity_logits (torch.Tensor): of shape (num_current_bboxes,
            num_previous_bboxes + 1). Similarity logits from the track
            head.

    Returns:
        torch.Tensor: The matching score of shape (num_current_bboxes,
        num_previous_bboxes + 1)
    """
    weights = self.match_weights

    # A leading dummy column is prepended to both terms so they line up
    # with the extra first column of the similarity logits: IoU 0, label
    # agreement 1.
    ious = bbox_overlaps(bboxes[:, :4], prev_bboxes[:, :4])
    ious = torch.cat((ious.new_zeros(ious.shape[0], 1), ious), dim=1)

    same_label = (labels.view(-1, 1) == prev_labels).float()
    same_label = torch.cat(
        (same_label.new_ones(same_label.shape[0], 1), same_label), dim=1)

    det_score_log = bboxes[:, 4].view(-1, 1).log()

    match_score = similarity_logits.softmax(dim=1).log()
    match_score += weights['det_score'] * det_score_log
    match_score += weights['iou'] * ious
    match_score += weights['det_label'] * same_label
    return match_score
def iou_loss(pred, target, linear=False, eps=1e-6):
    """IoU loss between aligned predicted and target boxes.

    Args:
        pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2),
            shape (n, 4).
        target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
        linear (bool, optional): If True, return ``1 - iou``; otherwise
            return ``-log(iou)``. Default: False.
        eps (float): Lower clamp applied to the IoU so ``log(0)`` is never
            evaluated.

    Return:
        torch.Tensor: Loss tensor.
    """
    clamped_iou = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)
    return 1 - clamped_iou if linear else -clamped_iou.log()
def loss_single(self, cls_score, bbox_pred, labels, label_weights, level,
                bbox_targets, bbox_weights, num_total_samples, cfg):
    """Compute the classification and regression loss of one level.

    When ``cfg['is_iou']`` is present and equal to True, the IoU between
    the decoded predictions and ``bbox_targets`` is additionally passed to
    the classification loss as ``ious``.

    Returns:
        tuple: ``(loss_cls, loss_bbox)``.
    """
    # Anchors of this level, tiled twice along dim 0.
    # (presumably to cover a batch of two images -- verify against caller)
    level_anchors = self.anchor_generators[level].grid_anchors(
        self.featmap_sizes[level], self.anchor_strides[level])
    level_anchors = level_anchors.repeat(2, 1)

    # Flatten everything to one row per anchor.
    labels = labels.reshape(-1)
    label_weights = label_weights.reshape(-1)
    cls_score = cls_score.permute(0, 2, 3, 1).reshape(
        -1, self.cls_out_channels)
    bbox_targets = bbox_targets.reshape(-1, 4)
    bbox_weights = bbox_weights.reshape(-1, 4)
    bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)

    cls_kwargs = dict(avg_factor=num_total_samples)
    if 'is_iou' in cfg.keys() and cfg['is_iou'] == True:  # noqa: E712
        decoded = delta2bbox(level_anchors, bbox_pred, self.target_means,
                             self.target_stds)
        cls_kwargs['ious'] = bbox_overlaps(
            decoded, bbox_targets, is_aligned=True)

    loss_cls = self.loss_cls(cls_score, labels, label_weights, **cls_kwargs)
    loss_bbox = self.loss_bbox(
        bbox_pred,
        bbox_targets,
        bbox_weights,
        avg_factor=num_total_samples)
    return loss_cls, loss_bbox
def eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None):
    """Calculate recalls.

    Args:
        gts (list or ndarray): a list of arrays of shape (n, 4)
        proposals (list or ndarray): a list of arrays of shape (k, 4) or
            (k, 5); 5-column proposals are sorted by their last column
            (score) in descending order first.
        proposal_nums (int or list of int or ndarray): top N proposals
        iou_thrs (float or list or ndarray): iou thresholds

    Returns:
        ndarray: recalls of different ious and proposal nums
    """
    img_num = len(gts)
    assert img_num == len(proposals)

    proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)

    all_ious = []
    for img_gts, img_props in zip(gts, proposals):
        if img_props.ndim == 2 and img_props.shape[1] == 5:
            # Highest-scoring proposals first.
            sort_idx = np.argsort(img_props[:, 4])[::-1]
            img_proposal = img_props[sort_idx, :]
        else:
            img_proposal = img_props
        prop_num = min(img_proposal.shape[0], proposal_nums[-1])
        if img_gts is None or img_gts.shape[0] == 0:
            ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
        else:
            ious = bbox_overlaps(
                torch.tensor(img_gts),
                torch.tensor(img_proposal[:prop_num, :4]))
            ious = ious.data.numpy()
        all_ious.append(ious)

    # Per-image IoU matrices usually differ in shape. Recent NumPy refuses
    # to build an array from such a ragged list, so fall back to an
    # explicit 1-D object array holding one matrix per image.
    try:
        stacked_ious = np.array(all_ious)
    except ValueError:
        stacked_ious = np.empty(len(all_ious), dtype=object)
        for idx, ious in enumerate(all_ious):
            stacked_ious[idx] = ious

    recalls = _recalls(stacked_ious, proposal_nums, iou_thrs)
    return recalls
def get_roi_mask(self, cls_scores, img_metas, gt_bboxes, phi=0.5):
    """Build per-level binary masks of locations that match a gt box well.

    A location ``(y, x)`` of a level is set to 1 when at least one of its
    anchors has, with some gt box assigned to that level, an IoU larger
    than ``phi`` times the best IoU any anchor of the level reaches with
    that gt box.

    Args:
        cls_scores (list[Tensor]): Per-level score maps; only their
            spatial sizes are used.
        img_metas (list): Image meta info, forwarded to
            ``self.get_anchors``.
        gt_bboxes (list[Tensor]): Per-image gt boxes.
        phi (float): Fraction of the best IoU a location must exceed.

    Returns:
        list[Tensor]: One double tensor of shape (num_imgs, h, w) per
        level, allocated on the same device as the anchors. Empty when
        ``gt_bboxes`` is empty.
    """
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    from mmdet.core import bbox_overlaps

    with torch.no_grad():
        anchor_list, _ = self.get_anchors(featmap_sizes, img_metas)
        mask_batch = []
        for batch, img_gts in enumerate(gt_bboxes):
            img_anchors = anchor_list[batch]
            target_lvls = self._map_roi_levels(img_gts, len(img_anchors))
            mask_level = []
            for level, level_anchors in enumerate(img_anchors):
                gt_level = img_gts[target_lvls == level]
                h, w = featmap_sizes[level][0], featmap_sizes[level][1]
                if gt_level.shape[0] > 0:
                    iou_map = bbox_overlaps(level_anchors, gt_level)
                    max_iou, _ = torch.max(iou_map, dim=0)
                    iou_map = iou_map.view(h, w, self.num_anchors, -1)
                    # Per-gt threshold broadcast over the last dimension,
                    # then "any anchor, any gt" per location.
                    hits = iou_map > (max_iou * phi)
                    mask_per_img = hits.any(dim=3).any(dim=2).double()
                else:
                    # Follow the anchors' device instead of assuming CUDA
                    # so the method also works on CPU / other devices.
                    mask_per_img = torch.zeros(
                        [h, w],
                        dtype=torch.double,
                        device=level_anchors.device)
                mask_level.append(mask_per_img)
            mask_batch.append(mask_level)

        # Regroup [image][level] -> [level] with images stacked on dim 0.
        mask_batch_level = [
            torch.stack(per_level, dim=0) for per_level in zip(*mask_batch)
        ]
    return mask_batch_level
def loss(self,
         cls_scores,
         bbox_preds,
         centernesses,
         cof_preds,
         feat_masks,
         gt_bboxes,
         gt_labels,
         img_metas,
         cfg,
         gt_bboxes_ignore=None,
         gt_masks_list=None):
    """Compute classification, box, centerness and mask (and IoU) losses.

    The first half computes the detection losses on flattened multi-level
    predictions; the second half assembles per-instance masks from
    ``feat_masks`` and the coefficient predictions ``cof_preds`` and
    compares them with the ground-truth masks.

    ``cfg`` and ``gt_bboxes_ignore`` are accepted but not read by this
    method.

    Returns:
        dict: ``loss_cls``, ``loss_bbox``, ``loss_centerness``,
        ``loss_mask`` and, when ``self.rescoring_flag`` is set,
        ``loss_iou``.
    """
    assert len(cls_scores) == len(bbox_preds) == len(centernesses)
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    all_level_points, all_level_strides = self.get_points(featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device)
    labels, bbox_targets, label_list, bbox_targets_list, gt_inds = self.fcos_target(all_level_points, gt_bboxes, gt_labels)

    # Decode the (detached) predictions and the targets into boxes, per
    # image, concatenated over levels.
    # NOTE(review): det_targets is filled but never read afterwards.
    det_bboxes = []
    det_targets = []
    num_levels = len(bbox_preds)
    for img_id in range(len(img_metas)):
        bbox_pred_list = [
            bbox_preds[i][img_id].permute(1, 2, 0).reshape(-1, 4).detach() for i in range(num_levels)
        ]
        bbox_target_list = bbox_targets_list[img_id]

        bboxes = []
        targets = []
        for i in range(len(bbox_pred_list)):
            bbox_pred = bbox_pred_list[i]
            bbox_target = bbox_target_list[i]
            points = all_level_points[i]
            bboxes.append(distance2bbox(points, bbox_pred))
            targets.append(distance2bbox(points, bbox_target))

        bboxes = torch.cat(bboxes, dim=0)
        targets = torch.cat(targets, dim=0)

        det_bboxes.append(bboxes)
        det_targets.append(targets)

    # Ground-truth masks as float32 tensors, truncated to the number of
    # labels of each image and moved to the labels' device.
    gt_masks = []
    for i in range(len(gt_labels)):
        gt_label = gt_labels[i]
        gt_masks.append(torch.from_numpy(np.array(gt_masks_list[i][:gt_label.shape[0]], dtype=np.float32)).to(gt_label.device))

    num_imgs = cls_scores[0].size(0)
    # flatten cls_scores, bbox_preds and centerness
    flatten_cls_scores = [
        cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
        for cls_score in cls_scores
    ]
    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
        for bbox_pred in bbox_preds
    ]
    flatten_centerness = [
        centerness.permute(0, 2, 3, 1).reshape(-1)
        for centerness in centernesses
    ]
    flatten_cls_scores = torch.cat(flatten_cls_scores)
    flatten_bbox_preds = torch.cat(flatten_bbox_preds)
    flatten_centerness = torch.cat(flatten_centerness)
    flatten_labels = torch.cat(labels)
    flatten_bbox_targets = torch.cat(bbox_targets)
    # repeat points to align with bbox_preds
    flatten_points = torch.cat(
        [points.repeat(num_imgs, 1) for points in all_level_points])
    flatten_strides = torch.cat(
        [strides.view(-1,1).repeat(num_imgs, 1) for strides in all_level_strides])

    # Positive samples are those with a non-zero label.
    pos_inds = flatten_labels.nonzero().reshape(-1)
    num_pos = len(pos_inds)
    loss_cls = self.loss_cls(
        flatten_cls_scores, flatten_labels,
        avg_factor=num_pos + num_imgs)  # avoid num_pos is 0

    pos_bbox_preds = flatten_bbox_preds[pos_inds]
    pos_centerness = flatten_centerness[pos_inds]

    if num_pos > 0:
        pos_bbox_targets = flatten_bbox_targets[pos_inds]
        pos_centerness_targets = self.centerness_target(pos_bbox_targets)
        pos_points = flatten_points[pos_inds]
        pos_strides = flatten_strides[pos_inds]
        # Distances are divided by the level stride before decoding.
        pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds/pos_strides)
        pos_decoded_target_preds = distance2bbox(pos_points, pos_bbox_targets/pos_strides)
        # centerness weighted iou loss
        loss_bbox = self.loss_bbox(
            pos_decoded_bbox_preds,
            pos_decoded_target_preds,
            weight=pos_centerness_targets,
            avg_factor=pos_centerness_targets.sum())
        loss_centerness = self.loss_centerness(pos_centerness,
                                               pos_centerness_targets)
    else:
        # Sums over empty selections: zero-valued losses that stay
        # connected to the predictions.
        loss_bbox = pos_bbox_preds.sum()
        loss_centerness = pos_centerness.sum()

    # ---------------- mask loss ----------------
    # Same predictions as above but keeping the image dimension.
    flatten_cls_scores1 = [
        cls_score.permute(0, 2, 3, 1).reshape(num_imgs,-1, self.cls_out_channels)
        for cls_score in cls_scores
    ]
    flatten_cls_scores1 = torch.cat(flatten_cls_scores1,dim=1)

    # 128 coefficients per location: four groups of 32 (sliced below).
    flatten_cof_preds = [
        cof_pred.permute(0, 2, 3, 1).reshape(cof_pred.shape[0],-1, 32*4)
        for cof_pred in cof_preds
    ]

    loss_mask = 0
    loss_iou = 0
    # NOTE(review): num_iou starts as a Python float. If rescoring_flag is
    # set but no image reaches the rescoring branch below (every image hits
    # the ``continue`` or num_imgs is 0), the final ``num_iou.detach()``
    # would fail on a float -- confirm whether that case can occur.
    num_iou = 0.1
    flatten_cof_preds = torch.cat(flatten_cof_preds,dim=1)
    for i in range(num_imgs):
        labels = torch.cat([labels_level.flatten() for labels_level in label_list[i]])

        # Detached predicted boxes, halved.
        # (presumably to match the mask resolution, cf. the 0.5 scale
        # factor applied to the gt masks below -- TODO confirm)
        bbox_dt = det_bboxes[i]/2
        bbox_dt = bbox_dt.detach()
        pos_inds = (labels > 0).nonzero().view(-1)
        cof_pred = flatten_cof_preds[i][pos_inds]
        img_mask = feat_masks[i]
        mask_h = img_mask.shape[1]
        mask_w = img_mask.shape[2]
        idx_gt = gt_inds[i]
        bbox_dt = bbox_dt[pos_inds, :4]

        # Drop positives whose predicted box area is not larger than 1.
        area = (bbox_dt[:, 2] - bbox_dt[:, 0]) * (bbox_dt[:, 3] - bbox_dt[:, 1])
        bbox_dt = bbox_dt[area > 1.0, :]
        idx_gt = idx_gt[area > 1.0]
        cof_pred = cof_pred[area > 1.0]
        if bbox_dt.shape[0] == 0:
            # Nothing left for this image: add a zero-valued term.
            loss_mask += area.sum()*0
            continue

        bbox_gt = gt_bboxes[i]
        cls_score = flatten_cls_scores1[i, pos_inds, labels[pos_inds] - 1].sigmoid().detach()
        cls_score = cls_score[area>1.0]
        pos_inds = pos_inds[area > 1.0]
        ious = bbox_overlaps(bbox_gt[idx_gt]/2, bbox_dt, is_aligned=True)

        # Per-instance weight: class score times IoU, normalised so the
        # weights sum (approximately) to the number of instances.
        with torch.no_grad():
            weighting = cls_score * ious
            weighting = weighting/(torch.sum(weighting)+0.0001)*len(weighting)

        # Halve the gt masks, paste them into a canvas of the feature-mask
        # size, binarise, and gather one mask per positive: (h, w, n_pos).
        gt_mask = F.interpolate(gt_masks[i].unsqueeze(0), scale_factor=0.5, mode='bilinear', align_corners=False).squeeze(0)

        shape = np.minimum(feat_masks[i].shape, gt_mask.shape)
        gt_mask_new = gt_mask.new_zeros(gt_mask.shape[0], mask_h, mask_w)
        gt_mask_new[:gt_mask.shape[0], :shape[1], :shape[2]] = gt_mask[:gt_mask.shape[0], :shape[1], :shape[2]]
        gt_mask_new = gt_mask_new.gt(0.5).float()

        gt_mask_new = torch.index_select(gt_mask_new,0,idx_gt).permute(1, 2, 0).contiguous()

        # Four mask predictions per instance, one per 32-coefficient group;
        # they are combined by the cropping helper.
        img_mask1 = img_mask.permute(1,2,0)
        pos_masks00 = torch.sigmoid(img_mask1 @ cof_pred[:, 0:32].t())
        pos_masks01 = torch.sigmoid(img_mask1 @ cof_pred[:, 32:64].t())
        pos_masks10 = torch.sigmoid(img_mask1 @ cof_pred[:, 64:96].t())
        pos_masks11 = torch.sigmoid(img_mask1 @ cof_pred[:, 96:128].t())
        pred_masks = torch.stack([pos_masks00, pos_masks01, pos_masks10, pos_masks11], dim=0)
        pred_masks = self.crop_cuda(pred_masks, bbox_dt)
        gt_mask_crop = self.crop_gt_cuda(gt_mask_new, bbox_dt)

        # Per-pixel BCE, summed over the spatial dimensions and normalised
        # by box width, box height and number of instances.
        pre_loss = F.binary_cross_entropy(pred_masks, gt_mask_crop, reduction='none')
        pos_get_csize = center_size(bbox_dt)
        gt_box_width = pos_get_csize[:, 2]
        gt_box_height = pos_get_csize[:, 3]
        pre_loss = pre_loss.sum(dim=(0, 1)) / gt_box_width / gt_box_height / pos_get_csize.shape[0]
        loss_mask += torch.sum(pre_loss*weighting.detach())

        if self.rescoring_flag:
            # Mask-IoU head: predicts, per instance and class, the IoU of
            # the (detached) predicted mask with its gt mask.
            pos_labels = labels[pos_inds] - 1
            input_iou = pred_masks.detach().unsqueeze(0).permute(3, 0, 1, 2)
            pred_iou = self.convs_scoring(input_iou)
            pred_iou = self.relu(self.mask_scoring(pred_iou))
            pred_iou = F.max_pool2d(pred_iou, kernel_size=pred_iou.size()[2:]).squeeze(-1).squeeze(-1)
            pred_iou = pred_iou[range(pred_iou.size(0)), pos_labels]
            with torch.no_grad():
                # Targets: IoU of the mask thresholded at 0.4 with the full
                # gt mask; instances with a tiny IoU or a gt area below
                # 10 * 10 get zero weight.
                mask_pred = (pred_masks > 0.4).float()
                mask_pred_areas = mask_pred.sum((0, 1))
                overlap_areas = (mask_pred * gt_mask_new).sum((0, 1))
                gt_full_areas = gt_mask_new.sum((0, 1))
                iou_targets = overlap_areas / (mask_pred_areas + gt_full_areas - overlap_areas + 0.1)
                iou_weights = ((iou_targets > 0.1) & (iou_targets <= 1.0) & (gt_full_areas >= 10 * 10)).float()

            loss_iou += self.loss_iou(pred_iou.view(-1, 1), iou_targets.view(-1, 1), iou_weights.view(-1, 1))
            num_iou += torch.sum(iou_weights.detach())

    loss_mask = loss_mask/num_imgs
    if self.rescoring_flag:
        loss_iou = loss_iou * 10 / num_iou.detach()
        return dict(
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_centerness=loss_centerness,
            loss_mask=loss_mask,
            loss_iou=loss_iou)
    else:
        return dict(
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_centerness=loss_centerness,
            loss_mask=loss_mask)
def isr_p(cls_score,
          bbox_pred,
          bbox_targets,
          rois,
          sampling_results,
          loss_cls,
          bbox_coder,
          k=2,
          bias=0,
          num_class=80):
    """Importance-based Sample Reweighting (ISR_P), positive part.

    Positive samples are ranked by IoU, first inside each GT object and then
    inside each class (IoU-HLR). The rank is mapped to a new classification
    weight, and the weights are rescaled so that the weighted positive
    classification loss keeps its original total.

    Note:
        ``label_weights`` (the second item of ``bbox_targets``) is updated
        in place for the positive samples and is also returned.

    Args:
        cls_score (Tensor): Predicted classification scores.
        bbox_pred (Tensor): Predicted bbox deltas.
        bbox_targets (tuple[Tensor]): A tuple of bbox targets, which are
            labels, label_weights, bbox_targets, bbox_weights, respectively.
        rois (Tensor): Anchors (single_stage) in shape (n, 4) or RoIs
            (two_stage) in shape (n, 5).
        sampling_results (obj): Sampling results, one per image. Only
            ``pos_assigned_gt_inds`` of each item is read.
        loss_cls (func): Classification loss func of the head. It is called
            with ``reduction_override='none'``.
        bbox_coder (obj): BBox coder of the head. Only ``decode`` is used.
        k (float): Power of the non-linear mapping.
        bias (float): Shift of the non-linear mapping.
        num_class (int): Number of classes, default: 80. Labels in
            ``[0, num_class)`` are treated as positive.

    Return:
        tuple([Tensor]): labels, imp_based_label_weights, bbox_targets,
            bbox_target_weights
    """
    labels, label_weights, bbox_targets, bbox_weights = bbox_targets
    pos_label_inds = ((labels >= 0) &
                      (labels < num_class)).nonzero().reshape(-1)
    pos_labels = labels[pos_label_inds]

    # if no positive samples, return the original targets
    num_pos = float(pos_label_inds.size(0))
    if num_pos == 0:
        return labels, label_weights, bbox_targets, bbox_weights

    # Merge pos_assigned_gt_inds of every image into a single tensor.
    # The offset must accumulate over images: resetting it to
    # ``gt_i.max() + 1`` (offset of the current image only) lets GT ids of a
    # third or later image collide with ids of earlier images, which would
    # rank samples of different objects in the same group.
    gts = []
    gt_offset = 0
    for sampling_result in sampling_results:
        gt_i = sampling_result.pos_assigned_gt_inds
        gts.append(gt_i + gt_offset)
        if len(gt_i) != 0:
            gt_offset = gt_offset + gt_i.max() + 1
    gts = torch.cat(gts)
    assert len(gts) == num_pos

    # The re-weighting itself must not receive gradients.
    cls_score = cls_score.detach()
    bbox_pred = bbox_pred.detach()

    # For single stage detectors, rois here indicate anchors, in shape (N, 4)
    # For two stage detectors, rois are in shape (N, 5)
    if rois.size(-1) == 5:
        pos_rois = rois[pos_label_inds][:, 1:]
    else:
        pos_rois = rois[pos_label_inds]

    if bbox_pred.size(-1) > 4:
        # class-specific deltas: pick the delta of the target class
        bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4)
        pos_delta_pred = bbox_pred[pos_label_inds, pos_labels].view(-1, 4)
    else:
        pos_delta_pred = bbox_pred[pos_label_inds].view(-1, 4)

    # compute iou of the predicted bbox and the corresponding GT
    pos_delta_target = bbox_targets[pos_label_inds].view(-1, 4)
    pos_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_pred)
    target_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_target)
    ious = bbox_overlaps(pos_bbox_pred, target_bbox_pred, is_aligned=True)

    # Indexing with a tensor copies, so the in-place updates below do not
    # touch ``label_weights`` until the final assignment.
    pos_imp_weights = label_weights[pos_label_inds]
    # Two steps to compute IoU-HLR. Samples are first sorted by IoU locally,
    # then sorted again within the same-rank group
    max_l_num = pos_labels.bincount().max()
    for label in pos_labels.unique():
        l_inds = (pos_labels == label).nonzero().view(-1)
        l_gts = gts[l_inds]
        for t in l_gts.unique():
            t_inds = l_inds[l_gts == t]
            t_ious = ious[t_inds]
            _, t_iou_rank_idx = t_ious.sort(descending=True)
            _, t_iou_rank = t_iou_rank_idx.sort()
            # lift the IoU by the (inverted) local rank so that samples of
            # the same local rank end up next to each other in the class sort
            ious[t_inds] += max_l_num - t_iou_rank.float()
        l_ious = ious[l_inds]
        _, l_iou_rank_idx = l_ious.sort(descending=True)
        _, l_iou_rank = l_iou_rank_idx.sort()  # IoU-HLR
        # linearly map HLR to label weights
        pos_imp_weights[l_inds] *= (max_l_num - l_iou_rank.float()) / max_l_num

    pos_imp_weights = (bias + pos_imp_weights * (1 - bias)).pow(k)

    # normalize to make the new weighted loss value equal to the original loss
    pos_loss_cls = loss_cls(
        cls_score[pos_label_inds], pos_labels, reduction_override='none')
    if pos_loss_cls.dim() > 1:
        ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds][:, None]
        new_pos_loss_cls = pos_loss_cls * pos_imp_weights[:, None]
    else:
        ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds]
        new_pos_loss_cls = pos_loss_cls * pos_imp_weights
    pos_loss_cls_ratio = ori_pos_loss_cls.sum() / new_pos_loss_cls.sum()
    pos_imp_weights = pos_imp_weights * pos_loss_cls_ratio
    label_weights[pos_label_inds] = pos_imp_weights

    bbox_targets = labels, label_weights, bbox_targets, bbox_weights
    return bbox_targets
def loss_single(self, anchors, cls_score, bbox_pred, labels, label_weights,
                bbox_targets, stride, soft_targets, num_total_samples):
    """Compute the losses of one scale level.

    All inputs are flattened to one row per anchor. The box, DFL and LD
    losses are evaluated on the foreground anchors only, while the
    classification (QFL) loss is evaluated on every anchor, using the IoU
    between the decoded prediction and its target as the quality score.

    Args:
        anchors (Tensor): Box reference of this level; reshaped to (-1, 4).
        cls_score (Tensor): Joint cls / quality scores, a 4-D tensor with
            the channel axis at dim 1 (reshaped to
            (-1, ``self.cls_out_channels``)).
        bbox_pred (Tensor): Box distribution logits, a 4-D tensor with
            ``4 * (self.reg_max + 1)`` channels at dim 1.
        labels (Tensor): Label of every anchor. Values in
            ``[0, self.num_classes)`` are foreground.
        label_weights (Tensor): Weight of every anchor for the cls loss.
        bbox_targets (Tensor): Regression target boxes; reshaped to (-1, 4).
        stride (tuple): Stride of this level; both entries must be equal.
        soft_targets (Tensor): Same layout as ``bbox_pred``; used as the
            target of the LD loss (presumably the teacher's distribution
            logits -- confirm against the caller).
        num_total_samples (int): ``avg_factor`` of the cls loss.

    Returns:
        tuple[Tensor]: ``(loss_cls, loss_bbox, loss_dfl, loss_ld,
        weight_targets.sum())``.
    """
    assert stride[0] == stride[1], 'h stride is not equal to w stride!'
    level_stride = stride[0]
    num_bins = self.reg_max + 1

    # Bring every input into a "one row per anchor" layout.
    anchors = anchors.reshape(-1, 4)
    cls_score = cls_score.permute(0, 2, 3, 1)
    cls_score = cls_score.reshape(-1, self.cls_out_channels)
    bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4 * num_bins)
    soft_targets = soft_targets.permute(0, 2, 3, 1)
    soft_targets = soft_targets.reshape(-1, 4 * num_bins)
    bbox_targets = bbox_targets.reshape(-1, 4)
    labels = labels.reshape(-1)
    label_weights = label_weights.reshape(-1)

    # Foreground ids are [0, num_classes - 1]; num_classes marks background.
    is_foreground = (labels >= 0) & (labels < self.num_classes)
    pos_inds = is_foreground.nonzero().squeeze(1)
    # IoU quality target of the cls loss; stays zero for non-foreground rows.
    score = label_weights.new_zeros(labels.shape)

    if len(pos_inds) == 0:
        # No foreground anchor: emit zero losses that are still attached to
        # the graph of bbox_pred.
        loss_ld = bbox_pred.sum() * 0
        loss_bbox = bbox_pred.sum() * 0
        loss_dfl = bbox_pred.sum() * 0
        weight_targets = bbox_pred.new_tensor(0)
    else:
        fg_bbox_pred = bbox_pred[pos_inds]
        # Work in units of the level stride.
        fg_centers = self.anchor_center(anchors[pos_inds]) / level_stride

        # Each foreground anchor is weighted by its highest class score.
        weight_targets = cls_score.detach().sigmoid()
        weight_targets = weight_targets.max(dim=1)[0][pos_inds]

        fg_decoded_pred = distance2bbox(fg_centers,
                                        self.integral(fg_bbox_pred))
        fg_decoded_targets = bbox_targets[pos_inds] / level_stride
        score[pos_inds] = bbox_overlaps(
            fg_decoded_pred.detach(), fg_decoded_targets, is_aligned=True)

        # One row per box side for the distribution-based losses.
        pred_corners = fg_bbox_pred.reshape(-1, num_bins)
        soft_corners = soft_targets[pos_inds].reshape(-1, num_bins)
        target_corners = bbox2distance(fg_centers, fg_decoded_targets,
                                       self.reg_max).reshape(-1)

        # regression loss
        loss_bbox = self.loss_bbox(
            fg_decoded_pred,
            fg_decoded_targets,
            weight=weight_targets,
            avg_factor=1.0)

        # dfl loss
        loss_dfl = self.loss_dfl(
            pred_corners,
            target_corners,
            weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
            avg_factor=4.0)

        # ld loss
        loss_ld = self.loss_ld(
            pred_corners,
            soft_corners,
            weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
            avg_factor=4.0)

    # cls (qfl) loss
    loss_cls = self.loss_cls(
        cls_score, (labels, score),
        weight=label_weights,
        avg_factor=num_total_samples)

    return loss_cls, loss_bbox, loss_dfl, loss_ld, weight_targets.sum()
def assign_ids(self, x, prop_bboxes, asso_probs, det_bboxes, det_labels,
               img_meta, rescale):
    """Assign a track id to every detection of the current frame.

    The matching score integrates
    1. Association score with softmax
    2. Semantic consistency
    3. Spatial overlap (only in consecutive frames)
    4. Detection confidence

    Detections are first matched to the propagated boxes of the previous
    frame (short-term), then -- if ``cfg.use_reid`` -- to the stored
    embeddings (long-term). Remaining detections whose score exceeds
    ``cfg.new_obj_score_thre`` start new tracks, which are registered through
    ``self.update``.

    Args:
        x: Features, only forwarded to ``self.get_new_embeds``.
        prop_bboxes (Tensor): Propagated boxes; the first four columns are
            used as the box and the last column is compared against
            ``cfg.prop_score_thre``.
        asso_probs (Tensor): Association outputs with either as many columns
            as there are stored tracks, or one more.
        det_bboxes (Tensor): Detections; the first four columns are used as
            the box and the last column as the score.
        det_labels (Tensor): Label of every detection.
        img_meta: Only forwarded to ``self.get_new_embeds``.
        rescale: Only forwarded to ``self.get_new_embeds``.

    Returns:
        Tensor: Long tensor with one id per detection; ``-1`` marks a
        detection that was neither matched nor started a new track.
    """
    cfg = self.test_cfg.track
    # id init: every detection starts unassigned (-1)
    ids = torch.zeros_like(det_bboxes[:, 1]).long() - 1
    # Get semantic consistency, as a flag for assigning ids.
    cat_same = (self.labels == det_labels.view(-1, 1)).float()
    # column 0 is an extra all-ones column in front of the stored tracks,
    # so track t lives at column t + 1
    cat_dummy = cat_same.new_ones(cat_same.size(0), 1)
    cat_same = torch.cat((cat_dummy, cat_same), dim=1)
    # calculate feature appearance similarity
    # columns of tracks that vanished for fewer than long_term_frames
    # frames, plus the extra column 0
    valid_t_idxs = torch.nonzero(
        self.vanish_frames < cfg.long_term_frames).squeeze(1) + 1
    valid_t_idxs = torch.cat(
        (torch.tensor([0], dtype=torch.long, device=valid_t_idxs.device),
         valid_t_idxs))
    if asso_probs.size(-1) < cat_same.size(-1):
        # Sigmoid or Cosine
        assert asso_probs.size(-1) + 1 == cat_same.size(-1)
        asso_scores = torch.zeros_like(cat_same)
        # NOTE(review): asso_scores was just zero-filled, so this condition
        # is never true and the sigmoid scores are never written. It looks
        # like asso_probs was meant to be tested -- confirm. The placement
        # of the following `+= 0.5` outside this `if` is inferred.
        if asso_scores.max() > 1.0:
            asso_scores[:, 1:] = torch.sigmoid(asso_probs)
        asso_scores[:, 0] += 0.5
    else:
        # Softmax
        assert asso_probs.size(-1) == cat_same.size(-1)
        asso_scores = torch.zeros_like(cat_same)
        # softmax over the valid columns only; the others stay zero
        asso_scores[:, valid_t_idxs] = F.softmax(
            asso_probs[:, valid_t_idxs], dim=1)
    # [N_det, N_emb + 1]
    # zero out scores that do not exceed the association threshold
    asso_scores *= (asso_scores > cfg.asso_score_thre).float()
    # get overlaps under short-term tracking
    overlaps = torch.zeros_like(cat_same)
    valid_prop = prop_bboxes[:, -1] > cfg.prop_score_thre
    prop_bboxes = prop_bboxes[valid_prop, :]
    prev_ids = self.prev_ids[valid_prop]
    prop_overlaps = bbox_overlaps(det_bboxes[:, :4], prop_bboxes[:, :4])
    # scatter the overlaps into the columns of the tracks they belong to
    overlaps[:, prev_ids + 1] = prop_overlaps
    overlaps *= (overlaps > cfg.prop_overlap_thre).float()
    # short-term matching according to overlaps
    short_scores = overlaps * cat_same
    if cfg.clean_before_short_assign:
        # drop low-score detections before matching
        valid_dets = det_bboxes[:, -1] > cfg.new_obj_score_thre
        valid_dets = valid_dets.view(-1, 1).repeat(1, short_scores.size(1))
        short_scores = short_scores * valid_dets.float()
    if prev_ids.shape[0] > 0 and (short_scores > 0).any():
        # entries < 0 returned by max_matching are treated as "no match"
        t2d_idxs = max_matching(short_scores[:, prev_ids + 1])
        is_match = t2d_idxs >= 0
        t2d_idxs = t2d_idxs[is_match]
        # from here on prev_ids only holds the tracks matched short-term
        prev_ids = prev_ids[is_match]
        ids[t2d_idxs.tolist()] = prev_ids
    if cfg.prop_fn:
        raise NotImplementedError
    if cfg.use_reid:
        # long-term association
        valid_dets = ids < 0
        if cfg.clean_before_long_assign:
            valid_dets *= det_bboxes[:, -1] > cfg.new_obj_score_thre
        long_scores = asso_scores * cat_same
        valid_embeds = self.vanish_frames < cfg.long_term_frames
        if prev_ids.shape[0] > 0:
            # tracks listed in prev_ids are excluded from long-term matching
            valid_embeds[prev_ids] = 0
        # broadcast the per-track validity over all detections
        long_scores[:, 1:] *= valid_embeds.float().view(-1, 1).repeat(
            1, long_scores.size(0)).transpose(1, 0)
        if valid_dets.any() and (long_scores[valid_dets, :] > 0).any():
            valid_d_idxs = torch.nonzero(valid_dets == 1).squeeze(1)
            # the "- 1" shifts the result back by the extra column 0, so a
            # negative value again means "no match"
            d2t_idxs = max_matching(long_scores[valid_d_idxs, :].t()) - 1
            is_match = d2t_idxs >= 0
            ids[valid_d_idxs[is_match]] = d2t_idxs[is_match]
    # new objects
    valid_dets = ids < 0
    valid_dets *= det_bboxes[:, -1] > cfg.new_obj_score_thre
    valid_idxs = torch.nonzero(valid_dets > 0).squeeze(1).tolist()
    if len(valid_idxs) > 0:
        # new ids continue after the currently stored embeddings
        for i, valid_idx in enumerate(valid_idxs):
            ids[valid_idx] = self.embeddings.size(0) + i
        new_track_bboxes = det_bboxes[valid_idxs, :]
        new_track_labels = det_labels[valid_idxs]
        vanish_frames = torch.zeros_like(new_track_labels)
        bbox_embeds, track_embeds = self.get_new_embeds(
            x, new_track_bboxes, img_meta, rescale)
        self.update(
            type='contact',
            embeddings=track_embeds,
            bbox_embeds=bbox_embeds,
            tracklet_scores=new_track_bboxes[:, -1].detach().clone(),
            bboxes=new_track_bboxes.detach().clone(),
            labels=new_track_labels,
            vanish_frames=vanish_frames)
    return ids
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    """Run one training forward pass and collect the losses.

    On top of the RPN, bbox and mask losses, the bbox branch builds per-RoI
    features (IoU between the decoded positive box and its GT, softmax
    probability of the target class, unreduced cls / bbox losses and a
    per-RoI mask loss) and feeds them to ``self.uncertainty_predictor``.
    The three predicted columns are read as cls / reg / mask values; they
    are clamped, added to the losses as regularisation terms, and
    ``exp(-value)`` is used to re-weight the bbox-head targets and the mask
    loss.

    Args:
        img (Tensor): Input batch; ``img.size(0)`` is the number of images.
        img_meta: Forwarded to the RPN head.
        gt_bboxes: Ground-truth boxes, indexed per image.
        gt_labels: Ground-truth labels, indexed per image.
        gt_bboxes_ignore: Optional per-image boxes to ignore; ``None`` is
            expanded to a list of ``None``.
        gt_masks: Ground-truth masks, forwarded to
            ``self.mask_head.get_target``.
        proposals: Used as the proposal list when ``self.with_rpn`` is
            false.

    Returns:
        dict: All loss terms and logged statistics collected in ``losses``.
    """
    # side effect: count the training iterations
    self.global_step += 1
    x = self.extract_feat(img)
    losses = dict()
    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)
        # fall back to the test-time RPN config when no training-time
        # proposal config is given
        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals
    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(
                proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
                gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)
    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)
        bbox_targets = self.bbox_head.get_target(
            sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn)
        # start creating features for input
        # bbox_targets[0] is used as the per-RoI label; label > 0 is
        # treated as positive
        pos_inds = bbox_targets[0] > 0
        cls_score_post_softmax = cls_score.softmax(dim=1)
        # softmax probability of the target class of every positive RoI
        pos_probs_single_item = cls_score_post_softmax[
            pos_inds, bbox_targets[0][pos_inds]]
        # deltas predicted for the target class of every positive RoI
        pos_bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4)[
            pos_inds, bbox_targets[0][pos_inds]]
        # simple trick to remove NaN values (NaN is the only value for
        # which x != x holds)
        pos_bbox_pred[pos_bbox_pred != pos_bbox_pred] = 0
        pos_gts = torch.cat([k.pos_gt_bboxes for k in sampling_results],
                            dim=0)
        pos_proposal = torch.cat([k.pos_bboxes for k in sampling_results],
                                 dim=0)
        target_means = self.bbox_head.target_means
        target_stds = self.bbox_head.target_stds
        pos_bbox = delta2bbox(
            pos_proposal,
            pos_bbox_pred,
            means=target_means,
            stds=target_stds)
        pos_ious = bbox_overlaps(pos_gts, pos_bbox, is_aligned=True)
        pos_ious = pos_ious.view(-1, )
        # scatter the positive-only features into full-length vectors;
        # negative RoIs keep the value 0
        total_ious = pos_ious.new_full((pos_inds.numel(),), 0.0)
        total_ious[pos_inds] = pos_ious
        total_probs_single_item = pos_probs_single_item.new_full(
            (pos_inds.numel(),), 0.0)
        total_probs_single_item[pos_inds] = pos_probs_single_item
        # NOTE(review): the extent of this no_grad block is inferred (the
        # mask-loss feature below is used without detach()) -- confirm.
        with torch.no_grad():
            # unreduced losses, only used as predictor inputs
            loss_bbox_as_features = self.bbox_head.loss(
                cls_score,
                bbox_pred,
                *bbox_targets,
                reduction_override="none")
            cls_loss_as_feature = loss_bbox_as_features[
                "loss_cls"].detach().data
            bbox_loss_as_feature = loss_bbox_as_features[
                "loss_bbox"].detach().data
            # NOTE(review): the mask head is used here even though this
            # branch is only guarded by self.with_bbox.
            if not self.share_roi_extractor:
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = self.mask_roi_extractor(
                    x[:self.mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                mask_feats = bbox_feats[pos_inds]
            mask_pred = self.mask_head(mask_feats)
            mask_targets = self.mask_head.get_target(
                sampling_results, gt_masks, self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(
                mask_pred, mask_targets, pos_labels, sample_weights=None)
            # reduce to one value per positive RoI, shape (num_pos, 1)
            loss_mask = loss_mask.mean(dim=1).mean(dim=1).view(-1, 1)
        # logged statistics of the raw per-sample losses
        losses.update({
            "pos_cls_loose_value": cls_loss_as_feature[pos_inds].mean(),
            "neg_cls_loose_value": cls_loss_as_feature[~pos_inds].mean(),
            "reg_loose_value": bbox_loss_as_feature.sum(dim=1).mean(),
        })
        # expand the positive-only bbox / mask losses to one row per RoI
        bbox_loss_as_feature_full = torch.zeros(
            (pos_inds.numel(), bbox_loss_as_feature.shape[1]),
            device=bbox_loss_as_feature.device).type(
                bbox_loss_as_feature.type())
        bbox_loss_as_feature_full[pos_inds] = bbox_loss_as_feature
        loss_mask_as_feature_full = torch.zeros(
            (pos_inds.numel(), 1), device=loss_mask.device).type(
                loss_mask.type())
        loss_mask_as_feature_full[pos_inds] = loss_mask
        uncertainty_prediction = self.uncertainty_predictor(
            total_ious.detach().data,
            total_probs_single_item.detach().data, cls_loss_as_feature,
            bbox_loss_as_feature_full, loss_mask_as_feature_full)
        # columns 0 / 1 / 2 are read as the cls / reg / mask predictions
        uncertainty_prediction_cls = uncertainty_prediction[:, 0]
        uncertainty_prediction_reg = uncertainty_prediction[:, 1]
        uncertainty_prediction_mask = uncertainty_prediction[:, 2]
        uncertainty_prediction_cls = torch.clamp(
            uncertainty_prediction_cls,
            min=self.cls_prediction_min,
            max=self.cls_prediction_max)
        uncertainty_prediction_reg = torch.clamp(
            uncertainty_prediction_reg,
            min=self.reg_prediction_min,
            max=self.reg_prediction_max)
        # NOTE(review): the mask prediction is clamped with the *reg*
        # bounds -- confirm this is intended.
        uncertainty_prediction_mask = torch.clamp(
            uncertainty_prediction_mask,
            min=self.reg_prediction_min,
            max=self.reg_prediction_max)
        # replace the per-sample cls prediction by its mean over the
        # negatives and over the positives respectively
        negative_avg = uncertainty_prediction_cls[~pos_inds].mean()
        uncertainty_prediction_cls[~pos_inds] = torch.ones_like(
            uncertainty_prediction_cls[~pos_inds]) * negative_avg
        positive_avg = uncertainty_prediction_cls[pos_inds].mean()
        uncertainty_prediction_cls[pos_inds] = torch.ones_like(
            uncertainty_prediction_cls[pos_inds]) * positive_avg
        # regularisation terms on the raw (pre-exp) predictions
        uncertainty_prediction_cls_for_regularization_pos = (
            uncertainty_prediction_cls[pos_inds].mean() *
            self.uncertainty_cls_weight)
        uncertainty_prediction_cls_for_regularization_neg = (
            uncertainty_prediction_cls[~pos_inds].mean() *
            self.negative_regularization)
        losses.update({
            "loss_uncertainty_cls_pos":
            uncertainty_prediction_cls_for_regularization_pos})
        losses.update({
            "loss_uncertainty_cls_neg":
            uncertainty_prediction_cls_for_regularization_neg})
        losses.update({
            "loss_uncertainty_reg":
            uncertainty_prediction_reg[pos_inds].mean() *
            self.uncertainty_reg_weight})
        losses.update({
            "loss_uncertainty_mask":
            uncertainty_prediction_mask[pos_inds].mean() *
            self.uncertainty_mask_weight})
        # turn the predictions into multiplicative weights exp(-value)
        uncertainty_prediction_reg = torch.exp(
            -1. * uncertainty_prediction_reg)
        uncertainty_prediction_cls = torch.exp(
            -1. * uncertainty_prediction_cls)
        uncertainty_prediction_mask = torch.exp(
            -1. * uncertainty_prediction_mask)
        # logged statistics of the resulting weights
        losses.update({
            "cls_prediction_pos":
            uncertainty_prediction_cls[pos_inds].mean(),
            "cls_prediction_neg":
            uncertainty_prediction_cls[~pos_inds].mean(),
            "cls_prediction_reg":
            uncertainty_prediction_reg[pos_inds].mean(),
        })
        uncertainty_prediction_mask_pos = uncertainty_prediction_mask[
            pos_inds]
        # scale entries 1 and 3 of the targets (presumably the label
        # weights and the bbox weights -- confirm against get_target)
        bbox_targets_weighted = [m.detach().data for m in bbox_targets]
        bbox_targets_weighted[3] = bbox_targets_weighted[
            3] * uncertainty_prediction_reg.view(-1, 1)
        bbox_targets_weighted[1] = bbox_targets_weighted[
            1] * uncertainty_prediction_cls.view(-1, )
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets_weighted)
        losses.update(loss_bbox)
    # mask head forward and loss
    # NOTE(review): this branch reads uncertainty_prediction_mask_pos (and,
    # with a shared RoI extractor, bbox_feats), which are only defined when
    # the bbox branch above ran.
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            # rebuild a positive mask: ones for the positive boxes of each
            # image followed by zeros for its negative boxes
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(
            sampling_results, gt_masks, self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        # the mask loss is weighted per positive RoI by exp(-mask value)
        loss_mask = self.mask_head.loss(
            mask_pred,
            mask_targets,
            pos_labels,
            sample_weights=uncertainty_prediction_mask_pos)
        losses.update(loss_mask)
    return losses