def forward(self, outputs, targets): """ Performs the matching Params: "preds_prob": Tensor of dim [num_queries, num_classes] with the classification logits "preds_boxes": Tensor of dim [num_queries, 4] with the predicted box coordinates "gt_bboxes": Tensor of dim [num_target_boxes, 5] [x1, y1, x2, y2, label] Returns: list of tensor of dim [num_queries] with idx of corresponding GT, -1 for background, -2 for ignore """ result = [] for preds_prob, preds_boxes, t in zip(outputs['pred_logits'], outputs['pred_boxes'], targets): preds_prob = preds_prob.sigmoid() gt_bboxes = torch.cat((t['boxes'], t['labels'].unsqueeze(-1).float()),dim=-1) ig_bboxes = t['iboxes'] K = preds_prob.shape[0] target_gt = gt_bboxes.new_full((K,), self.NEGATIVE_TARGET, dtype=torch.int64) target_gt_iou = gt_bboxes.new_full((K,), 0) pos_mask = gt_bboxes.new_zeros((K,), dtype=torch.bool) if gt_bboxes.numel() > 0: tgt_ids = gt_bboxes[:, 4].long() tgt_bbox = gt_bboxes[:, :4] alpha = 0.25 gamma = 2.0 neg_cost_class = (1 - alpha) * (preds_prob ** gamma) * (-(1 - preds_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - preds_prob) ** gamma) * (-(preds_prob + 1e-8).log()) cost_class = pos_cost_class - neg_cost_class cost_bbox = torch.cdist(preds_boxes, tgt_bbox, p=1) cost_giou, overlaps = generalized_box_iou_(box_cxcywh_to_xyxy(preds_boxes), box_cxcywh_to_xyxy(tgt_bbox)) cost_giou = -cost_giou C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou C = C.cpu() src_idx, tgt_idx = linear_sum_assignment(C) src_idx = torch.from_numpy(src_idx).to(device=gt_bboxes.device, dtype=torch.int64) tgt_idx = torch.from_numpy(tgt_idx).to(device=gt_bboxes.device, dtype=torch.int64) target_gt[src_idx] = tgt_idx target_gt_iou[src_idx] = overlaps[src_idx, tgt_idx] pos_mask[src_idx] = True if ig_bboxes.numel() > 0: ign_bbox = ig_bboxes[:, :4] overlaps = box_iof(box_cxcywh_to_xyxy(preds_boxes), box_cxcywh_to_xyxy(ign_bbox)) dt_to_ig_max, _ = overlaps.max(dim=1) ignored_dt_mask = dt_to_ig_max >= self.ignore_iou_thresh ignored_dt_mask = (ignored_dt_mask ^ (ignored_dt_mask & pos_mask)) target_gt[ignored_dt_mask] = self.IGNORE_TARGET result.append(target_gt) return result
def loss_boxes(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the bounding boxes: the L1 regression loss and the GIoU loss.

    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert 'pred_boxes' in outputs
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs['pred_boxes'][idx]  # [#matched queries, 4], in matching order
    target_boxes = torch.cat(
        [t['boxes'][i] for t, (_, i) in zip(targets, indices)],
        dim=0)  # [#boxes, 4], in matching order

    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

    losses = {}
    losses['loss_bbox'] = loss_bbox.sum() / num_boxes

    # take the diagonal since each src box is already matched to its target
    loss_giou = 1 - torch.diag(
        generalized_box_iou(  # [#matched queries, #boxes]
            box_cxcywh_to_xyxy(src_boxes),
            box_cxcywh_to_xyxy(target_boxes)))
    losses['loss_giou'] = loss_giou.sum() / num_boxes
    return losses
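# `_get_src_permutation_idx` is referenced above; in DETR-style codebases it flattens the
# per-image (src, tgt) index pairs from the matcher into a (batch_idx, query_idx) pair
# usable for advanced indexing into [batch, num_queries, ...] tensors. A sketch of the
# standard helper (a method on the criterion class):
def _get_src_permutation_idx(self, indices):
    # batch index repeated once per matched query in that image
    batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
    src_idx = torch.cat([src for (src, _) in indices])
    return batch_idx, src_idx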
def forward(self, outputs, targets): """ Performs the matching Params: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ bs, num_queries = outputs["pred_logits"].shape[:2] # We flatten to compute the cost matrices in a batch out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] # Also concat the target labels and boxes tgt_ids = torch.cat([v["labels"] for v in targets]) tgt_bbox = torch.cat([v["boxes"] for v in targets]) # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. alpha = 0.25 gamma = 2.0 neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] # Compute the L1 cost between boxes cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) # Compute the giou cost betwen boxes cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) # Final cost matrix C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou C = C.view(bs, num_queries, -1).cpu() sizes = [len(v["boxes"]) for v in targets] indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
def inference_coco(self, box_cls, box_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (cx, cy, w, h) box regression values for every query.
        image_sizes (List[torch.Size]): the input image sizes

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    # Select the top-100 (query, class) pairs over the flattened query x class scores,
    # then recover the query index and class index of each selected pair.
    prob = box_cls.sigmoid()
    topk_values, topk_indexes = torch.topk(prob.view(box_cls.shape[0], -1), 100, dim=1)
    scores = topk_values
    topk_boxes = topk_indexes // box_cls.shape[2]
    labels = topk_indexes % box_cls.shape[2]

    box_pred = box_cxcywh_to_xyxy(box_pred)
    box_pred = torch.gather(box_pred, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

    for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) in enumerate(
            zip(scores, labels, box_pred, image_sizes)):
        result = Instances(image_size)
        result.pred_boxes = Boxes(box_pred_per_image)
        # boxes are normalized; scale back to absolute image coordinates
        result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append({"instances": result})
    return results
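# Worked example of the flattened top-k decomposition above: with K = box_cls.shape[2] = 3
# classes, a flattened score index of 7 belongs to query 7 // 3 = 2 and class 7 % 3 = 1,
# so `topk_boxes` selects which query each detection comes from and `labels` its class:
#
#   K = 3
#   flat_idx = torch.tensor([7, 11])
#   flat_idx // K  # -> tensor([2, 3])  (query index)
#   flat_idx % K   # -> tensor([1, 2])  (class index)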
def inference_ch(self, box_cls, box_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (cx, cy, w, h) box regression values for every query.
        image_sizes (Tensor): tensor of shape (batch_size, 2) holding (h, w) for each image.

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    # Single-class inference: keep every query's score and assign label 1 to all of them.
    prob = box_cls.sigmoid()
    scores = prob  # [bs, num_query, 1]
    labels = torch.ones_like(scores, dtype=torch.int64, device=scores.device)  # [bs, num_query, 1]

    # scale normalized boxes back to absolute image coordinates
    box_pred = box_cxcywh_to_xyxy(box_pred)
    img_h, img_w = image_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    box_pred = box_pred * scale_fct[:, None, :]

    for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) in enumerate(
            zip(scores, labels, box_pred, image_sizes)):
        result = Instances(image_size)
        result.pred_boxes = box_pred_per_image
        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append({"instances": result})
    return results
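# Both inference paths rely on `box_cxcywh_to_xyxy`, imported from the box utils elsewhere
# in the repo. For reference, the standard DETR-style conversion from (cx, cy, w, h) to
# (x1, y1, x2, y2):
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)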
def forward(self, q, q_pos, k, k_pos, key_padding_mask=None, pos_centers=None, spatial_shape=None):
    M_ = self.dec_sampling_heads
    P_ = self.dec_sampling_points
    F_ = self.feature_levels
    N_, C_, S_ = k.shape  # encoder memory: [bs, C, sum of H*W over levels]
    L_ = q.shape[0]
    spatial_shape_, valid_sizes, valid_scales = spatial_shape

    # [bs, #level, 2] -> [1, nhead*bs, 1, #level, 2]
    valid_sizes = valid_sizes.view(1, N_, 1, F_, 2).repeat_interleave(M_, 1)
    valid_scales = 2 * valid_scales.view(1, N_, 1, F_, 2).repeat_interleave(M_, 1)

    # project the memory to values and zero out padded positions
    value = self.value_conv(k.unsqueeze(-1)).squeeze(-1)
    value = value.masked_fill(key_padding_mask.view(N_, 1, S_), float(0))
    spatial_splits = [H_ * W_ for H_, W_ in spatial_shape_]
    value_list = torch.split(value, spatial_splits, dim=-1)
    value_list = [
        value_.view(N_ * M_, C_ // M_, H_, W_)
        for value_, (H_, W_) in zip(value_list, spatial_shape_)
    ]

    weights = self.sampling_weight(q).view(L_, N_ * M_, 1, F_ * P_).softmax(3)
    # [L, bs, C] -> [L, nhead*bs, #point, #level, 2]
    grids = self.sampling_locs(q).view(L_, N_ * M_, P_, F_, 2)

    # [L, N, 4] -> [bs*nhead, L, 4]
    pos_centers = pos_centers.permute(1, 0, 2).sigmoid().repeat_interleave(M_, 0)
    # [bs*nhead, L, 2 (wh)] -> [L, bs*nhead, 1, 1, 2]
    wh = pos_centers[:, :, 2:].permute(1, 0, 2).view(L_, N_ * M_, 1, 1, 2)

    # fixed grid of reference points inside each box: [L, nhead*bs, #point, #level, 2]
    grid_pts = torch.zeros((L_, M_, P_, F_, 2), dtype=weights.dtype, device=weights.device)
    for h_i in range(M_):
        for i in range(P_):
            grid_pts[:, h_i, i, :, 0] = ((i % int(self.pool_resolution[1])) + 0.5) / self.pool_resolution[1]
            grid_pts[:, h_i, i, :, 1] = (h_i + 0.5) / M_
    grid_pts = grid_pts.repeat(1, N_, 1, 1, 1)
    grid_pts *= wh

    # [bs*nhead, L, 4] -> [L, bs*nhead, 1, 1, 2] top-left corner of each box
    boxes_xy = box_ops.box_cxcywh_to_xyxy(pos_centers)[:, :, :2].permute(1, 0, 2).view(L_, N_ * M_, 1, 1, -1)
    # map sampling offsets into the normalized [-1, 1] coordinates that grid_sample expects
    grids = ((grids * wh / P_) + boxes_xy + grid_pts) * valid_scales - 1

    # [L, bs*nhead, #point, #level, 2] -> [#level, bs*nhead, L, #point, 2]
    grids = grids.permute(3, 1, 0, 2, 4)
    samples_value_list = [
        F.grid_sample(value_, grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
        for value_, grid_ in zip(value_list, grids)
    ]
    # [bs*nhead, C/nhead, L, #level*#point]
    samples_value = torch.cat(samples_value_list, -1)
    # [bs*nhead, 1, L, #level*#point]
    weights = weights.permute(1, 2, 0, 3)
    # weighted sum over all points and levels: [bs*nhead, C/nhead, L] -> [L, N, C]
    output = torch.sum(samples_value * weights, -1).permute(2, 0, 1).view(L_, N_, C_)
    output = self.output_proj(output)

    # [#level, bs*nhead, L, #point, 2] -> [bs, L, #level, nhead, #point, 2]
    output_sample_pts = ((grids + 1.0) / 2.0).view(F_, N_, M_, L_, P_, 2).permute(1, 3, 0, 2, 4, 5)
    # [bs*nhead, 1, L, #level*#point] -> [bs, L, #level, nhead, #point]
    output_sample_weights = weights.view(N_, M_, L_, F_, P_).permute(0, 2, 3, 1, 4)
    # concat the attention weight to the sampled points; last dim holds (cx, cy, weight)
    output_sample_attn = torch.cat((output_sample_pts, output_sample_weights[..., None]), -1)
    return output, output_sample_attn
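# Why `* valid_scales - 1` above: F.grid_sample expects sampling locations normalized to
# [-1, 1] over the feature map, so locations in [0, 1] are scaled by 2 and shifted by -1.
# A tiny standalone check of that convention (independent of this module):
#
#   feat = torch.arange(4.).view(1, 1, 2, 2)        # [N, C, H, W] = [[0, 1], [2, 3]]
#   grid = torch.tensor([[[[-0.5, -0.5]]]])         # (x, y): center of the top-left pixel
#   F.grid_sample(feat, grid, align_corners=False)  # -> tensor([[[[0.]]]])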
def forward(self, tgt, memory,
            tgt_mask: Optional[Tensor] = None,
            memory_mask: Optional[Tensor] = None,
            tgt_key_padding_mask: Optional[Tensor] = None,
            memory_key_padding_mask: Optional[Tensor] = None,
            pos: Optional[Tensor] = None,
            query_pos: Optional[Tensor] = None,
            pos_centers=None,
            spatial_shape=None):
    output = tgt

    intermediate = []
    intermediate_centers = []
    intermediate_dec_attns = []
    for lvl, layer in enumerate(self.layers):
        if self.dense_query is True:
            # workaround for the dense-query implementation: restrict each query's
            # self-attention to its 100 nearest neighbours in GIoU space
            outputs_coord = pos_centers.permute(1, 0, 2).sigmoid()
            nquery = outputs_coord.size(1)
            tgt_masks = []
            for pred in outputs_coord:
                tgt_masks_ = torch.zeros((nquery, nquery), device=pos_centers.device)
                boxes = box_cxcywh_to_xyxy(pred)
                score = 1 - generalized_box_iou(boxes, boxes)
                # torch.sort is faster than torch.topk on GPU here; returns a LongTensor
                top_idx = torch.sort(score, dim=-1)[1][:, :100]  # [nquery, 100]
                tgt_masks_.scatter_(1, top_idx, 1.)
                tgt_masks.append(tgt_masks_)
            # one mask per attention head (nhead = 8 is hard-coded here)
            tgt_mask = torch.stack(tgt_masks, dim=0).repeat_interleave(8, 0)

        output, dec_attn = layer(output, memory,
                                 tgt_mask=tgt_mask,
                                 memory_mask=memory_mask,
                                 tgt_key_padding_mask=tgt_key_padding_mask,
                                 memory_key_padding_mask=memory_key_padding_mask,
                                 pos=pos, query_pos=query_pos,
                                 pos_centers=pos_centers,
                                 spatial_shape=spatial_shape)

        if self.return_intermediate:
            intermediate.append(self.norm(output))
            intermediate_centers.append(pos_centers)
            intermediate_dec_attns.append(dec_attn)

        # iterative box refinement: update the reference centers, detached from the graph
        if self.bbox_embed is not None:
            tmp = self.bbox_embed[lvl](self.norm(output))
            new_pos_centers = tmp + pos_centers
            pos_centers = new_pos_centers.detach()

    if self.norm is not None:
        output = self.norm(output)
        if self.return_intermediate:
            intermediate.pop()
            intermediate.append(output)

    if self.return_intermediate:
        return torch.stack(intermediate), torch.stack(intermediate_centers), torch.stack(intermediate_dec_attns)

    return output, pos_centers, dec_attn
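# `generalized_box_iou` (used by loss_boxes, both matchers, and the dense-query mask above)
# is imported from the box utils. For reference, the standard DETR implementation computes
# IoU minus the normalized empty area of the smallest enclosing box:
def generalized_box_iou(boxes1, boxes2):
    # boxes must be in (x1, y1, x2, y2) format with x2 >= x1 and y2 >= y1
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # pairwise intersection and union: [N, M]
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, :, 0] * wh[:, :, 1]
    union = area1[:, None] + area2 - inter
    iou = inter / union

    # smallest enclosing box of each pair
    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    area = wh[:, :, 0] * wh[:, :, 1]
    return iou - (area - union) / area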