def loss_boxes(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert 'pred_boxes' in outputs
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs['pred_boxes'][idx]
    target_boxes = torch.cat(
        [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

    src_boxes_coordinates = src_boxes[:, :2]
    src_boxes_dimensions = src_boxes[:, 2:]
    target_boxes_coordinates = target_boxes[:, :2]
    target_boxes_dimensions = target_boxes[:, 2:]

    loss_bbox_coordinates = F.l1_loss(src_boxes_coordinates,
                                      target_boxes_coordinates,
                                      reduction='none')
    loss_bbox_dimensions = F.l1_loss(src_boxes_dimensions,
                                     target_boxes_dimensions,
                                     reduction='none')

    losses = {}
    losses['loss_bbox_coordinates'] = loss_bbox_coordinates.sum() / num_boxes
    losses['loss_bbox_dimensions'] = loss_bbox_dimensions.sum() / num_boxes

    loss_giou = 1 - torch.diag(
        box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes)))
    losses['loss_giou'] = loss_giou.sum() / num_boxes
    return losses
def loss_boxes(self, outputs, gt_instances: List[Instances], indices: List[tuple], num_boxes):
    """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    # We ignore the regression loss of the track-disappear slots.
    # TODO: make this filtering process more elegant.
    filtered_idx = []
    for src_per_img, tgt_per_img in indices:
        keep = tgt_per_img != -1
        filtered_idx.append((src_per_img[keep], tgt_per_img[keep]))
    indices = filtered_idx
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs['pred_boxes'][idx]
    target_boxes = torch.cat(
        [gt_per_img.boxes[i] for gt_per_img, (_, i) in zip(gt_instances, indices)], dim=0)

    # For padded targets, don't calculate the regression loss, judged by whether obj_id == -1.
    target_obj_ids = torch.cat(
        [gt_per_img.obj_ids[i] for gt_per_img, (_, i) in zip(gt_instances, indices)], dim=0)  # size(16)
    mask = (target_obj_ids != -1)

    loss_bbox = F.l1_loss(src_boxes[mask], target_boxes[mask], reduction='none')
    loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
        box_ops.box_cxcywh_to_xyxy(src_boxes[mask]),
        box_ops.box_cxcywh_to_xyxy(target_boxes[mask])))

    losses = {}
    losses['loss_bbox'] = loss_bbox.sum() / num_boxes
    losses['loss_giou'] = loss_giou.sum() / num_boxes
    return losses
def _add_fp_tracks(self, track_instances: Instances,
                   active_track_instances: Instances) -> Instances:
    inactive_instances = track_instances[track_instances.obj_idxes < 0]

    # Add a false-positive track for each active track with probability fp_ratio.
    fp_prob = torch.ones_like(active_track_instances.scores) * self.fp_ratio
    selected_active_track_instances = active_track_instances[
        torch.bernoulli(fp_prob).bool()]

    if len(inactive_instances) > 0 and len(selected_active_track_instances) > 0:
        num_fp = len(selected_active_track_instances)
        if num_fp >= len(inactive_instances):
            fp_track_instances = inactive_instances
        else:
            inactive_boxes = Boxes(
                box_ops.box_cxcywh_to_xyxy(inactive_instances.pred_boxes))
            selected_active_boxes = Boxes(
                box_ops.box_cxcywh_to_xyxy(
                    selected_active_track_instances.pred_boxes))
            ious = pairwise_iou(inactive_boxes, selected_active_boxes)
            # Select the false positive with the largest IoU for each active track.
            fp_indexes = ious.max(dim=0).indices
            # Remove duplicate false positives.
            fp_indexes = torch.unique(fp_indexes)
            fp_track_instances = inactive_instances[fp_indexes]

        merged_track_instances = Instances.cat(
            [active_track_instances, fp_track_instances])
        return merged_track_instances

    return active_track_instances
def loss_boxes(self, outputs, targets, indices, num_boxes):
    """ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert "pred_boxes" in outputs
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs["pred_boxes"].numpy()[
        idx[0].numpy(), idx[1].numpy(), :]  # [num_objects, 4]
    src_boxes = dg.to_variable(src_boxes)

    target_boxes = [
        t["boxes"].numpy()[i.numpy()] for t, (_, i) in zip(targets, indices)
    ]
    target_boxes = [dg.to_variable(t) for t in target_boxes]
    target_boxes = L.concat(target_boxes, 0).astype("float32")  # [num_objects, 4]

    loss_bbox = F.loss.l1_loss(src_boxes, target_boxes, reduction="sum")
    losses = {}
    losses["loss_bbox"] = loss_bbox / num_boxes

    num_boxes = src_boxes.shape[0]
    mask = T.creation.diag(dg.to_variable(
        np.ones(num_boxes)))  # mask out non-diagonal elements
    loss_giou = (1 - box_ops.generalized_box_iou(
        box_ops.box_cxcywh_to_xyxy(src_boxes),
        box_ops.box_cxcywh_to_xyxy(target_boxes))) * mask
    losses["loss_giou"] = L.reduce_sum(loss_giou) / num_boxes
    return losses
def forward(self, outputs, targets):
    """ Performs the matching

    Params:
        outputs: This is a dict that contains at least these entries:
             "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
             "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

        targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
             "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                       objects in the target) containing the class labels
             "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    bs, num_queries = outputs["pred_logits"].shape[:2]

    # We flatten to compute the cost matrices in a batch
    out_prob = outputs["pred_logits"].flatten(0, 1).softmax(
        -1)  # [batch_size * num_queries, num_classes]
    out_bbox = outputs["pred_boxes"].flatten(
        0, 1)  # [batch_size * num_queries, 4]

    # Also concat the target labels and boxes
    tgt_ids = torch.cat([v["labels"] for v in targets])
    tgt_bbox = torch.cat([v["boxes"] for v in targets])

    # Compute the classification cost. Contrary to the loss, we don't use the NLL,
    # but approximate it as 1 - proba[target class].
    # The 1 is a constant that doesn't change the matching, so it can be omitted.
    # Each row of out_prob holds the class probabilities of one candidate box (92 values for 92 classes).
    # Since there are far more candidate boxes than ground-truth boxes, we don't know in advance which
    # candidate will match which ground truth, so we gather the probability of every target class for
    # every candidate: among all candidates, some box will be the best match for each ground-truth box.
    # The minus sign measures the gap to the ideal probability of 1; adding the constant 1 would not
    # change the assignment.
    cost_class = -out_prob[:, tgt_ids]

    # Compute the L1 cost between boxes
    cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

    # Compute the giou cost between boxes
    cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                     box_cxcywh_to_xyxy(tgt_bbox))

    # Final cost matrix
    C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
    C = C.view(bs, num_queries, -1).cpu()

    sizes = [len(v["boxes"]) for v in targets]
    indices = [
        linear_sum_assignment(c[i])
        for i, c in enumerate(C.split(sizes, -1))
    ]
    return [(torch.as_tensor(i, dtype=torch.int64),
             torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
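# A minimal, self-contained sketch of the class-cost indexing used in the matcher above;
# the toy probabilities and target ids below are illustrative and not taken from a real model.
import torch

out_prob = torch.tensor([[0.1, 0.6, 0.2, 0.1],
                         [0.7, 0.1, 0.1, 0.1],
                         [0.2, 0.2, 0.5, 0.1]])   # 3 flattened queries, 4 classes
tgt_ids = torch.tensor([1, 2])                    # classes of 2 ground-truth boxes

cost_class = -out_prob[:, tgt_ids]                # shape [3 queries, 2 targets]
# Entry (i, j) is -P(query i predicts the class of target j); adding the constant 1
# would not change the Hungarian assignment.
print(cost_class)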
def forward(self, outputs, targets, positive_map):
    """Performs the matching

    Params:
        outputs: This is a dict that contains at least these entries:
             "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
             "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

        targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
             "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                       objects in the target) containing the class labels
             "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    bs, num_queries = outputs["pred_logits"].shape[:2]

    # We flatten to compute the cost matrices in a batch
    out_prob = self.norm(outputs["pred_logits"].flatten(
        0, 1))  # [batch_size * num_queries, num_classes]
    out_bbox = outputs["pred_boxes"].flatten(
        0, 1)  # [batch_size * num_queries, 4]

    # Also concat the target labels and boxes
    tgt_bbox = torch.cat([v["boxes"] for v in targets])
    assert len(tgt_bbox) == len(positive_map)

    # Compute the soft cross-entropy between the predicted token alignment and the GT one for each box
    cost_class = -(out_prob.unsqueeze(1) * positive_map.unsqueeze(0)).sum(-1)

    # Compute the L1 cost between boxes
    cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
    assert cost_class.shape == cost_bbox.shape

    # Compute the giou cost between boxes
    cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                     box_cxcywh_to_xyxy(tgt_bbox))

    # Final cost matrix
    C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
    C = C.view(bs, num_queries, -1).cpu()

    sizes = [len(v["boxes"]) for v in targets]
    indices = [
        linear_sum_assignment(c[i])
        for i, c in enumerate(C.split(sizes, -1))
    ]
    return [(torch.as_tensor(i, dtype=torch.int64),
             torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
def forward(self, outputs, targets):
    """ Performs the matching

    Params:
        outputs: This is a dict that contains at least these entries:
             "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
             "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

        targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
             "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                       objects in the target) containing the class labels
             "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    bs, num_queries = outputs["pred_logits"].shape[:2]

    # We flatten to compute the cost matrices in a batch
    out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
    out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

    # Also concat the target labels and boxes
    tgt_ids = torch.cat([v["labels"] for v in targets])
    tgt_bbox = torch.cat([v["boxes"] for v in targets])

    # Compute the classification cost. Contrary to the loss, we don't use the NLL,
    # but approximate it as 1 - proba[target class].
    # The 1 is a constant that doesn't change the matching, so it can be omitted.
    cost_class = -out_prob[:, tgt_ids]

    # Compute the L1 cost between boxes.
    # Note: cdist with p=1 is the Manhattan distance.
    # cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
    out_bbox_coordinates = out_bbox[:, :2]
    out_bbox_dimensions = out_bbox[:, 2:]
    tgt_bbox_coordinates = tgt_bbox[:, :2]
    tgt_bbox_dimensions = tgt_bbox[:, 2:]
    cost_bbox_coordinates = torch.cdist(out_bbox_coordinates, tgt_bbox_coordinates, p=1)
    cost_bbox_dimensions = torch.cdist(out_bbox_dimensions, tgt_bbox_dimensions, p=1)

    # Compute the giou cost between boxes
    cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

    # Final cost matrix
    C = (self.cost_bbox_coordinates * cost_bbox_coordinates
         + self.cost_bbox_dimensions * cost_bbox_dimensions
         + self.cost_class * cost_class
         + self.cost_giou * cost_giou)
    C = C.view(bs, num_queries, -1).cpu()

    sizes = [len(v["boxes"]) for v in targets]
    indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
    return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
def forward(self, outputs, targets):
    """ Performs the matching

    Params:
        outputs: This is a dict that contains at least these entries:
             "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
             "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

        targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
             "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                       objects in the target) containing the class labels
             "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    with torch.no_grad():
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost.
        alpha = 0.25
        gamma = 2.0
        neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
        cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the giou cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
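# A minimal, self-contained check of the focal-style classification cost used above;
# alpha and gamma follow the snippet, while the probabilities and target ids are toy values.
import torch

alpha, gamma = 0.25, 2.0
out_prob = torch.tensor([[0.9, 0.1],
                         [0.3, 0.6]])        # 2 flattened queries, 2 classes (post-sigmoid)
tgt_ids = torch.tensor([0, 1])               # classes of 2 ground-truth boxes

neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]   # [2 queries, 2 targets]
# Confident, correct predictions (e.g. query 0 on class 0) get a strongly negative cost,
# so linear_sum_assignment prefers them.
print(cost_class)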
def get_cls_loss(self, outputs, targets, criterion, cls_losses, weights=None):
    """Accumulate per-class classification, L1 box and GIoU losses into cls_losses."""
    # TODO: make sure we are not backpropagating from here
    # (this should probably be enforced by the caller, e.g. by wrapping it in no_grad).
    outputs_without_aux = {
        k: v
        for k, v in outputs.items() if k != 'aux_outputs'
    }
    indices = criterion.matcher(outputs_without_aux, targets)
    src_idx = criterion._get_src_permutation_idx(indices)

    # order the labels by the indices
    target_classes = torch.cat(
        [t["labels"][J] for t, (_, J) in zip(targets, indices)])  # BOXES
    src_logits = outputs['pred_logits'][src_idx]  # (BOXES) x C

    # order the bounding boxes by the indices
    target_boxes = torch.cat(
        [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)  # BOXES
    src_boxes = outputs['pred_boxes'][src_idx]  # BOXES

    classes = torch.unique(target_classes)
    for cls in classes:
        idx = torch.where(target_classes == cls)[0]
        loss_ce = F.cross_entropy(src_logits[idx],
                                  target_classes[idx],
                                  reduction='sum')
        loss_bbox = F.l1_loss(src_boxes[idx],
                              target_boxes[idx],
                              reduction='sum')
        loss_giou = (1 - torch.diag(
            box_ops.generalized_box_iou(
                box_ops.box_cxcywh_to_xyxy(src_boxes[idx]),
                box_ops.box_cxcywh_to_xyxy(target_boxes[idx])))).sum()
        losses = torch.tensor([len(idx), loss_ce, loss_bbox, loss_giou])
        cls_losses[cls] += losses

    return cls_losses
def forward(self, outputs, target_sizes):
    """ Perform the computation

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
                      For evaluation, this must be the original image size (before any data augmentation).
                      For visualization, this should be the image size after data augment, but before padding.
    """
    out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
    track_logits, track_bbox = outputs['tracking_logits'], outputs['tracking_boxes']

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = out_logits.sigmoid()
    track_prob = track_logits.sigmoid()

    # topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
    # scores = topk_values
    # topk_boxes = topk_indexes // out_logits.shape[2]
    # labels = topk_indexes % out_logits.shape[2]
    # boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    # boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
    scores, labels = prob[..., 1:2].max(-1)
    labels = labels + 1
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)

    track_scores, track_labels = track_prob[..., 1:2].max(-1)
    track_labels = track_labels + 1
    track_boxes = box_ops.box_cxcywh_to_xyxy(track_bbox)

    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]
    track_boxes = track_boxes * scale_fct[:, None, :]

    results = [{
        'scores': s,
        'labels': l,
        'boxes': b,
        'track_scores': ts,
        'track_labels': tl,
        'track_boxes': tb
    } for s, l, b, ts, tl, tb in zip(scores, labels, boxes, track_scores,
                                     track_labels, track_boxes)]

    return results
def inference(self, box_cls, box_pred, mask_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (x, y, w, h) box regression values for every query.
        image_sizes (List[torch.Size]): the input image sizes
    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

    for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) in enumerate(
            zip(scores, labels, box_pred, image_sizes)):
        result = Instances(image_size)
        result.pred_boxes = Boxes(box_cxcywh_to_xyxy(box_pred_per_image))
        result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])

        if self.mask_on:
            mask = F.interpolate(mask_pred[i].unsqueeze(0), size=image_size,
                                 mode='bilinear', align_corners=False)
            mask = mask[0].sigmoid() > 0.5
            B, N, H, W = mask_pred.shape
            mask = BitMasks(mask.cpu()).crop_and_resize(
                result.pred_boxes.tensor.cpu(), 32)
            result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device)

        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append(result)
    return results
def inference(self, box_cls, box_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (x, y, w, h) box regression values for every query.
        image_sizes (List[torch.Size]): the input image sizes
    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    # For each box we assign the best class, or the second best if the best one is `no_object`.
    scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

    for scores_per_image, labels_per_image, box_pred_per_image, image_size in zip(
            scores, labels, box_pred, image_sizes):
        result = Instances(image_size)
        result.pred_boxes = Boxes(box_cxcywh_to_xyxy(box_pred_per_image))
        result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])

        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append(result)
    return results
def forward(self, outputs, target_sizes):
    """ Perform the computation

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
                      For evaluation, this must be the original image size (before any data augmentation).
                      For visualization, this should be the image size after data augment, but before padding.
    """
    out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = F.softmax(out_logits, -1)
    scores, labels = prob[..., :-1].max(-1)

    # convert to [x0, y0, x1, y1] format
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]

    results = [{
        'scores': s,
        'labels': l,
        'boxes': b
    } for s, l, b in zip(scores, labels, boxes)]

    return results
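# A small, self-contained sketch of the normalized-(cx, cy, w, h) -> absolute-(x0, y0, x1, y1)
# conversion performed by the post-processor above; the helper mirrors box_ops.box_cxcywh_to_xyxy
# and the box/size values are illustrative only.
import torch

def cxcywh_to_xyxy(b):
    cx, cy, w, h = b.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                        cx + 0.5 * w, cy + 0.5 * h], dim=-1)

boxes = torch.tensor([[0.5, 0.5, 0.2, 0.4]])   # one normalized (cx, cy, w, h) box
target_sizes = torch.tensor([[480, 640]])      # original (height, width) of one image

img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
print(cxcywh_to_xyxy(boxes) * scale_fct)       # tensor([[256., 144., 384., 336.]])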
def add_mask(img_list, batched_boxes, batched_masks):
    new_img_list = []
    for im, boxes, masks in zip(img_list, batched_boxes, batched_masks):
        img_h, img_w = im.shape[-2:]
        boxes = box_cxcywh_to_xyxy(boxes)
        multiplier = torch.tensor([img_w, img_h, img_w, img_h],
                                  dtype=torch.float32).cuda()
        boxes = boxes * multiplier
        boxes = boxes.int().clamp(min=0)
        for i, (box, mask) in enumerate(zip(boxes, masks)):
            box[3] = min(img_h, box[3])
            box[2] = min(img_w, box[2])
            dh = box[3] - box[1]
            dw = box[2] - box[0]
            conv_mask = F.interpolate(mask.view((1, 1) + mask.shape),
                                      size=(dh, dw), mode='bilinear')
            th_mask = conv_mask > 0.5
            if th_mask.sum() == 0:
                continue
            try:
                im[:, box[1]:box[3], box[0]:box[2]][th_mask[0].repeat(len(im), 1, 1)] = -1000
            except BaseException:
                pdb.set_trace()
        new_img_list.append(im)
    return new_img_list
def forward(self, track_instances: Instances, target_size) -> Instances:
    """ Perform the computation

    Parameters:
        track_instances: Instances holding the raw model outputs ('pred_logits', 'pred_boxes')
        target_size: (height, width) of the image.
                     For evaluation, this must be the original image size (before any data augmentation).
                     For visualization, this should be the image size after data augment, but before padding.
    """
    out_logits = track_instances.pred_logits
    out_bbox = track_instances.pred_boxes

    prob = out_logits.sigmoid()
    # prob = out_logits[..., :1].sigmoid()
    scores, labels = prob.max(-1)

    # convert to [x0, y0, x1, y1] format
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_size
    scale_fct = torch.Tensor([img_w, img_h, img_w, img_h]).to(boxes)
    boxes = boxes * scale_fct[None, :]

    track_instances.boxes = boxes
    track_instances.scores = scores
    track_instances.labels = labels
    track_instances.remove('pred_logits')
    track_instances.remove('pred_boxes')
    return track_instances
def showImage(img, target):
    from PIL import Image, ImageDraw, ImageFont
    from util.box_ops import box_xyxy_to_cxcywh, box_cxcywh_to_xyxy

    draw = ImageDraw.Draw(img)
    boxes = target['boxes']
    cl = target['labels']
    if 1:  # boxes.max() <= 1:
        boxes = box_cxcywh_to_xyxy(boxes)
        print('Image:', (img.height, img.width), 'true:', target['size'], target['orig_size'])
        H, W = target['size']
        # W, H = img.width, img.height
        boxes[:, 0::2] *= W
        boxes[:, 1::2] *= H
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]
        draw.rectangle((x1, y1, x2, y2),
                       outline=(0, 255, 0) if cl[i] >= 0 else (0, 0, 0),
                       width=3)
        draw.text((x1, y1), str(cl[i].item()),
                  (0, 255, 0) if cl[i] >= 0 else (0, 0, 0),
                  font=ImageFont.truetype("DejaVuSansMono.ttf", 30))
    img.show()
def loss_boxes(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert 'human_pred_boxes' in outputs
    assert 'object_pred_boxes' in outputs
    idx = self._get_src_permutation_idx(indices)

    human_src_boxes = outputs['human_pred_boxes'][idx]
    human_target_boxes = torch.cat(
        [t['human_boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
    object_src_boxes = outputs['object_pred_boxes'][idx]
    object_target_boxes = torch.cat(
        [t['object_boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

    human_loss_bbox = F.l1_loss(human_src_boxes, human_target_boxes, reduction='none')
    object_loss_bbox = F.l1_loss(object_src_boxes, object_target_boxes, reduction='none')

    losses = dict()
    losses['human_loss_bbox'] = human_loss_bbox.sum() / num_boxes
    losses['object_loss_bbox'] = object_loss_bbox.sum() / num_boxes
    losses['loss_bbox'] = losses['human_loss_bbox'] + losses['object_loss_bbox']

    human_loss_giou = 1 - torch.diag(
        box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(human_src_boxes),
            box_ops.box_cxcywh_to_xyxy(human_target_boxes)))
    object_loss_giou = 1 - torch.diag(
        box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(object_src_boxes),
            box_ops.box_cxcywh_to_xyxy(object_target_boxes)))
    losses['human_loss_giou'] = human_loss_giou.sum() / num_boxes
    losses['object_loss_giou'] = object_loss_giou.sum() / num_boxes
    losses['loss_giou'] = losses['human_loss_giou'] + losses['object_loss_giou']
    return losses
def forward(self, features, boxes):
    b, q, _ = boxes.shape
    features = [features[f] for f in self.box_in_features]
    box_features = self.box_pooler(
        features, [Boxes(box_ops.box_cxcywh_to_xyxy(x)) for x in boxes])
    box_features = self.box_head(box_features)
    predictions = self.box_predictor(box_features)
    return predictions.view(b, q, self.num_classes + 1)
def forward(self, outputs, targets):
    bs, num_queries = outputs['pred_obj_logits'].shape[:2]

    out_obj_prob = outputs['pred_obj_logits'].flatten(0, 1).softmax(-1)
    out_verb_prob = outputs['pred_verb_logits'].flatten(0, 1).sigmoid()
    out_sub_bbox = outputs['pred_sub_boxes'].flatten(0, 1)
    out_obj_bbox = outputs['pred_obj_boxes'].flatten(0, 1)

    tgt_obj_labels = torch.cat([v['obj_labels'] for v in targets])
    tgt_verb_labels = torch.cat([v['verb_labels'] for v in targets])
    tgt_sub_boxes = torch.cat([v['sub_boxes'] for v in targets])
    tgt_obj_boxes = torch.cat([v['obj_boxes'] for v in targets])

    cost_obj_class = -out_obj_prob[:, tgt_obj_labels]

    tgt_verb_labels_permute = tgt_verb_labels.permute(1, 0)
    cost_verb_class = -(out_verb_prob.matmul(tgt_verb_labels_permute) /
                        (tgt_verb_labels_permute.sum(dim=0, keepdim=True) + 1e-4) +
                        (1 - out_verb_prob).matmul(1 - tgt_verb_labels_permute) /
                        ((1 - tgt_verb_labels_permute).sum(dim=0, keepdim=True) + 1e-4)) / 2

    cost_sub_bbox = torch.cdist(out_sub_bbox, tgt_sub_boxes, p=1)
    cost_obj_bbox = torch.cdist(out_obj_bbox, tgt_obj_boxes, p=1) * (tgt_obj_boxes != 0).any(dim=1).unsqueeze(0)
    if cost_sub_bbox.shape[1] == 0:
        cost_bbox = cost_sub_bbox
    else:
        cost_bbox = torch.stack((cost_sub_bbox, cost_obj_bbox)).max(dim=0)[0]

    cost_sub_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_sub_bbox), box_cxcywh_to_xyxy(tgt_sub_boxes))
    cost_obj_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_obj_bbox), box_cxcywh_to_xyxy(tgt_obj_boxes)) + \
                    cost_sub_giou * (tgt_obj_boxes == 0).all(dim=1).unsqueeze(0)
    if cost_sub_giou.shape[1] == 0:
        cost_giou = cost_sub_giou
    else:
        cost_giou = torch.stack((cost_sub_giou, cost_obj_giou)).max(dim=0)[0]

    C = self.cost_obj_class * cost_obj_class + self.cost_verb_class * cost_verb_class + \
        self.cost_bbox * cost_bbox + self.cost_giou * cost_giou
    C = C.view(bs, num_queries, -1).cpu()

    sizes = [len(v['obj_labels']) for v in targets]
    indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
    return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
def forward(self, outputs, target_sizes):
    out_obj_logits, out_verb_logits, out_sub_boxes, out_obj_boxes = outputs['pred_obj_logits'], \
                                                                    outputs['pred_verb_logits'], \
                                                                    outputs['pred_sub_boxes'], \
                                                                    outputs['pred_obj_boxes']

    assert len(out_obj_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    obj_prob = F.softmax(out_obj_logits, -1)
    obj_scores, obj_labels = obj_prob[..., :-1].max(-1)

    verb_scores = out_verb_logits.sigmoid()

    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(verb_scores.device)
    sub_boxes = box_cxcywh_to_xyxy(out_sub_boxes)
    sub_boxes = sub_boxes * scale_fct[:, None, :]
    obj_boxes = box_cxcywh_to_xyxy(out_obj_boxes)
    obj_boxes = obj_boxes * scale_fct[:, None, :]

    results = []
    for os, ol, vs, sb, ob in zip(obj_scores, obj_labels, verb_scores, sub_boxes, obj_boxes):
        sl = torch.full_like(ol, self.subject_category_id)
        l = torch.cat((sl, ol))
        b = torch.cat((sb, ob))
        results.append({'labels': l.to('cpu'), 'boxes': b.to('cpu')})

        vs = vs * os.unsqueeze(1)

        ids = torch.arange(b.shape[0])

        results[-1].update({
            'verb_scores': vs.to('cpu'),
            'sub_ids': ids[:ids.shape[0] // 2],
            'obj_ids': ids[ids.shape[0] // 2:]
        })

    return results
def loss_boxes(self, outputs, targets, positive_map, indices, num_boxes):
    """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert "pred_boxes" in outputs
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs["pred_boxes"][idx]
    target_boxes = torch.cat(
        [t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)

    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")

    losses = {}
    losses["loss_bbox"] = loss_bbox.sum() / num_boxes

    loss_giou = 1 - torch.diag(
        box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes)))
    losses["loss_giou"] = loss_giou.sum() / num_boxes
    return losses
def loss_sub_obj_boxes(self, outputs, targets, indices, num_interactions):
    assert 'pred_sub_boxes' in outputs and 'pred_obj_boxes' in outputs
    idx = self._get_src_permutation_idx(indices)
    src_sub_boxes = outputs['pred_sub_boxes'][idx]
    src_obj_boxes = outputs['pred_obj_boxes'][idx]
    target_sub_boxes = torch.cat(
        [t['sub_boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
    target_obj_boxes = torch.cat(
        [t['obj_boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

    exist_obj_boxes = (target_obj_boxes != 0).any(dim=1)

    losses = {}
    if src_sub_boxes.shape[0] == 0:
        losses['loss_sub_bbox'] = src_sub_boxes.sum()
        losses['loss_obj_bbox'] = src_obj_boxes.sum()
        losses['loss_sub_giou'] = src_sub_boxes.sum()
        losses['loss_obj_giou'] = src_obj_boxes.sum()
    else:
        loss_sub_bbox = F.l1_loss(src_sub_boxes, target_sub_boxes, reduction='none')
        loss_obj_bbox = F.l1_loss(src_obj_boxes, target_obj_boxes, reduction='none')
        losses['loss_sub_bbox'] = loss_sub_bbox.sum() / num_interactions
        losses['loss_obj_bbox'] = (loss_obj_bbox * exist_obj_boxes.unsqueeze(1)).sum() / (
            exist_obj_boxes.sum() + 1e-4)
        loss_sub_giou = 1 - torch.diag(
            generalized_box_iou(box_cxcywh_to_xyxy(src_sub_boxes),
                                box_cxcywh_to_xyxy(target_sub_boxes)))
        loss_obj_giou = 1 - torch.diag(
            generalized_box_iou(box_cxcywh_to_xyxy(src_obj_boxes),
                                box_cxcywh_to_xyxy(target_obj_boxes)))
        losses['loss_sub_giou'] = loss_sub_giou.sum() / num_interactions
        losses['loss_obj_giou'] = (loss_obj_giou * exist_obj_boxes).sum() / (exist_obj_boxes.sum() + 1e-4)
    return losses
def forward(self, outputs, targets):
    """ Performs the matching

    Params:
        outputs: This is a dict that contains at least these entries:
             "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
             "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

        targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
             "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                       objects in the target) containing the class labels
             "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    bs, num_queries = outputs['pred_logits'].shape[:2]

    out_prob = outputs['pred_logits'].flatten(0, 1).softmax(-1)
    out_bbox = outputs['pred_boxes'].flatten(0, 1)

    tgt_ids = torch2paddle.concat([v['labels'] for v in targets])
    tgt_bbox = torch2paddle.concat([v['boxes'] for v in targets])

    cost_class = -out_prob[:, tgt_ids]
    cost_bbox = paddle.dist(out_bbox, tgt_bbox, p=1)
    cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                     box_cxcywh_to_xyxy(tgt_bbox))

    C = (self.cost_bbox * cost_bbox + self.cost_class * cost_class +
         self.cost_giou * cost_giou)
    C = C.view(bs, num_queries, -1).cpu()

    sizes = [len(v['boxes']) for v in targets]
    indices = [
        linear_sum_assignment(c[i])
        for i, c in enumerate(C.split(sizes, -1))
    ]
    return [(paddle.to_tensor(i, dtype=torch.int64),
             paddle.to_tensor(j, dtype=torch.int64)) for i, j in indices]
def init(self, img, bbox):
    """
    args:
        img(np.ndarray): BGR image
        bbox: (x, y, w, h) bbox (opencv format for rect)
    """
    bbox_xyxy = [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]
    self.center_pos = torch.Tensor([bbox[0] + bbox[2] / 2, bbox[1] + bbox[3] / 2])
    self.size = torch.Tensor(bbox[2:])
    channel_avg = np.mean(img, axis=(0, 1))

    # get crop
    s_z, scale_z = siamfc_like_scale(bbox_xyxy)
    template_image, _ = crop_image(img, bbox_xyxy, padding=channel_avg)

    self.rect_template_image = template_image.copy()
    init_bbox = np.array(self.size) * scale_z
    exemplar_size = get_exemplar_size()
    x1 = np.round(exemplar_size / 2 - init_bbox[0] / 2).astype(np.uint8)
    y1 = np.round(exemplar_size / 2 - init_bbox[1] / 2).astype(np.uint8)
    x2 = np.round(exemplar_size / 2 + init_bbox[0] / 2).astype(np.uint8)
    y2 = np.round(exemplar_size / 2 + init_bbox[1] / 2).astype(np.uint8)
    cv2.rectangle(self.rect_template_image, (x1, y1), (x2, y2), (0, 255, 0), 3)

    # get mask
    self.init_template_mask = [0, 0, template_image.shape[0], template_image.shape[1]]
    if self.center_pos[0] < s_z / 2:
        self.init_template_mask[0] = (s_z / 2 - self.center_pos[0]) * scale_z
    if self.center_pos[1] < s_z / 2:
        self.init_template_mask[1] = (s_z / 2 - self.center_pos[1]) * scale_z
    if self.center_pos[0] + s_z / 2 > img.shape[1]:
        self.init_template_mask[2] = self.init_template_mask[2] - (self.center_pos[0] + s_z / 2 - img.shape[1]) * scale_z
    if self.center_pos[1] + s_z / 2 > img.shape[0]:
        self.init_template_mask[3] = self.init_template_mask[3] - (self.center_pos[1] + s_z / 2 - img.shape[0]) * scale_z

    # normalize and convert to torch.Tensor
    self.init_template = self.image_normalize(np.round(template_image).astype(np.uint8)).cuda()

    self.first_frame = True
    self.init_best_score = 0

    # for visualization
    debug_bbox = torch.round(box_cxcywh_to_xyxy(torch.cat([
        torch.tensor([63.5, 63.5]),
        torch.Tensor([bbox[2], bbox[3]]) * scale_z]))).int().numpy()
    debug_image = cv2.rectangle(template_image,
                                (debug_bbox[0], debug_bbox[1]),
                                (debug_bbox[2], debug_bbox[3]), (0, 255, 0), 3)

    return {'template_image': debug_image}
def loss_boxes(self, outputs, targets, indices, num_boxes, boxes, visible, rnn_weight=0.5):
    """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert 'pred_boxes' in outputs
    # pred_boxes needs to be associated with the final target; this is the largest
    # distance between frames, used for the Transformer.
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs['pred_boxes'][idx]
    target_boxes = torch.cat(
        [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

    losses = {}
    losses['loss_bbox'] = loss_bbox.sum() / num_boxes
    losses['loss_giou_circuit'] = torch.tensor(0)
    losses['loss_iou_circuit'] = torch.tensor(0)

    giou, iou = box_ops.generalized_box_iou(
        box_ops.box_cxcywh_to_xyxy(src_boxes),
        box_ops.box_cxcywh_to_xyxy(target_boxes))
    giou = torch.diag(giou)
    iou = torch.diag(iou)
    loss_giou = 1 - giou

    losses['loss_giou'] = loss_giou.sum() / num_boxes
    losses['iou'] = iou.sum() / num_boxes
    return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
    targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert 'pred_boxes' in outputs
    # (batch indices, query indices);
    # both have shape (num_matched_queries1 + num_matched_queries2 + ...)
    idx = self._get_src_permutation_idx(indices)
    # outputs['pred_boxes'] has shape (b, num_queries=100, 4);
    # src_boxes has shape (num_matched_queries1 + num_matched_queries2 + ..., 4)
    src_boxes = outputs['pred_boxes'][idx]
    # (num_matched_objs1 + num_matched_objs2 + ..., 4)
    # num_matched_queries1 + num_matched_queries2 + ... equals
    # num_matched_objs1 + num_matched_objs2 + ..., as explained in the comments
    # on the matcher's return value in the forward pass.
    target_boxes = torch.cat(
        [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

    # The loss computation follows. Note the `reduction` argument: if it is not set
    # explicitly, PyTorch defaults to 'mean', i.e. the mean over all elements involved
    # in the loss.
    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

    losses = {}
    # num_boxes is the number of target objects in the batch of images.
    losses['loss_bbox'] = loss_bbox.sum() / num_boxes

    # generalized_box_iou returns the GIoU of every prediction with every GT: with N matched
    # predictions and M matched ground-truth boxes the result is an N x M matrix. Because the
    # matched predictions and GTs have already been arranged so that prediction i corresponds
    # to ground truth i, taking the diagonal with torch.diag() keeps exactly the GIoU of each
    # matched pair, which is what the loss should be computed on.
    loss_giou = 1 - torch.diag(
        box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes)))
    losses['loss_giou'] = loss_giou.sum() / num_boxes
    return losses
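# A small, self-contained illustration of why torch.diag is used above: the matrix is a
# stand-in for generalized_box_iou(pred, gt), where prediction i has already been matched
# to ground truth i (the numbers are made up).
import torch

pairwise_giou = torch.tensor([[0.9, 0.1, 0.0],
                              [0.2, 0.8, 0.1],
                              [0.0, 0.3, 0.7]])   # N predictions x M ground truths
matched_giou = torch.diag(pairwise_giou)          # tensor([0.9000, 0.8000, 0.7000])
loss_giou = 1 - matched_giou                      # per matched pair
print(loss_giou.sum() / len(loss_giou))           # normalized, as with num_boxes above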
def forward(self, outputs, target_sizes, state='train'):
    """ Perform the computation

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
                      For evaluation, this must be the original image size (before any data augmentation).
                      For visualization, this should be the image size after data augment, but before padding.
    """
    out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

    # assert len(out_logits) == len(target_sizes)
    # assert target_sizes.shape[1] == 2

    prob = out_logits.sigmoid()
    topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
    scores = topk_values
    topk_boxes = topk_indexes // out_logits.shape[2]
    labels = topk_indexes % out_logits.shape[2]
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

    # and from relative [0, 1] to absolute [0, height] coordinates
    # img_h, img_w = target_sizes.unbind(1)
    img_h, img_w = target_sizes[0], target_sizes[1]
    img_h = torch.tensor(img_h, device='cuda')
    img_w = torch.tensor(img_w, device='cuda')
    # scale_fct = torch.stack()
    if state == 'train':
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)  # works for training
        boxes = boxes * scale_fct[:, None, :]  # used during training
    else:
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=0)  # works for validation
        boxes = boxes * scale_fct  # used during validation

    results = [{
        'scores': s,
        'labels': l,
        'boxes': b,
        'topk_index': i
    } for s, l, b, i in zip(scores, labels, boxes, topk_boxes)]

    return results
def eval(self, postprocessors, thresh=0.1):
    self._state.net.eval()
    associate = pocket.utils.BoxAssociation(min_iou=0.5)
    meter = pocket.utils.DetectionAPMeter(80, algorithm='INT', nproc=10)
    num_gt = torch.zeros(80)
    if self._train_loader.batch_size != 1:
        raise ValueError(
            f"The batch size should be 1, not {self._train_loader.batch_size}")
    for image, target in tqdm(self._train_loader):
        image = pocket.ops.relocate_to_cuda(image)
        output = self._state.net(image)
        output = pocket.ops.relocate_to_cpu(output)
        scores, labels, boxes = postprocessors(
            output, target[0]['size'].unsqueeze(0))[0].values()
        keep = torch.nonzero(scores >= thresh).squeeze(1)
        scores = scores[keep]
        labels = labels[keep]
        boxes = boxes[keep]

        gt_boxes = target[0]['boxes']
        # Denormalise ground truth boxes
        gt_boxes = box_ops.box_cxcywh_to_xyxy(gt_boxes)
        h, w = target[0]['size']
        scale_fct = torch.stack([w, h, w, h])
        gt_boxes *= scale_fct
        gt_labels = target[0]['labels']

        for c in gt_labels:
            num_gt[c] += 1

        # Associate detections with ground truth
        binary_labels = torch.zeros(len(labels))
        unique_cls = labels.unique()
        for c in unique_cls:
            det_idx = torch.nonzero(labels == c).squeeze(1)
            gt_idx = torch.nonzero(gt_labels == c).squeeze(1)
            if len(gt_idx) == 0:
                continue
            binary_labels[det_idx] = associate(
                gt_boxes[gt_idx].view(-1, 4),
                boxes[det_idx].view(-1, 4),
                scores[det_idx].view(-1))

        meter.append(scores, labels, binary_labels)

    meter.num_gt = num_gt.tolist()
    return meter.eval(), meter.max_rec
def forward(self, outputs, target_sizes):
    """ Perform the computation

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
                      For evaluation, this must be the original image size (before any data augmentation).
                      For visualization, this should be the image size after data augment, but before padding.
    """
    out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
    batchsize = outputs['batchsize']
    num_episode = outputs['num_episode']
    num_queries = outputs['num_queries']
    num_classes = outputs['num_classes']
    out_logits = out_logits.view(batchsize, num_episode * num_queries, num_classes)
    out_bbox = out_bbox.view(batchsize, num_episode * num_queries, 4)

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = out_logits.sigmoid()
    topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
    scores = topk_values
    topk_boxes = topk_indexes // out_logits.shape[2]
    labels = topk_indexes % out_logits.shape[2]
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]

    results = [{
        'scores': s,
        'labels': l,
        'boxes': b
    } for s, l, b in zip(scores, labels, boxes)]

    return results
def forward(self, outputs, target_sizes):
    """Perform the computation

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
                      For evaluation, this must be the original image size (before any data augmentation).
                      For visualization, this should be the image size after data augment, but before padding.
    """
    out_logits, out_bbox = outputs["pred_logits"], outputs["pred_boxes"]

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = F.softmax(out_logits, -1)
    scores, labels = prob[..., :-1].max(-1)
    labels = torch.ones_like(labels)
    scores = 1 - prob[:, :, -1]

    # convert to [x0, y0, x1, y1] format
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]

    assert len(scores) == len(labels) == len(boxes)
    results = [{
        "scores": s,
        "labels": l,
        "boxes": b
    } for s, l, b in zip(scores, labels, boxes)]

    if "pred_isfinal" in outputs:
        is_final = outputs["pred_isfinal"].sigmoid()
        scores_refexp = scores * is_final.view_as(scores)
        assert len(results) == len(scores_refexp)
        for i in range(len(results)):
            results[i]["scores_refexp"] = scores_refexp[i]

    return results