def post_process_top_k(self, predicted_offsets, predicted_scores, score_threshold,
                       iou_threshold, top_k):
    '''
    Turn raw predictions into per-image detections, keeping at most top_k per image.

    For every image and every non-background class (1 .. self.n_classes - 1), boxes
    whose softmax score for that class exceeds score_threshold go through NMS.
    If an image ends up with more than top_k detections, only the top_k highest
    scoring ones are kept, in descending score order; otherwise the detections
    stay in class-by-class order.

    Params:
        predicted_offsets: predicted offsets w.r.t the 8732 prior boxes, (gcxgcy),
            a tensor of dimensions (N, 8732, 4)
        predicted_scores: class scores for each of the encoded locations/boxes,
            a tensor of dimensions (N, 8732, n_classes); softmax is applied here
        score_threshold: minimum score for a box to be considered a match for a class
        iou_threshold: maximum overlap two boxes can have so that the one with the
            lower score is not suppressed via NMS
        top_k: int, maximum number of detections returned per image

    Return:
        boxes, labels, scores: three lists of N tensors
            boxes:  N tensors of shape (n_boxes, 4); (0, 4) when nothing was detected
            labels: N tensors of shape (n_boxes,)
            scores: N tensors of shape (n_boxes,)
    '''
    boxes = list()
    labels = list()
    scores = list()

    N = predicted_offsets.shape[0]
    predicted_scores = F.softmax(predicted_scores, dim=2)  # (N, 8732, n_classes)

    # for each image in the batch
    for i in range(N):
        boxes_i = list()
        labels_i = list()
        scores_i = list()

        # convert gcxgcy to xy coordinates format
        boxes_xy = cxcy_to_xy(gcxgcy_to_cxcy(predicted_offsets[i], self.priors_cxcy))  # (8732, 4)

        for c in range(1, self.n_classes):
            # Keep only boxes whose score for this class is above the minimum score
            class_scores = predicted_scores[i][:, c]  # (8732)
            qualify_mask = class_scores > score_threshold
            if qualify_mask.sum().item() == 0:
                continue

            boxes_class_c = boxes_xy[qualify_mask]  # (n_qualified, 4)
            boxes_score_class_c = class_scores[qualify_mask]  # (n_qualified) <= 8732

            final_box_ids = nms(boxes_class_c, boxes_score_class_c, iou_threshold)  # (n_final_boxes,)

            # Collect tensors directly instead of round-tripping through Python lists.
            boxes_i.append(boxes_class_c[final_box_ids])
            labels_i.append(torch.LongTensor([c] * len(final_box_ids)))
            scores_i.append(boxes_score_class_c[final_box_ids])

        if boxes_i:
            # detach() mirrors the old list-based path, which never carried gradients.
            image_boxes = torch.cat(boxes_i, dim=0).detach().float().to(device)
            image_labels = torch.cat(labels_i, dim=0).to(device)
            image_scores = torch.cat(scores_i, dim=0).detach().float().to(device)
        else:
            # Nothing detected: keep the (n_boxes, 4) layout so column indexing still works.
            image_boxes = torch.zeros((0, 4), dtype=torch.float).to(device)
            image_labels = torch.zeros((0,), dtype=torch.long).to(device)
            image_scores = torch.zeros((0,), dtype=torch.float).to(device)

        # Filter top k objects that have largest confidence score
        if image_boxes.size(0) > top_k:
            image_scores, sort_ind = image_scores.sort(dim=0, descending=True)
            image_scores = image_scores[:top_k]  # (top_k)
            image_boxes = image_boxes[sort_ind[:top_k]]  # (top_k, 4)
            image_labels = image_labels[sort_ind[:top_k]]  # (top_k)

        boxes.append(image_boxes)
        labels.append(image_labels)
        scores.append(image_scores)

    return boxes, labels, scores
def my_post_process(self, predicted_offsets, predicted_scores, score_threshold,
                    iou_threshold, top_k=-1):
    '''
    Post-process predictions by first discarding background boxes, then labelling the rest.

    How this differs from the per-class thresholding in post_process_top_k:
        1: score_threshold decides background vs. object. A box is kept as an
           object only if its background score (class 0) is below score_threshold.
           E.g. with score_threshold=0.75, boxes whose background score is >= 0.75
           are treated as background.
        2: the label of a kept box is the argmax of the softmax output over the
           non-background classes.

    Original author's notes:
        # See precision recall curve for more information
        # 2 times faster than post_process_top_k since a lot of background boxes
        # was filtered out by score_threshold

    Params:
        predicted_offsets: predicted offsets (gcxgcy) w.r.t. the prior boxes, (N, 8732, 4)
        predicted_scores: class scores, (N, 8732, n_classes); softmax is applied here
        score_threshold: upper bound on the background score of a kept box
        iou_threshold: overlap above which the lower-scoring box is suppressed by NMS
        top_k: keep at most this many detections per image; values <= 0 disable the limit

    Return:
        boxes, labels, scores: three lists of N tensors with shapes
        (n_boxes, 4), (n_boxes,), (n_boxes,); boxes is (0, 4) when nothing was detected
    '''
    boxes = list()
    labels = list()
    scores = list()

    N = predicted_offsets.shape[0]
    predicted_scores = F.softmax(predicted_scores, dim=2)  # (N, 8732, n_classes)

    obj_masks = (predicted_scores[:, :, 0] < score_threshold)  # (N, 8732)

    # for each image in the batch
    for i in range(N):
        boxes_i = list()
        labels_i = list()
        scores_i = list()

        obj_mask = obj_masks[i]  # (8732)

        if obj_mask.sum().item() > 0:
            # filter out boxes that are background
            obj_boxes = predicted_offsets[i][obj_mask]  # (n_obj_boxes, 4)
            obj_boxes_score, obj_boxes_class = (
                predicted_scores[i, :, 1:self.n_classes][obj_mask].max(dim=1)
            )  # (n_obj_boxes)
            # argmax was taken over classes 1..n_classes-1, so shift by one to get class ids
            obj_boxes_class += 1

            # convert to xy coordinates format
            obj_boxes = cxcy_to_xy(
                gcxgcy_to_cxcy(obj_boxes, self.priors_cxcy[obj_mask])
            )  # (n_obj_boxes, 4)

            # Non-max suppression, one class at a time
            for class_i in obj_boxes_class.unique(sorted=False).tolist():
                class_mask = (obj_boxes_class == class_i)
                boxes_class_i = obj_boxes[class_mask]
                boxes_score_class_i = obj_boxes_score[class_mask]

                final_box_ids = nms(boxes_class_i, boxes_score_class_i, iou_threshold)

                # Collect tensors directly instead of round-tripping through Python lists.
                boxes_i.append(boxes_class_i[final_box_ids])
                labels_i.append(torch.LongTensor([class_i] * len(final_box_ids)))
                scores_i.append(boxes_score_class_i[final_box_ids])

        if boxes_i:
            # detach() mirrors the old list-based path, which never carried gradients.
            image_boxes = torch.cat(boxes_i, dim=0).detach().float().to(device)
            image_labels = torch.cat(labels_i, dim=0).to(device)
            image_scores = torch.cat(scores_i, dim=0).detach().float().to(device)
        else:
            # Nothing detected: keep the (n_boxes, 4) layout so column indexing still works.
            image_boxes = torch.zeros((0, 4), dtype=torch.float).to(device)
            image_labels = torch.zeros((0,), dtype=torch.long).to(device)
            image_scores = torch.zeros((0,), dtype=torch.float).to(device)

        # Filter top k objects that have largest confidence score
        if image_boxes.size(0) > top_k and top_k > 0:
            image_scores, sort_ind = image_scores.sort(dim=0, descending=True)
            image_scores = image_scores[:top_k]  # (top_k)
            image_boxes = image_boxes[sort_ind[:top_k]]  # (top_k, 4)
            image_labels = image_labels[sort_ind[:top_k]]  # (top_k)

        boxes.append(image_boxes)
        labels.append(image_labels)
        scores.append(image_scores)

    return boxes, labels, scores
def __init__(self, priors_cxcy, threshold=0.5, neg_pos_ratio=3, alpha=1.):
    """Store the prior boxes, the loss hyper-parameters and the two loss criteria.

    :param priors_cxcy: prior boxes; also stored converted by ``cxcy_to_xy``
    :param threshold: stored as ``self.threshold``
        (presumably the overlap threshold used when matching priors; confirm in forward)
    :param neg_pos_ratio: stored as ``self.neg_pos_ratio``
        (presumably the hard-negative to positive ratio; confirm in forward)
    :param alpha: stored as ``self.alpha``
        (presumably the weight of the localization term; confirm in forward)
    """
    super(MultiBoxLoss, self).__init__()
    self.priors_cxcy = priors_cxcy
    self.priors_xy = cxcy_to_xy(priors_cxcy)
    self.threshold = threshold
    self.neg_pos_ratio = neg_pos_ratio
    self.alpha = alpha

    # NOTE(review): despite the attribute name this is plain L1, not nn.SmoothL1Loss.
    # Left unchanged because swapping it would alter training numerics.
    self.smooth_l1 = nn.L1Loss()
    # reduction='none' is the supported spelling of the deprecated reduce=False:
    # the criterion returns one loss value per element.
    self.cross_entropy = nn.CrossEntropyLoss(reduction='none')
def __init__(self, threshold, neg_pos_ratio, alpha, device):
    """Build the loss: default boxes on ``device``, hyper-parameters, unreduced criteria.

    :param threshold: stored as ``self.threshold``
    :param neg_pos_ratio: stored as ``self.hard_neg_scale``
    :param alpha: stored as ``self.alpha``
    :param device: device the default boxes are moved to; also stored as ``self.device``
    """
    super().__init__()

    # Default boxes in centre-size form, plus the same boxes run through cxcy_to_xy.
    default_boxes = get_default_boxes().to(device)
    self.default_cxcy = default_boxes
    self.default_xy = cxcy_to_xy(self.default_cxcy)

    # Hyper-parameters; note that neg_pos_ratio is kept under the name hard_neg_scale.
    self.threshold, self.hard_neg_scale, self.alpha = threshold, neg_pos_ratio, alpha
    self.device = device

    # Both criteria return per-element losses (reduction='none').
    unreduced = 'none'
    self.smooth_l1 = nn.SmoothL1Loss(reduction=unreduced)
    self.cross_entropy = nn.CrossEntropyLoss(reduction=unreduced)
def __init__(self, priors_cxcy, threshold=0.5, neg_pos_ratio=3, alpha=1., focal_loss=False):
    """Store priors and hyper-parameters and create the loss criteria.

    :param priors_cxcy: prior boxes; also stored converted by ``cxcy_to_xy``
    :param threshold: stored as ``self.threshold``
    :param neg_pos_ratio: stored as ``self.neg_pos_ratio``
    :param alpha: stored as ``self.alpha``
    :param focal_loss: when truthy, ``self.focal_loss`` is a ``FocalLoss(reduction='sum')``;
        otherwise it is ``None``
    """
    super(MultiBoxLoss, self).__init__()

    # prior boxes in both representations
    self.priors_cxcy = priors_cxcy
    self.priors_xy = cxcy_to_xy(priors_cxcy)

    # hyper-parameters
    self.threshold, self.neg_pos_ratio, self.alpha = threshold, neg_pos_ratio, alpha

    # criteria: default-reduced smooth L1, per-element cross entropy, optional focal loss
    self.smooth_l1 = nn.SmoothL1Loss()
    self.cross_entropy = nn.CrossEntropyLoss(reduction='none')
    if focal_loss:
        self.focal_loss = FocalLoss(reduction='sum')
    else:
        self.focal_loss = None
def __init__(self, priors_cxcy, threshold=0.5, neg_pos_ratio=3, alpha=1.):
    """Store the prior boxes, hyper-parameters, target device and loss criteria.

    :param priors_cxcy: prior boxes; also stored converted by ``utils.cxcy_to_xy``
    :param threshold: stored as ``self.threshold``
        (presumably the overlap threshold used when matching priors; confirm in forward)
    :param neg_pos_ratio: stored as ``self.neg_pos_ratio``
        (presumably the hard-negative to positive ratio; confirm in forward)
    :param alpha: stored as ``self.alpha``
        (presumably the weight of the localization term; confirm in forward)
    """
    super(MultiBoxLoss, self).__init__()
    self.priors_cxcy = priors_cxcy
    self.priors_xy = utils.cxcy_to_xy(priors_cxcy)
    self.threshold = threshold
    self.neg_pos_ratio = neg_pos_ratio
    self.alpha = alpha
    # First CUDA device when available, CPU otherwise.
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # NOTE(review): despite the attribute name this is plain L1, not nn.SmoothL1Loss.
    # Left unchanged because swapping it would alter training numerics.
    self.smooth_l1 = nn.L1Loss()
    # reduction='none' is the supported spelling of the deprecated reduce=False:
    # the criterion returns one loss value per element.
    self.cross_entropy = nn.CrossEntropyLoss(reduction='none')
def detect(self, predicted_locs, predicted_scores, threshold, max_overlap):
    """Decode predictions for a two-class (background / text) detector and apply NMS.

    Fixes relative to the previous version:
      * candidates are sorted by descending score before the greedy NMS pass, so a
        lower-scoring box can no longer suppress a higher-scoring one;
      * the suppression mask is a bool tensor combined with ``|`` and inverted with
        ``~`` (the old ``suppress, _ = torch.max(a, b)`` unpacked a single tensor);
      * the "nothing found" placeholder is now reachable and has shape (1, 4).

    :param predicted_locs: predicted box offsets w.r.t. ``self.priors_cxcy``,
        indexed per image along dim 0
    :param predicted_scores: class scores; softmax is taken over dim 2 and column 1
        is used as the text score (original note: (batch_size, 8732, 2))
    :param threshold: minimum text score for a box to be considered
    :param max_overlap: overlap above which the lower-scoring box is suppressed
    :return: (all_image_boxes, all_image_scores), two lists with one tensor per image;
        an image with no box above ``threshold`` gets the placeholder box
        [[0., 0., 1., 1.]] with score 0.
    """
    batch_size = predicted_locs.size(0)
    predicted_scores = torch.nn.functional.softmax(
        predicted_scores, dim=2)  # (batch_size, 8732, 2)

    all_image_boxes = list()
    all_image_scores = list()

    for i in range(batch_size):
        decode_locs = cxcy_to_xy(
            gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy))

        text_scores = predicted_scores[i][:, 1]
        score_above_threshold = text_scores > threshold
        n_score_above_threshold = score_above_threshold.sum().item()

        if n_score_above_threshold == 0:
            # No candidate at all: store a placeholder instead of running NMS on nothing.
            all_image_boxes.append(
                torch.FloatTensor([[0., 0., 1., 1.]]).to(device))
            all_image_scores.append(torch.FloatTensor([0.]).to(device))
            continue

        text_scores = text_scores[score_above_threshold]
        text_decoded_locs = decode_locs[score_above_threshold]

        # Greedy NMS visits boxes in order, so the order must be by descending score.
        text_scores, sort_ind = text_scores.sort(dim=0, descending=True)
        text_decoded_locs = text_decoded_locs[sort_ind]

        # Pairwise overlap; IoU is fed centre-size boxes exactly as before.
        overlap = IoU(xy_to_cxcy(text_decoded_locs),
                      xy_to_cxcy(text_decoded_locs))

        # True means "suppress this box".
        suppress = torch.zeros(n_score_above_threshold, dtype=torch.bool).to(device)

        for box in range(text_decoded_locs.size(0)):
            if suppress[box]:
                continue
            # OR keeps earlier suppressions and adds boxes overlapping this one too much.
            suppress = suppress | (overlap[box] > max_overlap)
            # A box overlaps itself completely; it must not suppress itself.
            suppress[box] = False

        keep = ~suppress
        all_image_boxes.append(text_decoded_locs[keep])
        all_image_scores.append(text_scores[keep])

    return all_image_boxes, all_image_scores
def __init__(self, threshold=0.5, neg_pos_ratio=3, alpha=1., device=DEVICE):
    """Create the default boxes on ``device`` and set up hyper-parameters and criteria.

    :param threshold: stored as ``self.threshold``
        (presumably the overlap threshold used when matching priors; confirm in forward)
    :param neg_pos_ratio: stored as ``self.neg_pos_ratio``
        (presumably the hard-negative to positive ratio; confirm in forward)
    :param alpha: stored as ``self.alpha``
        (presumably the weight of the localization term; confirm in forward)
    :param device: device the default boxes are moved to; also stored as ``self.device``
    """
    super(MultiBoxLoss, self).__init__()
    self.priors_cxcy = get_default_boxes().to(device)
    self.priors_xy = cxcy_to_xy(self.priors_cxcy)
    self.threshold = threshold
    self.neg_pos_ratio = neg_pos_ratio
    self.alpha = alpha
    self.device = device

    # NOTE(review): despite the attribute name this is plain L1, not nn.SmoothL1Loss.
    # Left unchanged because swapping it would alter training numerics.
    self.smooth_l1 = nn.L1Loss()
    # reduction='none' is the supported spelling of the deprecated reduce=False:
    # the criterion returns one loss value per element.
    self.cross_entropy = nn.CrossEntropyLoss(reduction='none')
def post_processing(self, pred, is_demo=False):
    """Decode raw predictions into clamped corner-format boxes plus class scores.

    Only the leading batch dimension is squeezed (``squeeze(0)``). The previous bare
    ``squeeze()`` also dropped any other size-1 dimension, e.g. a single-class score
    axis or a single anchor.

    :param pred: indexable pair; ``pred[0]`` holds location predictions and
        ``pred[1]`` class predictions, both with the anchor count on dim 1
    :param is_demo: when True, ``self.assign_anchors_to_cpu()`` is called and both
        prediction tensors are moved to the CPU
    :return: ``(pred_bboxes, pred_scores)``; boxes are run through ``cxcy_to_xy`` and
        clamped to [0, 1]. Original notes give the batch-1 shapes as [67995, 4] and
        [67995, num_classes].
    """
    if is_demo:
        self.assign_anchors_to_cpu()
        pred_loc = pred[0].to('cpu')
        pred_cls = pred[1].to('cpu')
    else:
        pred_loc = pred[0]
        pred_cls = pred[1]

    n_priors = self.center_anchor.size(0)
    assert n_priors == pred_loc.size(1) == pred_cls.size(1)

    # The boxes coming out of decode are in centre coordinates.
    # Convert them to corner coordinates (x1, y1, x2, y2) and clamp into the 0 ~ 1 range.
    # Original author's note: this raised a score from
    # 0.3109697496017331 to 0.3115717185294685.
    pred_bboxes = cxcy_to_xy(self.decode(pred_loc.squeeze(0))).clamp(
        0, 1)  # for batch 1, [67995, 4]
    pred_scores = pred_cls.squeeze(0)  # for batch 1, [67995, num_classes]

    return pred_bboxes, pred_scores
def detect_objects(self, predicted_locs, predicted_scores, min_score, max_overlap, top_k, device):
    """
    Decipher the 8732 locations and class scores (output of the SSD300) to detect objects.

    For each class, perform Non-Maximum Suppression (NMS) on boxes that are above a minimum threshold.

    The suppression mask is a bool tensor combined with ``|`` and inverted with ``~``;
    the previous uint8 mask relied on ``torch.max(uint8, bool)`` and on deprecated
    uint8 mask indexing via ``1 - suppress``.

    :param predicted_locs: predicted locations/boxes w.r.t the 8732 prior boxes, a tensor of dimensions (N, 8732, 4)
    :param predicted_scores: class scores for each of the encoded locations/boxes, a tensor of dimensions (N, 8732, n_classes)
    :param min_score: minimum threshold for a box to be considered a match for a certain class
    :param max_overlap: maximum overlap two boxes can have so that the one with the lower score is not suppressed via NMS
    :param top_k: if there are a lot of resulting detection across all classes, keep only the top 'k'
    :param device: device on which the suppression mask, labels and placeholders are created
    :return: detections (boxes, labels, and scores), lists of length batch_size
    """
    batch_size = predicted_locs.size(0)
    n_priors = self.priors_cxcy.size(0)
    predicted_scores = F.softmax(predicted_scores, dim=2)  # (N, 8732, n_classes)

    # Lists to store final predicted boxes, labels, and scores for all images
    all_images_boxes = list()
    all_images_labels = list()
    all_images_scores = list()

    assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

    for i in range(batch_size):
        # Decode object coordinates from the form we regressed predicted boxes to
        decoded_locs = cxcy_to_xy(
            gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy)
        )  # (8732, 4), these are fractional pt. coordinates

        # Lists to store boxes and scores for this image
        image_boxes = list()
        image_labels = list()
        image_scores = list()

        # Check for each class
        for c in range(1, self.n_classes):
            # Keep only predicted boxes and scores where scores for this class are above the minimum score
            class_scores = predicted_scores[i][:, c]  # (8732)
            score_above_min_score = class_scores > min_score  # bool tensor, for indexing
            n_above_min_score = score_above_min_score.sum().item()
            if n_above_min_score == 0:
                continue
            class_scores = class_scores[score_above_min_score]  # (n_qualified), n_qualified <= 8732
            class_decoded_locs = decoded_locs[score_above_min_score]  # (n_qualified, 4)

            # Sort predicted boxes and scores by scores
            class_scores, sort_ind = class_scores.sort(dim=0, descending=True)  # (n_qualified)
            class_decoded_locs = class_decoded_locs[sort_ind]  # (n_qualified, 4)

            # Find the overlap between predicted boxes
            overlap = find_jaccard_overlap(
                class_decoded_locs, class_decoded_locs)  # (n_qualified, n_qualified)

            # Non-Maximum Suppression (NMS)

            # A bool tensor to keep track of which predicted boxes to suppress
            # True implies suppress, False implies don't suppress
            suppress = torch.zeros((n_above_min_score), dtype=torch.bool).to(device)  # (n_qualified)

            # Consider each box in order of decreasing scores
            for box in range(class_decoded_locs.size(0)):
                # If this box is already marked for suppression
                if suppress[box]:
                    continue

                # Suppress boxes whose overlaps (with this box) are greater than maximum overlap
                # The OR retains previously suppressed boxes
                suppress = suppress | (overlap[box] > max_overlap)

                # Don't suppress this box, even though it has an overlap of 1 with itself
                suppress[box] = False

            # Store only unsuppressed boxes for this class
            keep = ~suppress
            image_boxes.append(class_decoded_locs[keep])
            image_labels.append(
                torch.LongTensor(keep.sum().item() * [c]).to(device))
            image_scores.append(class_scores[keep])

        # If no object in any class is found, store a placeholder for 'background'
        if len(image_boxes) == 0:
            image_boxes.append(
                torch.FloatTensor([[0., 0., 1., 1.]]).to(device))
            image_labels.append(torch.LongTensor([0]).to(device))
            image_scores.append(torch.FloatTensor([0.]).to(device))

        # Concatenate into single tensors
        image_boxes = torch.cat(image_boxes, dim=0)  # (n_objects, 4)
        image_labels = torch.cat(image_labels, dim=0)  # (n_objects)
        image_scores = torch.cat(image_scores, dim=0)  # (n_objects)
        n_objects = image_scores.size(0)

        # Keep only the top k objects
        if n_objects > top_k:
            image_scores, sort_ind = image_scores.sort(dim=0, descending=True)
            image_scores = image_scores[:top_k]  # (top_k)
            image_boxes = image_boxes[sort_ind][:top_k]  # (top_k, 4)
            image_labels = image_labels[sort_ind][:top_k]  # (top_k)

        # Append to lists that store predicted boxes and scores for all images
        all_images_boxes.append(image_boxes)
        all_images_labels.append(image_labels)
        all_images_scores.append(image_scores)

    return all_images_boxes, all_images_labels, all_images_scores  # lists of length batch_size
def my_post_process_deprecated(self, predicted_offsets, predicted_scores, score_threshold,
                               iou_threshold):
    '''
    Deprecated post-processing: label each box with the argmax of its softmax output.

    A box is kept only when its argmax class is not background (class 0) and the
    winning score exceeds score_threshold; the kept boxes then go through NMS class
    by class.

    Original author's assessment of why this was abandoned: the winning softmax
    score is usually > 0.3, so score_threshold does little, and the model leans
    towards background (neg_pos_ratio=3), so background often wins, e.g.
    (0.55, 0.01, 0.45, 0.04, 0.0, ...). The author reports high precision
    (probably > 95%) but very low recall, and low APs / mAP overall.

    Params:
        predicted_offsets: predicted offsets w.r.t the 8732 prior boxes, (gcxgcy),
            a tensor of dimensions (N, 8732, 4)
        predicted_scores: class scores for each of the encoded locations/boxes,
            a tensor of dimensions (N, 8732, n_classes)
        score_threshold: minimum winning score for a box to be kept
        iou_threshold: maximum overlap two boxes can have so that the one with the
            lower score is not suppressed via NMS

    Return:
        detections: (boxes, labels, scores), each a list of N tensors
            boxes:  N (n_boxes, 4)
            labels: N (n_boxes,)
            scores: N (n_boxes,)
    '''
    n_images, _ = predicted_offsets.shape[0:2]

    probabilities = F.softmax(predicted_scores, dim=2)  # (N, 8732, n_classes)
    # winning score and winning class id of every box
    top_scores, top_ids = probabilities.max(dim=2)  # (N, 8732) each

    boxes, labels, scores = list(), list(), list()

    for img in range(n_images):
        kept_boxes, kept_labels, kept_scores = list(), list(), list()

        # candidates: argmax is a real class and its score clears the threshold
        is_candidate = (top_ids[img] != 0) & (top_scores[img] > score_threshold)  # (8732)
        cand_offsets = predicted_offsets[img][is_candidate]  # (n_candidates, 4)
        cand_ids = top_ids[img][is_candidate]  # (n_candidates)
        cand_scores = top_scores[img][is_candidate]  # (n_candidates)

        if len(cand_offsets) > 0:
            # decode the offsets against their own priors into xy boxes
            cand_xy = cxcy_to_xy(
                gcxgcy_to_cxcy(cand_offsets, self.priors_cxcy[is_candidate])
            )  # (n_candidates, 4)

            # NMS is run independently for every class that occurs
            for cls in cand_ids.unique(sorted=False).tolist():
                of_this_class = cand_ids == cls
                cls_boxes = cand_xy[of_this_class]
                cls_scores = cand_scores[of_this_class]

                survivors = nms(cls_boxes, cls_scores, iou_threshold)  # (n_final_boxes,)

                kept_boxes.extend(cls_boxes[survivors].tolist())
                kept_labels.extend([cls] * len(survivors))
                kept_scores.extend(cls_scores[survivors].tolist())

        boxes.append(torch.FloatTensor(kept_boxes).to(device))
        labels.append(torch.LongTensor(kept_labels).to(device))
        scores.append(torch.FloatTensor(kept_scores).to(device))

    return boxes, labels, scores
def detect(self, locs_pred, cls_pred, min_score, max_overlap, top_k):
    '''
    Detect objects: per class, run NMS on the boxes whose score is above a minimum threshold.

    The suppression mask is a bool tensor combined with ``|`` and inverted with ``~``.
    The previous version re-wrapped the comparison with ``torch.tensor(...)`` and
    indexed with a uint8 mask through ``1 - suppress``.

    locs_pred: predicted locations, a tensor of dimensions (N, 8732, 4)
    cls_pred: predicted class scores for each of the encoded boxes, a tensor of
        dimensions (N, 8732, n_classes); softmax is applied here
    min_score: minimum score for a box to be considered for a class
    max_overlap: maximum overlap two boxes can have before the lower-scoring one is suppressed
    top_k: if there are many detections across all classes, keep only the top 'k'

    Out: three lists of length N: boxes, labels, scores. An image with no detection
        gets the placeholder box [[0., 0., 1., 1.]] with label 0 and score 0.
    '''
    batch_size = locs_pred.size(0)  # N
    n_default_boxes = self.default_boxes.size(0)  # 8732
    cls_pred = F.softmax(cls_pred, dim=2)  # (N, 8732, n_classes)
    assert n_default_boxes == locs_pred.size(1) == cls_pred.size(1)

    all_images_boxes = []
    all_images_labels = []
    all_images_scores = []

    for i in range(batch_size):
        # Decode the predicted offsets into boxes
        decoded_locs = cxcy_to_xy(decode_bboxes(locs_pred[i], self.default_boxes))  # (8732, 4)

        image_boxes = []
        image_labels = []
        image_scores = []

        # Check for each class (0 is skipped as background)
        for c in range(1, self.num_classes):
            class_scores = cls_pred[i][:, c]  # 8732
            score_above_min_score = class_scores > min_score
            n_above_min_score = score_above_min_score.sum().item()

            if n_above_min_score == 0:
                continue

            class_scores = class_scores[score_above_min_score]  # <= 8732
            class_decoded_locs = decoded_locs[score_above_min_score]  # <= 8732

            # Sort predicted boxes and scores by score, best first
            class_scores, sort_id = class_scores.sort(dim=0, descending=True)
            class_decoded_locs = class_decoded_locs[sort_id]

            # Pairwise overlap between the candidate boxes
            overlap = find_IoU(class_decoded_locs, class_decoded_locs)

            # NMS: True marks a box as suppressed
            suppress = torch.zeros((n_above_min_score), dtype=torch.bool).to(device)

            for box_id in range(class_decoded_locs.size(0)):
                if suppress[box_id]:
                    continue
                # OR keeps earlier suppressions and adds boxes overlapping this one too much
                suppress = suppress | (overlap[box_id] > max_overlap)
                # A box overlaps itself completely; it must not suppress itself
                suppress[box_id] = False

            # Store only unsuppressed boxes for this class
            keep = ~suppress
            image_boxes.append(class_decoded_locs[keep])
            image_labels.append(torch.LongTensor(keep.sum().item() * [c]).to(device))
            image_scores.append(class_scores[keep])

        # Nothing found in any class: store a placeholder for 'background'
        if len(image_boxes) == 0:
            image_boxes.append(torch.FloatTensor([[0., 0., 1., 1.]]).to(device))
            image_labels.append(torch.LongTensor([0]).to(device))
            image_scores.append(torch.FloatTensor([0.]).to(device))

        # Concatenate into single tensors
        image_boxes = torch.cat(image_boxes, dim=0)  # (n_objects, 4)
        image_labels = torch.cat(image_labels, dim=0)  # (n_objects)
        image_scores = torch.cat(image_scores, dim=0)  # (n_objects)
        n_objects = image_scores.size(0)

        # Keep only the top k objects
        if n_objects > top_k:
            image_scores, sort_index = image_scores.sort(dim=0, descending=True)
            image_scores = image_scores[:top_k]  # (top_k)
            image_boxes = image_boxes[sort_index][:top_k]  # (top_k, 4)
            image_labels = image_labels[sort_index][:top_k]  # (top_k)

        all_images_boxes.append(image_boxes)
        all_images_labels.append(image_labels)
        all_images_scores.append(image_scores)

    return all_images_boxes, all_images_labels, all_images_scores
def forward(self, locs_pred, cls_pred, boxes, labels):
    '''
    Forward propagation: compute the MultiBox loss.

    locs_pred: predicted locations, a tensor of dimensions (N, 8732, 4)
    cls_pred: predicted class scores for each of the encoded boxes, a tensor of
        dimensions (N, 8732, n_classes)
    boxes: true object bounding boxes, a list of N tensors
    labels: true object labels, a list of N tensors

    Out: MultiBox loss, i.e. self.alpha * localization loss + confidence loss
    '''
    batch_size = locs_pred.size(0)  # N
    n_default_boxes = self.default_boxes.size(0)  # 8732
    num_classes = cls_pred.size(2)  # num_classes

    t_locs = torch.zeros((batch_size, n_default_boxes, 4),
                         dtype=torch.float).to(device)  # (N, 8732, 4)
    t_classes = torch.zeros((batch_size, n_default_boxes),
                            dtype=torch.long).to(device)  # (N, 8732)

    default_boxes_xy = cxcy_to_xy(self.default_boxes)
    for i in range(batch_size):
        n_objects = boxes[i].size(0)

        overlap = find_IoU(boxes[i], default_boxes_xy)  # (n_objects, 8732)

        # For each default box, find the object with maximum overlap
        overlap_each_default_box, object_each_default_box = overlap.max(dim=0)  # (8732)

        # For each object, find the default box with maximum overlap
        _, default_boxes_each_object = overlap.max(dim=1)

        # Force every object onto its best default box and give that pairing overlap 1,
        # so it is never turned into background by the threshold below
        object_each_default_box[default_boxes_each_object] = torch.LongTensor(
            range(n_objects)).to(device)
        overlap_each_default_box[default_boxes_each_object] = 1.

        # Labels for each default box; below-threshold overlaps become background (0)
        label_each_default_box = labels[i][object_each_default_box]  # (8732)
        label_each_default_box[overlap_each_default_box < self.threshold] = 0  # (8732)

        # Save
        t_classes[i] = label_each_default_box

        # Encode the matched ground-truth boxes relative to the default boxes
        t_locs[i] = encode_bboxes(
            xy_to_cxcy(boxes[i][object_each_default_box]),
            self.default_boxes)  # (8732, 4)

    # Identify default boxes that are positive (non-background)
    pos_default_boxes = t_classes != 0  # (N, 8732)

    # Localization loss, computed only over positive default boxes
    smooth_L1_loss = nn.SmoothL1Loss()
    loc_loss = smooth_L1_loss(locs_pred[pos_default_boxes],
                              t_locs[pos_default_boxes])

    # Confidence loss with hard negative mining
    # Number of positive and hard-negative default boxes per image
    n_positive = pos_default_boxes.sum(dim=1)
    n_hard_negatives = self.neg_pos * n_positive

    # Per-box loss for all default boxes.
    # reduction='none' is the supported spelling of the deprecated reduce=False.
    cross_entropy_loss = nn.CrossEntropyLoss(reduction='none')
    confidence_loss_all = cross_entropy_loss(
        cls_pred.view(-1, num_classes), t_classes.view(-1))  # (N*8732)
    confidence_loss_all = confidence_loss_all.view(
        batch_size, n_default_boxes)  # (N, 8732)

    confidence_pos_loss = confidence_loss_all[pos_default_boxes]

    # Hard negatives: zero out positives, sort each image's losses in descending order,
    # and keep the first n_hard_negatives entries of each row
    confidence_neg_loss = confidence_loss_all.clone()  # (N, 8732)
    confidence_neg_loss[pos_default_boxes] = 0.
    confidence_neg_loss, _ = confidence_neg_loss.sort(dim=1, descending=True)

    hardness_ranks = torch.LongTensor(range(n_default_boxes)).unsqueeze(
        0).expand_as(confidence_neg_loss).to(device)  # (N, 8732)

    hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)  # (N, 8732)

    confidence_hard_neg_loss = confidence_neg_loss[hard_negatives]

    # Summed over positives and hard negatives, averaged over the positives only
    confidence_loss = (
        confidence_hard_neg_loss.sum() + confidence_pos_loss.sum()
    ) / n_positive.sum().float()

    return self.alpha * loc_loss + confidence_loss
def detect_objects(self, predicted_locs, predicted_scores, min_score, max_overlap, top_k):
    """
    Decode predictions and, per class, run NMS on the boxes above a minimum score.

    Fixes relative to the previous version:
      * the suppression mask is now initialised to all-False before the NMS loop
        (it used to be built from ``suppress`` and ``box`` before either existed,
        which raised as soon as any class had a candidate);
      * no hard-coded ``torch.cuda.ByteTensor``, so the CPU fallback of ``device`` works;
      * the mask is a bool tensor combined with ``|`` and inverted with ``~``;
      * the leftover debug ``print`` inside the class loop is removed.

    :param predicted_locs: predicted box offsets w.r.t. ``self.priors_cxcy``, (N, n_priors, 4)
    :param predicted_scores: class scores, (N, n_priors, n_classes); softmax is applied here
    :param min_score: minimum score for a box to be considered for a class
    :param max_overlap: maximum overlap two boxes can have before the lower-scoring one is suppressed
    :param top_k: keep at most this many detections per image
    :return: (boxes, labels, scores), three lists of length batch_size; an image with no
        detection gets the placeholder box [[0., 0., 1., 1.]] with label 0 and score 0.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = predicted_locs.size(0)
    n_priors = self.priors_cxcy.size(0)
    predicted_scores = F.softmax(predicted_scores, dim=2)

    all_images_boxes = list()
    all_images_labels = list()
    all_images_scores = list()

    assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

    for i in range(batch_size):
        # Decode object coordinates from the form we regressed predicted boxes to
        decoded_locs = cxcy_to_xy(
            gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy)
        )  # (n_priors, 4), these are fractional pt. coordinates

        image_boxes = list()
        image_labels = list()
        image_scores = list()

        for c in range(1, self.n_classes):
            # Keep only predicted boxes and scores where scores for this class are above the minimum score
            class_scores = predicted_scores[i][:, c]
            score_above_min_score = class_scores > min_score  # bool tensor, for indexing
            n_above_min_score = score_above_min_score.sum().item()
            if n_above_min_score == 0:
                continue
            class_scores = class_scores[score_above_min_score]
            class_decoded_locs = decoded_locs[score_above_min_score]

            # Sort predicted boxes and scores by scores
            class_scores, sort_ind = class_scores.sort(dim=0, descending=True)
            class_decoded_locs = class_decoded_locs[sort_ind]

            # Find the overlap between predicted boxes
            overlap = find_jaccard_overlap(class_decoded_locs, class_decoded_locs)

            # Non-Maximum Suppression (NMS)

            # A bool tensor to keep track of which predicted boxes to suppress
            # True implies suppress, False implies don't suppress
            suppress = torch.zeros((n_above_min_score), dtype=torch.bool).to(device)  # (n_qualified)

            # Consider each box in order of decreasing scores
            for box in range(class_decoded_locs.size(0)):
                # If this box is already marked for suppression
                if suppress[box]:
                    continue

                # Suppress boxes whose overlaps (with this box) are greater than maximum overlap
                # The OR retains previously suppressed boxes
                suppress = suppress | (overlap[box] > max_overlap)

                # Don't suppress this box, even though it has an overlap of 1 with itself
                suppress[box] = False

            # Store only unsuppressed boxes for this class
            keep = ~suppress
            image_boxes.append(class_decoded_locs[keep])
            image_labels.append(
                torch.LongTensor(keep.sum().item() * [c]).to(device))
            image_scores.append(class_scores[keep])

        # If no object in any class is found, store a placeholder for 'background'
        if len(image_boxes) == 0:
            image_boxes.append(
                torch.FloatTensor([[0., 0., 1., 1.]]).to(device))
            image_labels.append(torch.LongTensor([0]).to(device))
            image_scores.append(torch.FloatTensor([0.]).to(device))

        # Concatenate into single tensors
        image_boxes = torch.cat(image_boxes, dim=0)  # (n_objects, 4)
        image_labels = torch.cat(image_labels, dim=0)  # (n_objects)
        image_scores = torch.cat(image_scores, dim=0)  # (n_objects)
        n_objects = image_scores.size(0)

        # Keep only the top k objects
        if n_objects > top_k:
            image_scores, sort_ind = image_scores.sort(dim=0, descending=True)
            image_scores = image_scores[:top_k]  # (top_k)
            image_boxes = image_boxes[sort_ind][:top_k]  # (top_k, 4)
            image_labels = image_labels[sort_ind][:top_k]  # (top_k)

        # Append to lists that store predicted boxes and scores for all images
        all_images_boxes.append(image_boxes)
        all_images_labels.append(image_labels)
        all_images_scores.append(image_scores)

    return all_images_boxes, all_images_labels, all_images_scores  # lists of length batch_size