def forward(self, loc_data, conf_data, priors):
    """Decode box predictions and run per-class NMS for every image.

    Args:
        loc_data: NumPy array indexed per image as ``loc_data[i]`` and handed
            to ``decode``. Presumably ``[batch, num_priors, 4]`` -- TODO
            confirm against the caller.
        conf_data: NumPy array whose axes 1 and 2 are swapped so it can be
            indexed as ``[image][class]``. Presumably
            ``[batch, num_priors, num_classes]`` -- TODO confirm.
        priors: prior boxes, forwarded to ``decode`` unchanged.

    Returns:
        ``float32`` array of shape
        ``[batch, self.num_classes, self.top_k, 5]``. Every filled row is
        ``(score, box[0], box[1], box[2], box[3])``; unused rows stay zero.
    """
    batch_size = loc_data.shape[0]
    output = np.zeros(
        shape=(batch_size, self.num_classes, self.top_k, 5),
        dtype=np.float32)
    conf_preds = conf_data.swapaxes(2, 1)

    for i in range(batch_size):
        decoded_boxes = decode(
            loc=loc_data[i], priors=priors, variances=self.variances)
        conf_scores = conf_preds[i].copy()

        # The loop starts at 1, so class index 0 is never processed.
        for cl in range(1, self.num_classes):
            c_mask = np.greater(conf_scores[cl], self.conf_thresh)
            scores = conf_scores[cl][c_mask].astype(np.float32)
            if scores.shape[0] == 0:
                continue
            boxes = decoded_boxes[c_mask].reshape(-1, 4).astype(np.float32)

            # Only nms() sees tensors. The NumPy copies are kept for the
            # gather below so that no torch tensor is indexed with a NumPy
            # int32 array or passed to np.concatenate.
            ids, count = nms(torch.from_numpy(boxes).float(),
                             torch.from_numpy(scores).float(),
                             self.nms_thresh, self.top_k)
            count = int(count)
            keep = np.asarray(ids).astype(np.int64)[:count]

            output[i, cl, :count] = np.concatenate(
                (scores[keep, np.newaxis], boxes[keep]), axis=-1)
    return output
def run_first_stage(image, net, scale, threshold, gpu_id=0):
    """Apply P-Net to one rescaled copy of the image, then run NMS.

    Arguments:
        image: an instance of PIL.Image.
        net: an instance of pytorch's nn.Module, P-Net.
        scale: a float number,
            the image width and height are multiplied by it.
        threshold: a float number,
            forwarded to `_generate_bboxes` as the face-probability cutoff.
        gpu_id: an integer, index of the CUDA device the input is moved to.

    Returns:
        a float numpy array of shape [n_boxes, 9],
            bounding boxes with scores and offsets (4 + 1 + 4),
            or None when no bounding box was generated.
    """
    # resize the image, then turn it into a float tensor on the GPU
    orig_w, orig_h = image.size
    target_size = (math.ceil(orig_w * scale), math.ceil(orig_h * scale))
    resized = image.resize(target_size, Image.BILINEAR)
    pixels = np.asarray(resized, 'float32')
    net_input = torch.FloatTensor(_preprocess(pixels)).to('cuda:%d' % gpu_id)

    predictions = net(net_input)
    # offsets: transformations to true bounding boxes
    offsets = predictions[0].cpu().data.numpy()
    # probs: probability of a face at each sliding window
    probs = predictions[1].cpu().data.numpy()[0, 1, :, :]

    candidates = _generate_bboxes(probs, offsets, scale, threshold)
    if len(candidates) == 0:
        return None

    selected = nms(candidates[:, 0:5], overlap_threshold=0.5)
    return candidates[selected]
def dofilter(frames, action_index, frame_index, nms_thresh):
    """Select the best detections of one action in one frame and NMS them.

    Args:
        frames: sequence of dicts; ``frames[frame_index]`` must provide the
            NumPy arrays ``'scores'`` (indexed ``[detection, action]``) and
            ``'boxes'`` (one row per detection).
        action_index: column of ``'scores'`` to rank detections by.
        frame_index: which entry of ``frames`` to process.
        nms_thresh: overlap threshold forwarded to ``nms``.

    Returns:
        Tuple ``(boxes, scores, allscores)`` restricted to the surviving
        detections: box rows, their score for ``action_index``, and their
        full score rows. All three are empty when no detection scores above
        0.001.
    """
    frame = frames[frame_index]
    action_scores = frame['scores'][:, action_index]

    # filter out least likely detections for actions
    candidates = np.where(action_scores > 0.001)[0]
    # sort in descending order and pick at most 50
    order = np.argsort(action_scores[candidates])[::-1][:50]
    chosen = candidates[order]

    scores = action_scores[chosen]
    boxes = frame['boxes'][chosen, :]
    allscores = frame['scores'][chosen, :]

    # The three arrays share one index, so a single length check covers all.
    if len(scores) == 0:
        return boxes, scores, allscores

    # Perform nms on picked boxes
    keep, count = nms(torch.from_numpy(boxes), torch.from_numpy(scores),
                      nms_thresh)
    keep = keep[:count].cpu().numpy()
    return boxes[keep, :], scores[keep], allscores[keep, :]
def get_nmsed_box(rpn_rois, confs, locs, class_nums, im_info):
    """Decode per-class boxes, apply NMS and cap the detections per image.

    Args:
        rpn_rois: object exposing ``lod()``; ``lod()[0]`` gives the row
            offsets that split the RoIs by image. Convertible with
            ``np.array``.
        confs: per-RoI class scores, convertible with ``np.array``; column
            ``j`` is read as the score of class ``j``.
        locs: per-RoI box regression output, convertible with ``np.array``.
        class_nums: number of classes; class 0 is skipped.
        im_info: per-image info; ``im_info[i][:2]`` and ``im_info[i][2]`` are
            used as image size and scale (presumably ``(h, w, scale)`` --
            TODO confirm).

    Returns:
        ``(new_lod, im_results)``. ``im_results`` stacks one ``float32`` row
        per kept detection laid out as ``[score, box (4 values), label]``;
        ``new_lod`` holds the cumulative row offsets of the processed images.
    """
    # Column of the score inside the rows built below ([score, box, label]).
    score_col = 0

    lod = rpn_rois.lod()[0]
    rpn_rois_v = np.array(rpn_rois)
    variance_v = np.array(cfg.bbox_reg_weights)
    confs_v = np.array(confs)
    locs_v = np.array(locs)
    num_images = len(lod) - 1
    im_results = [[] for _ in range(num_images)]
    new_lod = [0]

    for i in range(num_images):
        start = lod[i]
        end = lod[i + 1]
        if start == end:
            # NOTE(review): an image without RoIs adds no entry to new_lod
            # and leaves an empty list in im_results; confirm the final
            # vstack and the callers cope with that.
            continue
        locs_n = locs_v[start:end, :]
        rois_n = rpn_rois_v[start:end, :]
        rois_n = rois_n / im_info[i][2]
        rois_n = box_decoder(locs_n, rois_n, variance_v)
        rois_n = clip_tiled_boxes(rois_n, im_info[i][:2] / im_info[i][2])

        cls_boxes = [[] for _ in range(class_nums)]
        scores_n = confs_v[start:end, :]
        for j in range(1, class_nums):
            inds = np.where(scores_n[:, j] > cfg.TEST.score_thresh)[0]
            scores_j = scores_n[inds, j]
            rois_j = rois_n[inds, j * 4:(j + 1) * 4]
            dets_j = np.hstack(
                (scores_j[:, np.newaxis], rois_j)).astype(
                    np.float32, copy=False)
            keep = box_utils.nms(dets_j, cfg.TEST.nms_thresh)
            nms_dets = dets_j[keep, :]
            # add labels as the last column
            label = np.full(len(keep), j)
            nms_dets = np.hstack(
                (nms_dets, label[:, np.newaxis])).astype(
                    np.float32, copy=False)
            cls_boxes[j] = nms_dets

        # Limit to max_per_image detections **over all classes**.
        # The score is column 0 of the rows built above; the previous code
        # read column 1, which is the first box coordinate.
        # NOTE(review): the removed dead locals indexed the rows as
        # [label, score, box]; confirm which layout downstream code expects.
        image_scores = np.hstack(
            [cls_boxes[j][:, score_col] for j in range(1, class_nums)])
        if len(image_scores) > cfg.TEST.detections_per_im:
            image_thresh = np.sort(image_scores)[-cfg.TEST.detections_per_im]
            for j in range(1, class_nums):
                keep = np.where(cls_boxes[j][:, score_col] >= image_thresh)[0]
                cls_boxes[j] = cls_boxes[j][keep, :]

        im_results_n = np.vstack([cls_boxes[j] for j in range(1, class_nums)])
        im_results[i] = im_results_n
        new_lod.append(len(im_results_n) + new_lod[-1])

    im_results = np.vstack([im_results[k] for k in range(num_images)])
    return new_lod, im_results
def forward(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh,
            loc_data, conf_data, prior_data):
    """Store the detection settings on ``self`` and run per-class NMS.

    Args:
        num_classes: number of classes; stored as ``self.num_classes``.
        bkg_label: stored as ``self.background_label`` (not otherwise used
            here).
        top_k: maximum detections kept per class and image.
        conf_thresh: only scores strictly greater than this are considered.
        nms_thresh: overlap threshold for ``nms``; must be > 0.
        loc_data: (tensor) loc preds, indexed per image as ``loc_data[i]``.
        conf_data: (tensor) conf preds, viewed here as
            ``[batch, num_priors, num_classes]``.
        prior_data: (tensor) prior boxes; ``size(0)`` is taken as the number
            of priors.

    Returns:
        Tensor of shape ``[batch, num_classes, top_k, 5]``. Filled rows are
        ``(score, box[0..3])``; unused rows stay zero.

    Raises:
        ValueError: if ``nms_thresh`` is zero or negative.
    """
    self.num_classes = num_classes
    self.background_label = bkg_label
    self.top_k = top_k
    # Parameters used in nms.
    self.nms_thresh = nms_thresh
    if nms_thresh <= 0:
        # The check rejects 0 as well, so "positive" is the accurate wording.
        raise ValueError('nms_threshold must be positive.')
    self.conf_thresh = conf_thresh
    self.variance = cfg['variance']

    num = loc_data.size(0)  # batch size
    num_priors = prior_data.size(0)
    output = torch.zeros(num, self.num_classes, self.top_k, 5)
    conf_preds = conf_data.view(
        num, num_priors, self.num_classes).transpose(2, 1)

    # Decode predictions into bboxes.
    for i in range(num):
        decoded_boxes = decode(loc_data[i], prior_data, self.variance)
        # For each class, perform nms
        conf_scores = conf_preds[i].clone()
        for cl in range(1, self.num_classes):
            c_mask = conf_scores[cl].gt(self.conf_thresh)
            scores = conf_scores[cl][c_mask]
            if scores.size(0) == 0:
                continue
            l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes)
            boxes = decoded_boxes[l_mask].view(-1, 4)
            # idx of highest scoring and non-overlapping boxes per class
            ids, count = nms(boxes, scores, self.nms_thresh, self.top_k)
            output[i, cl, :count] = torch.cat(
                (scores[ids[:count]].unsqueeze(1), boxes[ids[:count]]), 1)

    # NOTE: the former cross-class `flt[mask].fill_(0)` step was removed.
    # Boolean-mask indexing returns a copy, so it never modified `output`;
    # the returned values are unchanged and two full sorts are saved.
    return output
def forward(self, loc_data, conf_data, prior_data):
    """Decode box predictions and keep the NMS survivors of each class.

    Args:
        loc_data: (tensor) loc preds, indexed per image as ``loc_data[i]``.
        conf_data: (tensor) conf preds, viewed here as
            ``[batch, num_priors, num_classes]``.
        prior_data: (tensor) prior boxes; ``size(0)`` is taken as the number
            of priors.

    Returns:
        Tensor of shape ``[batch, self.num_classes, self.top_k, 5]``. Filled
        rows are ``(score, box[0..3])``; unused rows stay zero.
    """
    num = loc_data.size(0)  # batch size
    num_priors = prior_data.size(0)
    # Zero-filled result: [batch, classes, top_k, confidence + location].
    output = torch.zeros(num, self.num_classes, self.top_k, 5)
    # Reorder the confidences to [batch, classes, boxes].
    conf_preds = conf_data.view(
        num, num_priors, self.num_classes).transpose(2, 1)

    # Decode predictions into bboxes.
    for i in range(num):
        decoded_boxes = decode(loc_data[i], prior_data, self.variance)
        # For each class, perform nms
        conf_scores = conf_preds[i].clone()
        for cl in range(1, self.num_classes):
            # Drop boxes whose confidence is not above the threshold.
            c_mask = conf_scores[cl].gt(self.conf_thresh)
            scores = conf_scores[cl][c_mask]
            if scores.size(0) == 0:
                continue
            # Select the decoded boxes that passed the threshold.
            l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes)
            boxes = decoded_boxes[l_mask].view(-1, 4)
            # idx of highest scoring and non-overlapping boxes per class
            ids, count = nms(boxes, scores, self.nms_thresh, self.top_k)
            output[i, cl, :count] = torch.cat(
                (scores[ids[:count]].unsqueeze(1), boxes[ids[:count]]), 1)

    # NOTE: the former cross-class `flt[mask].fill_(0)` step was removed.
    # Boolean-mask indexing returns a copy, so it never modified `output`;
    # it also raised on an empty batch because of `view(0, -1, 5)`.
    return output
def forward(self, loc_data, conf_data, prior_data, conf_thresh):
    """Decode box predictions and keep the NMS survivors of each class.

    Args:
        loc_data: (tensor) loc preds, indexed per image as ``loc_data[i]``.
        conf_data: (tensor) conf preds; axes 1 and 2 are swapped so that it
            can be indexed as ``[image][class]``.
        prior_data: (tensor) prior boxes, forwarded to ``decode``.
        conf_thresh: only scores strictly greater than this are considered.

    Returns:
        Tensor of shape ``[batch, self.num_classes, self.top_k, 5]``, moved
        to CUDA when ``loc_data`` is a CUDA tensor. Filled rows are
        ``(score, box[0..3])``; unused rows stay zero.
    """
    batch_size = loc_data.size(0)
    output = torch.zeros(batch_size, self.num_classes, self.top_k, 5)
    if loc_data.is_cuda:
        output = output.cuda()
    conf_preds = conf_data.transpose(2, 1)  # group by classes

    # Decode predictions into bboxes.
    for i in range(batch_size):
        decoded_boxes = decode(loc_data[i], prior_data, self.variance)
        # For each class, perform nms
        conf_scores = conf_preds[i].clone()
        # Unlike sibling implementations, this loop includes class 0.
        for cl in range(self.num_classes):
            c_mask = conf_scores[cl].gt(conf_thresh)
            scores = conf_scores[cl][c_mask]
            # An empty selection is 1-D with zero elements on current
            # PyTorch, so test the element count rather than dim().
            if scores.numel() == 0:
                continue
            l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes)
            boxes = decoded_boxes[l_mask].view(-1, 4)
            # idx of highest scoring and non-overlapping boxes per class
            ids, count = nms(boxes, scores, self.nms_thresh, self.top_k)
            output[i, cl, :count] = torch.cat(
                (scores[ids[:count]].unsqueeze(1), boxes[ids[:count]]), 1)

    # NOTE: the former cross-class `flt[mask].fill_(0)` step was removed.
    # Boolean-mask indexing returns a copy, so it never modified `output`;
    # it also raised on an empty batch because of `view(0, -1, 5)`.
    return output
def predict(self, image, top_k=-1, prob_threshold=None):
    """Run the network on one image and return the NMS-filtered detections.

    Args:
        image: array-like whose ``shape`` unpacks as
            ``(height, width, channels)``; passed through ``self.transform``.
        top_k: forwarded to ``box_utils.nms``.
        prob_threshold: minimum class probability. ``None`` selects
            ``self.filter_threshold``; an explicit ``0.0`` is honoured.

    Returns:
        Tuple ``(boxes, labels, probs)``: a tensor of box rows whose
        coordinates were multiplied by the image width/height, a tensor of
        class indices, and a tensor of probabilities. Three empty tensors are
        returned when nothing passes the threshold.
    """
    cpu_device = torch.device("cpu")
    height, width, _ = image.shape
    image = self.transform(image)
    images = image.unsqueeze(0)
    images = images.to(self.device)
    with torch.no_grad():
        self.timer.start()
        scores, boxes = self.net.forward(images)
        print("Inference time: ", self.timer.end())
    boxes = boxes[0]
    scores = scores[0]
    # `is None` rather than truthiness, so a caller can really ask for 0.0.
    if prob_threshold is None:
        prob_threshold = self.filter_threshold
    # this version of nms is slower on GPU, so we move data to CPU.
    boxes = boxes.to(cpu_device)
    scores = scores.to(cpu_device)

    picked_box_probs = []
    picked_labels = []
    # Class index 0 is skipped.
    for class_index in range(1, scores.size(1)):
        probs = scores[:, class_index]
        mask = probs > prob_threshold
        probs = probs[mask]
        if probs.size(0) == 0:
            continue
        subset_boxes = boxes[mask, :]
        # Rows are [box (4 values), prob].
        box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1)
        box_probs = box_utils.nms(box_probs, self.nms_method,
                                  score_threshold=prob_threshold,
                                  iou_threshold=self.iou_threshold,
                                  sigma=self.sigma,
                                  top_k=top_k,
                                  candidate_size=self.candidate_size)
        picked_box_probs.append(box_probs)
        picked_labels.extend([class_index] * box_probs.size(0))

    if not picked_box_probs:
        return torch.tensor([]), torch.tensor([]), torch.tensor([])

    picked_box_probs = torch.cat(picked_box_probs)
    # Presumably the network emits normalised coordinates -- TODO confirm.
    picked_box_probs[:, 0] *= width
    picked_box_probs[:, 1] *= height
    picked_box_probs[:, 2] *= width
    picked_box_probs[:, 3] *= height
    return (picked_box_probs[:, :4],
            torch.tensor(picked_labels),
            picked_box_probs[:, 4])
def run_first_stage(image, net, scale, threshold):
    """Run P-Net, generate bounding boxes, and do NMS.

    Arguments:
        image: an instance of PIL.Image.
        net: an instance of pytorch's nn.Module, P-Net.
        scale: a float number,
            scale width and height of the image by this number.
        threshold: a float number,
            threshold on the probability of a face when generating
            bounding boxes from predictions of the net.

    Returns:
        a float numpy array of shape [n_boxes, 9],
            bounding boxes with scores and offsets (4 + 1 + 4),
            or None when no bounding box was generated.
    """
    # scale the image and convert it to a float array
    width, height = image.size
    sw, sh = math.ceil(width * scale), math.ceil(height * scale)
    img = image.resize((sw, sh), Image.BILINEAR)
    img = np.asarray(img, 'float32')
    img = torch.FloatTensor(_preprocess(img))

    # `Variable(..., volatile=True)` is ignored by PyTorch >= 0.4;
    # torch.no_grad() is what actually disables autograd for inference.
    with torch.no_grad():
        output = net(img)
    probs = output[1].detach().numpy()[0, 1, :, :]
    offsets = output[0].detach().numpy()
    # probs: probability of a face at each sliding window
    # offsets: transformations to true bounding boxes

    boxes = _generate_bboxes(probs, offsets, scale, threshold)
    if len(boxes) == 0:
        return None

    keep = nms(boxes[:, 0:5], overlap_threshold=0.5)
    return boxes[keep]
def detect_faces(image, min_face_size=20.0,
                 thresholds=[0.6, 0.7, 0.8],
                 nms_thresholds=[0.7, 0.7, 0.7]):
    """Detect faces and landmarks with the three-stage P/R/O-Net cascade.

    Arguments:
        image: an instance of PIL.Image.
        min_face_size: a float number.
        thresholds: a list of length 3, one probability cutoff per stage.
        nms_thresholds: a list of length 3, one NMS threshold per stage.

    Returns:
        two float numpy arrays of shapes [n_boxes, 5] and [n_boxes, 10],
        bounding boxes (4 coordinates + score) and facial landmarks.
        Two empty lists are returned when a stage leaves no candidates.
    """
    with torch.no_grad():
        # LOAD MODELS
        pnet = PNet().to(device)
        rnet = RNet().to(device)
        onet = ONet().to(device)
        onet.eval()

        # BUILD AN IMAGE PYRAMID
        width, height = image.size
        min_length = min(height, width)

        min_detection_size = 12
        factor = 0.707  # sqrt(0.5)

        # scales for scaling the image
        scales = []

        # scales the image so that
        # minimum size that we can detect equals to
        # minimum face size that we want to detect
        m = min_detection_size / min_face_size
        min_length *= m

        factor_count = 0
        while min_length > min_detection_size:
            scales.append(m * factor ** factor_count)
            min_length *= factor
            factor_count += 1

        # STAGE 1

        # run P-Net on different scales and
        # collect boxes (and offsets, and scores) from them
        bounding_boxes = []
        for s in scales:
            boxes = run_first_stage(image, pnet, scale=s,
                                    threshold=thresholds[0])
            if boxes is not None:
                bounding_boxes.append(boxes)

        # np.vstack raises on an empty list, so leave early when P-Net found
        # nothing at any scale (same result as the stage-3 early exit).
        if not bounding_boxes:
            return [], []
        bounding_boxes = np.vstack(bounding_boxes)

        keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0])
        bounding_boxes = bounding_boxes[keep]

        # use offsets predicted by pnet to transform bounding boxes
        bounding_boxes = calibrate_box(bounding_boxes[:, 0:5],
                                       bounding_boxes[:, 5:])
        # shape [n_boxes, 5]

        bounding_boxes = convert_to_square(bounding_boxes)
        bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])

        # STAGE 2

        img_boxes = get_image_boxes(bounding_boxes, image, size=24)
        img_boxes = torch.FloatTensor(img_boxes).to(device)
        output = rnet(img_boxes)
        offsets = output[0].data.cpu().numpy()  # shape [n_boxes, 4]
        probs = output[1].data.cpu().numpy()  # shape [n_boxes, 2]

        keep = np.where(probs[:, 1] > thresholds[1])[0]
        bounding_boxes = bounding_boxes[keep]
        bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,))
        offsets = offsets[keep]

        keep = nms(bounding_boxes, nms_thresholds[1])
        bounding_boxes = bounding_boxes[keep]
        bounding_boxes = calibrate_box(bounding_boxes, offsets[keep])
        bounding_boxes = convert_to_square(bounding_boxes)
        bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])

        # STAGE 3

        img_boxes = get_image_boxes(bounding_boxes, image, size=48)
        if len(img_boxes) == 0:
            return [], []
        img_boxes = torch.FloatTensor(img_boxes).to(device)
        output = onet(img_boxes)
        landmarks = output[0].data.cpu().numpy()  # shape [n_boxes, 10]
        offsets = output[1].data.cpu().numpy()  # shape [n_boxes, 4]
        probs = output[2].data.cpu().numpy()  # shape [n_boxes, 2]

        keep = np.where(probs[:, 1] > thresholds[2])[0]
        bounding_boxes = bounding_boxes[keep]
        bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,))
        offsets = offsets[keep]
        landmarks = landmarks[keep]

        # compute landmark points
        width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0
        height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0
        xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1]
        landmarks[:, 0:5] = (np.expand_dims(xmin, 1)
                             + np.expand_dims(width, 1) * landmarks[:, 0:5])
        landmarks[:, 5:10] = (np.expand_dims(ymin, 1)
                              + np.expand_dims(height, 1) * landmarks[:, 5:10])

        bounding_boxes = calibrate_box(bounding_boxes, offsets)
        keep = nms(bounding_boxes, nms_thresholds[2], mode='min')
        bounding_boxes = bounding_boxes[keep]
        landmarks = landmarks[keep]

        return bounding_boxes, landmarks
def detect_faces(image, min_face_size=20.0,
                 thresholds=[0.6, 0.7, 0.8],
                 nms_thresholds=[0.7, 0.7, 0.7]):
    """Detect faces and landmarks with the three-stage P/R/O-Net cascade.

    Arguments:
        image: an instance of PIL.Image.
        min_face_size: a float number.
        thresholds: a list of length 3, one probability cutoff per stage.
        nms_thresholds: a list of length 3, one NMS threshold per stage.

    Returns:
        two float numpy arrays of shapes [n_boxes, 5] and [n_boxes, 10],
        bounding boxes (4 coordinates + score) and facial landmarks.
        Two empty lists are returned when a stage leaves no candidates.
    """
    # LOAD MODELS
    pnet = PNet()
    rnet = RNet()
    onet = ONet()
    onet.eval()

    # BUILD AN IMAGE PYRAMID
    width, height = image.size
    min_length = min(height, width)

    min_detection_size = 12
    factor = 0.707  # sqrt(0.5)

    # scales for scaling the image
    scales = []

    # scales the image so that
    # minimum size that we can detect equals to
    # minimum face size that we want to detect
    m = min_detection_size / min_face_size
    min_length *= m

    factor_count = 0
    while min_length > min_detection_size:
        scales.append(m * factor ** factor_count)
        min_length *= factor
        factor_count += 1

    # STAGE 1

    # run P-Net on different scales and
    # collect boxes (and offsets, and scores) from them
    bounding_boxes = []
    for s in scales:
        boxes = run_first_stage(image, pnet, scale=s, threshold=thresholds[0])
        if boxes is not None:
            bounding_boxes.append(boxes)

    # np.vstack raises on an empty list, so leave early when P-Net found
    # nothing at any scale (same result as the stage-3 early exit).
    if not bounding_boxes:
        return [], []
    bounding_boxes = np.vstack(bounding_boxes)

    keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0])
    bounding_boxes = bounding_boxes[keep]

    # use offsets predicted by pnet to transform bounding boxes
    bounding_boxes = calibrate_box(bounding_boxes[:, 0:5],
                                   bounding_boxes[:, 5:])
    # shape [n_boxes, 5]

    bounding_boxes = convert_to_square(bounding_boxes)
    bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])

    # STAGE 2

    img_boxes = get_image_boxes(bounding_boxes, image, size=24)
    # `Variable(..., volatile=True)` is ignored by PyTorch >= 0.4;
    # torch.no_grad() is what actually disables autograd for inference.
    with torch.no_grad():
        output = rnet(torch.FloatTensor(img_boxes))
    offsets = output[0].data.numpy()  # shape [n_boxes, 4]
    probs = output[1].data.numpy()  # shape [n_boxes, 2]

    keep = np.where(probs[:, 1] > thresholds[1])[0]
    bounding_boxes = bounding_boxes[keep]
    bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, ))
    offsets = offsets[keep]

    keep = nms(bounding_boxes, nms_thresholds[1])
    bounding_boxes = bounding_boxes[keep]
    bounding_boxes = calibrate_box(bounding_boxes, offsets[keep])
    bounding_boxes = convert_to_square(bounding_boxes)
    bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])

    # STAGE 3

    img_boxes = get_image_boxes(bounding_boxes, image, size=48)
    if len(img_boxes) == 0:
        return [], []
    with torch.no_grad():
        output = onet(torch.FloatTensor(img_boxes))
    landmarks = output[0].data.numpy()  # shape [n_boxes, 10]
    offsets = output[1].data.numpy()  # shape [n_boxes, 4]
    probs = output[2].data.numpy()  # shape [n_boxes, 2]

    keep = np.where(probs[:, 1] > thresholds[2])[0]
    bounding_boxes = bounding_boxes[keep]
    bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, ))
    offsets = offsets[keep]
    landmarks = landmarks[keep]

    # compute landmark points
    width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0
    height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0
    xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1]
    landmarks[:, 0:5] = (np.expand_dims(xmin, 1)
                         + np.expand_dims(width, 1) * landmarks[:, 0:5])
    landmarks[:, 5:10] = (np.expand_dims(ymin, 1)
                          + np.expand_dims(height, 1) * landmarks[:, 5:10])

    bounding_boxes = calibrate_box(bounding_boxes, offsets)
    keep = nms(bounding_boxes, nms_thresholds[2], mode='min')
    bounding_boxes = bounding_boxes[keep]
    landmarks = landmarks[keep]

    return bounding_boxes, landmarks
def _forward_test(self, input):
    """Produce region proposals at test time: RPN, optional clip, then NMS.

    Args:
        input: CNN feature tensor handed to ``self.rpn.forward``.

    Returns:
        Tensor holding the rows of ``rpn_boxes[0]`` selected by NMS. The
        boxes are still in the RPN's (xc, yc, w, h) format; the
        (x1, y1, x2, y2) conversion is only used to run NMS.

    Raises:
        AssertionError: if the image size is unset or was already consumed
            by a previous forward pass.
    """
    cnn_features = input
    arg = easydict.EasyDict({
        'clip_boxes': self.test_clip_boxes,
        'nms_thresh': self.test_nms_thresh,
        'max_proposals': self.test_max_proposals
    })

    # Make sure that setImageSize has been called
    assert self.image_height and self.image_width and not self._called_forward_size, \
        'Must call setImageSize before each forward pass'
    self._called_forward_size = True

    rpn_out, act_reg = self.rpn.forward(cnn_features)
    rpn_boxes, rpn_anchors, rpn_trans, rpn_scores = rpn_out
    num_boxes = rpn_boxes.size(1)

    # Maybe clip boxes to image boundary
    if arg.clip_boxes:
        bounds = {
            'x_min': 1,
            'y_min': 1,
            'x_max': self.image_width,
            'y_max': self.image_height
        }
        rpn_boxes, valid = box_utils.clip_boxes(rpn_boxes, bounds, 'xcycwh')

        # Clamp parallel arrays only to valid boxes (not oob of the image)
        rpn_boxes = self.clamp_data(rpn_boxes, valid)
        rpn_anchors = self.clamp_data(rpn_anchors, valid)
        rpn_trans = self.clamp_data(rpn_trans, valid)
        rpn_scores = self.clamp_data(rpn_scores, valid)
        num_boxes = rpn_boxes.size(1)

    # Convert rpn boxes from (xc, yc, w, h) format to (x1, y1, x2, y2)
    rpn_boxes_x1y1x2y2 = box_utils.xcycwh_to_x1y1x2y2(rpn_boxes[0])

    # Convert objectness positive / negative scores to probabilities
    rpn_scores_exp = torch.exp(rpn_scores)
    pos_exp = rpn_scores_exp[0, :, 0]
    neg_exp = rpn_scores_exp[0, :, 1]
    scores = (pos_exp + neg_exp).pow(-1) * pos_exp

    verbose = False
    if verbose:
        print('in LocalizationLayer forward_test')
        print('Before NMS there are %d boxes' % num_boxes)
        print('Using NMS threshold %f' % arg.nms_thresh)

    # Run NMS and sort by objectness score
    boxes_scores = torch.cat((rpn_boxes_x1y1x2y2, scores.view(-1, 1)), dim=1)
    if arg.max_proposals == -1:
        idx = box_utils.nms(boxes_scores.data, arg.nms_thresh)
    else:
        idx = box_utils.nms(boxes_scores.data, arg.nms_thresh,
                            arg.max_proposals)

    # Index the single batch element explicitly. torch.squeeze() would also
    # drop the box axis when exactly one box is left, so `idx` would then
    # select coordinates instead of rows.
    rpn_boxes_nms = rpn_boxes[0][idx]

    if verbose:
        print('After NMS there are %d boxes' % rpn_boxes_nms.size(0))

    output = rpn_boxes_nms
    return output
priors=priors) sample_np = data_np[0] images_np, targets_np = sample_np loc_data = targets_np[:, :, :4] conf_data = targets_np[:, :, 4:] a = loc_data[0, :, :] decoded_np = decode_np(loc=a, priors=priors, variances=[0.1, 0.2]) a_ = torch.from_numpy(a).float() priors_ = torch.from_numpy(priors).float() decoded_th = decode_th(loc=a_, priors=priors_, variances=[0.1, 0.2]) c = decoded_th.numpy() == decoded_np print(np.sum(c)) scores = np.random.rand(8732, ) scores_ = torch.from_numpy(scores).float() nms_np = non_maximum_supression(boxes=decoded_np, scores=scores, top_k=200, overlap=0.5) nms_th = nms(boxes=decoded_th, scores=scores_, overlap=0.5, top_k=200) print(nms_th[0].numpy()) print(nms_np[0])
def forward(self, arm_loc_data, arm_conf_data, odm_loc_data, odm_conf_data,
            prior_data):
    """Two-step (ARM then ODM) box decoding followed by per-class NMS.

    Args:
        arm_loc_data: (tensor) ARM loc preds, indexed per image.
        arm_conf_data: (tensor) ARM conf preds; softmax is taken over dim 2
            and the entries from index 1 on are used as objectness.
        odm_loc_data: (tensor) ODM loc preds, indexed per image.
        odm_conf_data: (tensor) ODM conf preds; after the softmax over dim 2
            it is viewed as ``[batch, num_priors, num_classes]``.
        prior_data: (tensor) prior boxes; ``size(0)`` is taken as the number
            of priors.

    Returns:
        Tensor of shape ``[batch, self.num_classes, self.top_k, 5]``. Filled
        rows are ``(score, box[0..3])``; unused rows stay zero.
    """
    loc_data = odm_loc_data
    conf_data = F.softmax(odm_conf_data, dim=2)

    # Zero the class scores of priors whose ARM objectness is too low.
    arm_conf_data = F.softmax(arm_conf_data, dim=2)
    arm_object_conf = arm_conf_data.data[:, :, 1:]
    no_object_index = arm_object_conf <= self.objectness_thre
    conf_data[no_object_index.expand_as(conf_data)] = 0

    num = loc_data.size(0)  # batch size
    num_priors = prior_data.size(0)
    output = torch.zeros(num, self.num_classes, self.top_k, 5)
    conf_preds = conf_data.view(
        num, num_priors, self.num_classes).transpose(2, 1)

    # Tensor.cuda() returns a copy, so the old bare `prior_data.cuda()` call
    # had no effect. Put the priors on the device of the predictions they are
    # decoded with.
    prior_data = prior_data.to(arm_loc_data.device)

    # Decode predictions into bboxes.
    for i in range(num):
        # First refine the priors with the ARM offsets, then decode the ODM
        # offsets against those refined anchors.
        default = decode(arm_loc_data[i], prior_data, self.variance)
        default = center_size(default)
        decoded_boxes = decode(loc_data[i], default, self.variance)
        # For each class, perform nms
        conf_scores = conf_preds[i].clone()
        for cl in range(1, self.num_classes):
            c_mask = conf_scores[cl].gt(self.conf_thresh)
            scores = conf_scores[cl][c_mask]
            if scores.size(0) == 0:
                continue
            l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes)
            boxes = decoded_boxes[l_mask].view(-1, 4)
            # idx of highest scoring and non-overlapping boxes per class
            ids, count = nms(boxes, scores, self.nms_thresh, self.top_k)
            if count == 0:
                continue
            # as_tensor converts without the copy-construct warning that
            # torch.tensor(tensor) emits.
            ids = torch.as_tensor(ids, dtype=torch.long)
            output[i, cl, :count] = torch.cat(
                (scores[ids[:count]].view(-1, 1),
                 boxes[ids[:count]].view(-1, 4)), 1)
    return output