def forward_rcnn_batch(self, base_feat, branch, rois, wgt_boxes, wnum_boxes,
                       gt_boxes, num_boxes, im_info, image_classes,
                       output_refine=False):
    """Run the RCNN head of ``branch`` on a batch of proposals.

    In training mode the proposals are first sampled and labelled by
    ``self.RCNN_proposal_target`` and three losses are computed
    (classification, box regression, image-level classification).  In eval
    mode the proposals are used as given and the losses stay ``0`` / ``None``.

    Args:
        base_feat: backbone feature map; ``base_feat.size(0)`` is used as the
            batch size.
        branch: object providing ``RCNN_bbox_pred`` and ``RCNN_cls_score``;
            it is also handed to ``self._head_to_tail``.
        rois: proposals; viewed as ``(-1, 5)`` rows before pooling.
        wgt_boxes: boxes forwarded to the proposal-target layer.  Columns
            ``0:4`` are read as coordinates and column ``4`` as the class
            index when ``output_refine`` is set.
        wnum_boxes, gt_boxes, num_boxes: forwarded unchanged to
            ``self.RCNN_proposal_target``.
        im_info: ``im_info.data`` is passed to ``clip_boxes``.
        image_classes: image-level targets; column 0 is dropped before the
            binary cross-entropy.
        output_refine: when true (and training), also regress refined boxes
            for ``wgt_boxes`` and return them.

    Returns:
        ``((out_rois, cls_prob, bbox_pred, RCNN_loss_cls, RCNN_loss_bbox,
        rois_label, image_loss_cls), wgt_bbox_out)`` where ``wgt_bbox_out``
        is ``None`` unless refinement was requested during training.

    Raises:
        ValueError: if ``cfg.POOLING_MODE`` is not ``'crop'``, ``'align'``
            or ``'pool'``.
    """
    batch_size = base_feat.size(0)

    def _pool(roi_rows):
        # Pool base_feat over the (N, 5) RoI rows using the configured mode.
        if cfg.POOLING_MODE == 'crop':
            grid_xy = _affine_grid_gen(
                roi_rows, base_feat.size()[2:], self.grid_size)
            # The crop layer expects (y, x) ordering, the generator emits (x, y).
            grid_yx = torch.stack(
                [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
                3).contiguous()
            pooled = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                pooled = F.max_pool2d(pooled, 2, 2)
            return pooled
        if cfg.POOLING_MODE == 'align':
            return self.RCNN_roi_align(base_feat, roi_rows)
        if cfg.POOLING_MODE == 'pool':
            return self.RCNN_roi_pool(base_feat, roi_rows)
        raise ValueError(
            'unsupported cfg.POOLING_MODE: %r' % (cfg.POOLING_MODE,))

    # During training, sample proposals and build their regression targets.
    if self.training:
        roi_data = self.RCNN_proposal_target(
            rois, wgt_boxes, wnum_boxes, gt_boxes, num_boxes)
        (out_rois, rois_label, rois_target,
         rois_inside_ws, rois_outside_ws) = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(
            rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(
            rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        out_rois = rois
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None

    out_rois = Variable(out_rois)

    # RoI pooling followed by the branch-specific top model.
    pooled_feat = self._head_to_tail(_pool(out_rois.view(-1, 5)), branch)

    # Box regression.
    bbox_pred = branch.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # Keep only the four regression outputs that belong to each RoI's label.
        bbox_pred_view = bbox_pred.view(
            bbox_pred.size(0), bbox_pred.size(1) // 4, 4)
        gather_index = rois_label.view(rois_label.size(0), 1, 1).expand(
            rois_label.size(0), 1, 4)
        bbox_pred = torch.gather(bbox_pred_view, 1, gather_index).squeeze(1)

    # Classification.
    cls_score = branch.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    image_loss_cls = None

    if self.training:
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        RCNN_loss_bbox = _smooth_l1_loss(
            bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

        # Image-level regularisation: for every non-background class take the
        # maximum RoI probability in the image and compare it with the
        # image-level label.
        rois_prob = cls_prob.view(batch_size, out_rois.size(1), -1)[:, :, 1:]
        rois_attention_prob, _ = torch.max(rois_prob, dim=1)
        image_loss_cls = F.binary_cross_entropy(
            rois_attention_prob, image_classes[:, 1:])

    if self.training:
        cls_prob = cls_prob.view(batch_size, out_rois.size(1), -1)
        bbox_pred = bbox_pred.view(batch_size, out_rois.size(1), -1)
    else:
        cls_prob = cls_prob.view(1, out_rois.size(1), -1)
        bbox_pred = bbox_pred.view(1, out_rois.size(1), -1)

    wgt_bbox_out = None
    if self.training and output_refine:
        # Turn wgt_boxes into RoIs: column 0 is the batch index, 1:5 the box.
        wgt_rois = wgt_boxes.new(wgt_boxes.size()).zero_()
        wgt_rois[:, :, 1:5] = wgt_boxes[:, :, :4]
        for i in range(batch_size):
            # Index only image i; writing to [:, :, 0] would stamp every
            # image with the last batch index.
            wgt_rois[i, :, 0] = i

        gt_pooled_feat = self._head_to_tail(
            _pool(wgt_rois.view(-1, 5)), branch)

        wgt_bbox_delta = branch.RCNN_bbox_pred(gt_pooled_feat)
        # Width of the regression output (4 * number of predicted classes);
        # read from the tensor rather than hard-coding 4 * 21.
        delta_width = wgt_bbox_delta.size(1)
        stds = torch.FloatTensor(
            cfg.TRAIN.BBOX_NORMALIZE_STDS).type_as(wgt_bbox_delta)
        means = torch.FloatTensor(
            cfg.TRAIN.BBOX_NORMALIZE_MEANS).type_as(wgt_bbox_delta)
        wgt_bbox_delta = wgt_bbox_delta.view(-1, 4) * stds + means
        wgt_bbox_delta = wgt_bbox_delta.view(batch_size, -1, delta_width)

        wgt_bbox_out_rois = bbox_transform_inv(
            wgt_boxes, wgt_bbox_delta, batch_size)
        wgt_bbox_out_rois = clip_boxes(
            wgt_bbox_out_rois, im_info.data, batch_size)

        wgt_bbox_out = wgt_boxes.new(wgt_boxes.size()).zero_()
        wgt_cls = Variable(
            wgt_boxes[:, :, 4].data, requires_grad=False).long()
        for i in range(batch_size):
            for j in range(wgt_boxes.size(1)):
                # Class-agnostic heads emit a single box per RoI.
                col = 0 if self.class_agnostic else int(wgt_cls[i, j]) * 4
                wgt_bbox_out[i, j, :4] = wgt_bbox_out_rois[i, j, col:col + 4]
        wgt_bbox_out[:, :, 4] = wgt_boxes[:, :, 4]

        # Boxes whose width and height are both 1 are zeroed in the output.
        wgt_boxes_x = wgt_boxes[:, :, 2] - wgt_boxes[:, :, 0] + 1
        wgt_boxes_y = wgt_boxes[:, :, 3] - wgt_boxes[:, :, 1] + 1
        wgt_area_zero = (wgt_boxes_x == 1) & (wgt_boxes_y == 1)
        wgt_bbox_out.masked_fill_(
            wgt_area_zero.view(batch_size, wgt_area_zero.size(1), 1).expand(
                wgt_boxes.size()),
            0)
        wgt_bbox_out = wgt_bbox_out.detach()

    return (out_rois, cls_prob, bbox_pred, RCNN_loss_cls, RCNN_loss_bbox,
            rois_label, image_loss_cls), wgt_bbox_out
if cfg.TEST.BBOX_REG: # Apply bounding-box regression deltas box_deltas = bbox_pred.data if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: # Optionally normalize targets by a precomputed mean and stdev if args.class_agnostic: box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() box_deltas = box_deltas.view(1, -1, 4) else: box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() box_deltas = box_deltas.view(1, -1, 4 * len(imdb.classes)) pred_boxes = bbox_transform_inv(boxes, box_deltas, 1) pred_boxes = clip_boxes(pred_boxes, im_info.data, 1) else: # Simply repeat the boxes, once for each class pred_boxes = np.tile(boxes, (1, scores.shape[1])) pred_boxes /= data[1][0][2].item() scores = scores.squeeze() pred_boxes = pred_boxes.squeeze() det_toc = time.time() detect_time = det_toc - det_tic misc_tic = time.time() if vis: im = cv2.imread(imdb.image_path_at(i)) im2show = np.copy(im) for j in xrange(1, imdb.num_classes):
def inference(_test_img_path, _check_point, _score_threshold=0.3,
              class_agnostic=False):
    """Detect objects in one image and display the annotated result.

    Builds the network, loads the weights stored under
    ``'model_state_dict'`` in the checkpoint, runs a single forward pass,
    decodes and rescales the boxes, applies per-class score thresholding and
    NMS, draws the survivors with ``draw_target`` and shows the image in an
    OpenCV window (blocking until a key is pressed).

    Args:
        _test_img_path: path of the image to read with ``cv2.imread``.
        _check_point: path of the checkpoint file.
        _score_threshold: minimum class score for a detection to be kept.
        class_agnostic: whether the model regresses one box per RoI instead
            of one box per class.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    # "cuda:0" must not contain a space, otherwise torch.device rejects it.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    fasterRCNN = resnet(cfg.backbone,
                        is_training=False,
                        pretrained=False,
                        class_agnostic=class_agnostic)
    fasterRCNN.create_architecture()
    print("load checkpoint %s" % (_check_point))
    checkpoint = torch.load(_check_point, map_location=device)
    fasterRCNN.load_state_dict(checkpoint['model_state_dict'])
    print('load model successfully!')
    fasterRCNN.eval()
    fasterRCNN.to(device)

    start_time = time.time()
    test_img = cv2.imread(_test_img_path)
    if test_img is None:
        raise FileNotFoundError(
            'cannot read image: {}'.format(_test_img_path))

    net_input, scale = image_preprocess(test_img.copy())
    # The image tensor gets its own storage.  Previously an "im_info" tensor
    # was created with im_data.cuda(), which returns im_data itself, so
    # filling im_info overwrote the image before it reached the network.
    # im_info was never passed to the model, so it is not built here.
    im_data = torch.from_numpy(net_input).float().to(device)

    with torch.no_grad():
        # No ground truth at inference time.
        rois, cls_prob, bbox_pred, _, _, _, _, _ = fasterRCNN(im_data, None)

    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    box_deltas = bbox_pred.data

    if cfg.bbox_normalize_targets_precomputed:
        # Undo the target normalisation applied during training.
        stds = torch.FloatTensor(cfg.bbox_normalize_std).to(device)
        means = torch.FloatTensor(cfg.bbox_normalize_means).to(device)
        box_deltas = box_deltas.view(-1, 4) * stds + means
        delta_width = 4 if class_agnostic else 4 * len(cfg.class_to_ind)
        box_deltas = box_deltas.view(1, -1, delta_width)

    pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
    pred_boxes = clip_boxes(pred_boxes,
                            (im_data.size(2), im_data.size(3)), 1)
    # Map the boxes back to the coordinates of the original image.
    pred_boxes = pred_boxes / scale

    # Drop only the batch axis so a single RoI does not collapse to 1-D.
    scores = scores.squeeze(0)
    pred_boxes = pred_boxes.squeeze(0)

    # Class 0 is skipped (the loop starts at 1).
    for j in range(1, len(cfg.class_to_ind)):
        inds = torch.nonzero(scores[:, j] > _score_threshold).view(-1)
        if inds.numel() == 0:
            continue

        cls_scores = scores[:, j][inds]
        _, order = torch.sort(cls_scores, 0, True)
        if class_agnostic:
            cls_boxes = pred_boxes[inds, :]
        else:
            cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

        cls_dets = cls_boxes[order]
        cls_scores = cls_scores[order]
        keep = nms(cls_dets, cls_scores, cfg.test_nms_threshold)
        keep = keep.view(-1).long()
        # Boxes that survive NMS for the current class.
        cls_dets = cls_dets[keep]
        cls_scores = cls_scores[keep]
        test_img = draw_target(test_img, cls_dets, cls_scores, j)

    end_time = time.time()
    print('detect time:{}s'.format(end_time - start_time))
    cv2.imshow('result', test_img)
    cv2.waitKey(0)
def forward(self, input):
    """Turn RPN outputs into a fixed-size tensor of proposals per image.

    Args:
        input: sequence of four items:
            ``input[0]`` RPN class scores; only the channels from
            ``self._num_anchors`` onward are used (of the
            ``2 * _num_anchors`` channels, the first half are background
            probabilities and the second half foreground probabilities),
            ``input[1]`` RPN box deltas,
            ``input[2]`` image info passed to ``clip_boxes``,
            ``input[3]`` flag selecting the training or test NMS settings.

    Returns:
        Tensor of size ``(batch_size, post_nms_topN, 5)``.  Column 0 holds
        the index of the image within the batch, columns 1:5 the proposal
        coordinates; rows beyond the number of kept proposals stay zero.
    """
    scores = input[0][:, self._num_anchors:, :, :]
    bbox_deltas = input[1]
    im_info = input[2]
    is_training = input[3]

    if is_training:
        pre_nms_topN = cfg.train_rpn_pre_nms_top_N
        post_nms_topN = cfg.train_rpn_post_nms_top_N
    else:
        # NOTE(review): the test branch used the *post*-NMS value for the
        # pre-NMS cut, which looks like a copy-paste slip.  Prefer a
        # dedicated test pre-NMS setting when the config defines one and
        # fall back to the previous behaviour otherwise.
        pre_nms_topN = getattr(cfg, 'test_rpn_pre_nms_top_N',
                               cfg.test_rpn_post_nms_top_N)
        post_nms_topN = cfg.test_rpn_post_nms_top_N
    nms_thresh = cfg.rpn_nms_thresh

    batch_size = bbox_deltas.size(0)

    # One (x1, y1, x2, y2) shift per feature-map cell.
    feat_height, feat_width = scores.size(2), scores.size(3)
    shift_x = np.arange(0, feat_width) * self._feat_stride
    shift_y = np.arange(0, feat_height) * self._feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = torch.from_numpy(
        np.vstack((shift_x.ravel(), shift_y.ravel(),
                   shift_x.ravel(), shift_y.ravel())).transpose())
    shifts = shifts.contiguous().type_as(scores).float()

    A = self._num_anchors
    K = shifts.size(0)

    # Every base anchor shifted to every cell, shared across the batch.
    self._anchors = self._anchors.type_as(scores)
    anchors = self._anchors.view(1, A, 4) + shifts.view(K, 1, 4)
    anchors = anchors.view(1, K * A, 4).expand(batch_size, K * A, 4)

    # Reorder deltas and scores so they line up with the anchors.
    bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous()
    bbox_deltas = bbox_deltas.view(batch_size, -1, 4)
    scores = scores.permute(0, 2, 3, 1).contiguous()
    scores = scores.view(batch_size, -1)

    proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)
    # Clamp proposals that stick out of the image back inside it.
    proposals = clip_boxes(proposals, im_info, batch_size)

    # Sort every image's proposals by score, highest first.
    _, order = torch.sort(scores, 1, True)

    output = scores.new(batch_size, post_nms_topN, 5).zero_()
    for i in range(batch_size):
        order_single = order[i]
        if pre_nms_topN > 0:
            # Keep the pre_nms_topN best-scoring proposals (slicing past the
            # end is a no-op, so no explicit length check is needed).
            order_single = order_single[:pre_nms_topN]

        proposals_single = proposals[i][order_single, :]
        scores_single = scores[i][order_single]

        # NMS, then keep at most post_nms_topN survivors.
        keep_idx_i = nms(proposals_single, scores_single, nms_thresh)
        keep_idx_i = keep_idx_i.long().view(-1)
        if post_nms_topN > 0:
            keep_idx_i = keep_idx_i[:post_nms_topN]
        proposals_single = proposals_single[keep_idx_i, :]

        num_proposal = proposals_single.size(0)
        output[i, :, 0] = i  # index of the image within the batch
        output[i, :num_proposal, 1:] = proposals_single  # box coordinates

    return output
def evalue(check_point,
           cache_path='./result.pkl',
           class_agnostic=False,
           ovthresh=0.5,
           use_07_metric=False):
    """Evaluate a checkpoint on the test set and report per-class AP and mAP.

    If ``cache_path`` does not exist, the model is run over the test set and
    the detections (grouped by class name) are pickled to ``cache_path``.
    Otherwise the cached detections are loaded and the image list is read
    from ``ImageSets/Main/test.txt`` under ``cfg.testset_root_path``.
    Ground truth comes from ``parse_rec`` on each image's annotation XML.

    Args:
        check_point: checkpoint path; weights are read from its
            ``'model_state_dict'`` entry.
        cache_path: pickle file used to store / reuse detections.
        class_agnostic: whether the model regresses one box per RoI instead
            of one box per class.
        ovthresh: IoU threshold above which a detection can match a
            ground-truth box.
        use_07_metric: forwarded to ``voc_ap``.

    Returns:
        The mean AP over all classes except ``'BG'`` (also printed).
    """
    ind_class = {v: k for k, v in cfg.class_to_ind.items()}
    # Detections of every class, keyed by class name.
    class_result_dic = {k: [] for k in cfg.class_to_ind}
    imagenames = []

    if not os.path.exists(cache_path):
        test_set = PASCAL_VOC(cfg.testset_root_path, 'test')
        # The post-processing below handles exactly one image per step
        # (first image name only, views with a leading 1), so the loader
        # must not batch; shuffling serves no purpose for evaluation.
        dataloader = DataLoader(test_set,
                                batch_size=1,
                                shuffle=False,
                                num_workers=4)

        # "cuda:0" must not contain a space.
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        fasterRCNN = resnet(cfg.backbone,
                            is_training=False,
                            pretrained=False,
                            class_agnostic=class_agnostic)
        fasterRCNN.create_architecture()
        print("load checkpoint %s" % (check_point))
        checkpoint = torch.load(check_point, map_location=device)
        fasterRCNN.load_state_dict(checkpoint['model_state_dict'])
        print('load model successfully!')
        fasterRCNN.eval()
        fasterRCNN.to(device)

        stds = torch.FloatTensor(cfg.bbox_normalize_std).to(device)
        means = torch.FloatTensor(cfg.bbox_normalize_means).to(device)
        delta_width = 4 if class_agnostic else 4 * len(cfg.class_to_ind)

        # Run detection over the whole test set.
        for batch_data in tqdm(dataloader):
            with torch.no_grad():
                im_data = batch_data['image'].float().to(device)
                gt_boxes = batch_data['gt_boxes'].float().to(device)
                im_info = batch_data['im_info'].float().to(device)

                image_name = os.path.basename(
                    batch_data['imname'][0]).split('.')[0]
                imagenames.append(image_name)

                rois, cls_prob, bbox_pred, _, _, _, _, _ = fasterRCNN(
                    im_data, gt_boxes)

                scores = cls_prob.data
                boxes = rois.data[:, :, 1:5]
                box_deltas = bbox_pred.data

                if cfg.bbox_normalize_targets_precomputed:
                    # Undo the target normalisation applied during training.
                    box_deltas = box_deltas.view(-1, 4) * stds + means
                    box_deltas = box_deltas.view(1, -1, delta_width)

                pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
                pred_boxes = clip_boxes(pred_boxes, im_info, 1)
                # im_info[0, 2] is used as the resize factor, as before.
                pred_boxes = pred_boxes / im_info[0, 2]

                # Drop only the batch axis so a single RoI stays 2-D.
                scores = scores.squeeze(0)
                pred_boxes = pred_boxes.squeeze(0)

                # Class 0 is skipped (the loop starts at 1).
                for j in range(1, len(cfg.class_to_ind)):
                    inds = torch.nonzero(scores[:, j] > 0).view(-1)
                    if inds.numel() == 0:
                        continue

                    cls_scores = scores[:, j][inds]
                    _, order = torch.sort(cls_scores, 0, True)
                    if class_agnostic:
                        cls_boxes = pred_boxes[inds, :]
                    else:
                        cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

                    # Reorder this class's boxes, not the full box tensor.
                    cls_dets = cls_boxes[order]
                    cls_scores = cls_scores[order]
                    keep = nms(cls_dets, cls_scores, cfg.test_nms_threshold)
                    keep = keep.view(-1).long()
                    # Boxes that survive NMS for the current class.
                    cls_dets = cls_dets[keep]
                    cls_scores = cls_scores[keep]

                    # Store plain Python numbers so the cache can be loaded
                    # and converted to NumPy without a GPU.
                    for score, bbox in zip(cls_scores.tolist(),
                                           cls_dets.tolist()):
                        class_result_dic[ind_class[j]].append({
                            'image_name': image_name,
                            'score': score,
                            'bbox': bbox[:4]
                        })

        print('writting result cache ......')
        with open(cache_path, 'wb') as fp:
            pickle.dump(class_result_dic, fp)
    else:
        with open(
                os.path.join(cfg.testset_root_path, 'ImageSets', 'Main',
                             'test.txt')) as fp:
            for line in fp:
                imagenames.append(line.strip())
        # NOTE: pickle.load executes arbitrary code from the file; only point
        # cache_path at caches this function wrote itself.
        with open(cache_path, 'rb') as fp:
            class_result_dic = pickle.load(fp)

    print('computer mAP... ')

    # Ground-truth annotations of every evaluated image.
    recs = {}
    for imagename in imagenames:
        recs[imagename] = parse_rec(
            os.path.join(cfg.testset_root_path, 'Annotations',
                         imagename + '.xml'))

    mAP = 0
    for classname in cfg.class_to_ind:
        if classname == 'BG':
            continue
        print(classname, end=' ')

        # Ground-truth objects of this class, per image.
        class_recs = {}
        npos = 0
        for imagename in imagenames:
            R = [obj for obj in recs[imagename] if obj['name'] == classname]
            bbox = np.array([x['bbox'] for x in R])
            # np.bool was removed from NumPy; the builtin is equivalent.
            difficult = np.array([x['difficult'] for x in R]).astype(bool)
            det = [False] * len(R)
            npos += int(np.sum(~difficult))
            class_recs[imagename] = {
                'bbox': bbox,
                'difficult': difficult,
                'det': det
            }

        class_result = class_result_dic[classname]
        image_ids = [r['image_name'] for r in class_result]
        confidence = np.array([float(r['score']) for r in class_result])
        # reshape keeps BB two-dimensional when a class has no detections.
        BB = np.array([[float(v) for v in r['bbox']] for r in class_result],
                      dtype=float).reshape(-1, 4)

        # Sort detections by decreasing confidence.
        sorted_ind = np.argsort(-confidence)
        BB = BB[sorted_ind, :]
        image_ids = [image_ids[x] for x in sorted_ind]

        # Walk down the detections and mark true / false positives.
        nd = len(image_ids)
        tp = np.zeros(nd)
        fp = np.zeros(nd)
        for d in range(nd):
            R = class_recs[image_ids[d]]
            bb = BB[d, :]
            ovmax = -np.inf
            BBGT = R['bbox'].astype(float)

            if BBGT.size > 0:
                # Intersection.
                ixmin = np.maximum(BBGT[:, 0], bb[0])
                iymin = np.maximum(BBGT[:, 1], bb[1])
                ixmax = np.minimum(BBGT[:, 2], bb[2])
                iymax = np.minimum(BBGT[:, 3], bb[3])
                iw = np.maximum(ixmax - ixmin + 1., 0.)
                ih = np.maximum(iymax - iymin + 1., 0.)
                inters = iw * ih

                # Union.
                uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                       (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                       (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

                overlaps = inters / uni
                ovmax = np.max(overlaps)
                jmax = np.argmax(overlaps)

            if ovmax > ovthresh:
                # Matches on "difficult" objects count as neither TP nor FP.
                if not R['difficult'][jmax]:
                    if not R['det'][jmax]:
                        tp[d] = 1.
                        R['det'][jmax] = 1
                    else:
                        # Duplicate detection of an already matched object.
                        fp[d] = 1.
            else:
                fp[d] = 1.

        # Precision / recall curves.
        fp = np.cumsum(fp)
        tp = np.cumsum(tp)
        eps = np.finfo(np.float64).eps
        # Guard against classes without any non-difficult ground truth.
        rec = tp / np.maximum(float(npos), eps)
        # Avoid dividing by zero when the first detection matches a
        # difficult ground-truth box.
        prec = tp / np.maximum(tp + fp, eps)
        ap = voc_ap(rec, prec, use_07_metric)
        print(ap)
        mAP += ap

    mAP = mAP / (len(cfg.class_to_ind) - 1)
    print('mAP:', mAP)
    return mAP