def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (input_height, input_width).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_boxes = self._get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')

    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]

    cls_targets[max_ious < 0.4] = 0
    ignore = (max_ious >= 0.4) & (max_ious < 0.5)  # ignore ious in [0.4, 0.5)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    return loc_targets, cls_targets
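# For reference, a minimal sketch of the inverse transform (a hypothetical
# `decode`, not part of the source), assuming the same layout as the encoder
# above: anchors in (cx, cy, w, h) order and loc_preds holding (tx, ty, tw, th).
import torch

def decode_sketch(loc_preds, anchor_boxes):
    xy = loc_preds[:, :2] * anchor_boxes[:, 2:] + anchor_boxes[:, :2]  # tx,ty -> cx,cy
    wh = loc_preds[:, 2:].exp() * anchor_boxes[:, 2:]                  # tw,th -> w,h
    return torch.cat([xy - wh / 2, xy + wh / 2], 1)  # back to (xmin,ymin,xmax,ymax)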
def encode(self, boxes, labels, input_size):
    if isinstance(input_size, int):
        input_size = torch.Tensor([input_size, input_size])
    else:
        input_size = torch.Tensor(input_size)
    anchor_boxes = self.get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')
    boxes = boxes.float()
    # ious: IoU of every anchor against every ground-truth box;
    # rows index anchors, columns index ground-truth boxes.
    ious = box_iou(anchor_boxes, boxes, order='xywh')
    # max_ids: index of the best-matching ground-truth box for each anchor;
    # max_ious: the corresponding IoU score.
    max_ious, max_ids = ious.max(1)
    # pick the matched ground-truth box for each anchor
    boxes = boxes[max_ids]
    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]
    cls_targets[max_ious < 0.1] = 0
    cls_targets[(max_ious >= 0.1) & (max_ious < 0.3)] = -1
    return loc_targets, cls_targets
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax) in range [0,1], sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int) model input size.

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#total_anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#total_anchors].
    '''
    anchor_boxes = self._get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')
    boxes = boxes * input_size  # scale to range [0, input_size]

    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]

    cls_targets[max_ious < 0.4] = 0
    ignore = (max_ious >= 0.4) & (max_ious < 0.5)  # ignore ious in [0.4, 0.5)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    return loc_targets, cls_targets
def encode(self, boxes, labels, input_size): """We obey the Faster RCNN box coder: tx = (x - anchor_x) / anchor_w ty = (y - anchor_y) / anchor_h tw = log(w / anchor_w) th = log(h / anchor_h) args: boxes:Tensor(xmin, ymin, xmax, ymax) size(boxes_num, 4) labels:Tensor size(boxes_num,) return: target_cls:Tensor(anchor_num,) target_loc:Tensor(anchor_num, 4) """ anchor_boxes = self._get_anchor_boxes(input_size) # [anchor_num, 4] boxes = utils.change_box_order(boxes, 'xyxy2xywh') ious = utils.box_iou(anchor_boxes, boxes, order='xywh') # [anchor_num, boxes_num] max_ious, max_ids = ious.max(1) # (anchor_num,) boxes = boxes[max_ids] # (anchor_num, 4), groundtruth loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:] loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:]) target_loc = torch.cat([loc_xy, loc_wh], 1) target_cls = labels[max_ids] target_cls[max_ious < 0.5] = 0 ignore = (max_ious < 0.5) & (max_ious >= 0.4) target_cls[ignore] = -1 return target_loc, target_cls
def estimateTrackedBbox(self, z):
    """
    Handles all kinds of estimation:
        unmatched detection (new detection)
        unmatched tracking (no confident detections)
        matched detection (general tracking)
    """
    z = np.expand_dims(z, axis=0).T
    if self.box == [] or self.x_state == []:  # unmatched detection
        self.x_state = np.array([[z[0], 0, z[1], 0, z[2], 0, z[3], 0]]).T
        self.predict_only()
        self.missed_dets = 0
    else:
        iou = box_iou(self.box, z)
        if iou > self.iou_thr:  # matched tracking
            self.kalman_filter(z)
            self.missed_dets = 0
        else:  # unmatched tracking
            if self.missed_dets > self.max_age:
                return False
            self.missed_dets += 1
            self.predict_only()
    xx = self.x_state.T[0].tolist()
    self.box = [xx[0], xx[2], xx[4], xx[6]]
    return True
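# The tracker above implies a constant-velocity Kalman model over the
# interleaved state [x1, vx1, y1, vy1, x2, vx2, y2, vy2]. A minimal sketch of
# the transition/measurement matrices such a kalman_filter/predict_only pair
# would use; dt and the block structure are assumptions, not from the source:
import numpy as np

dt = 1.0  # assumed frame interval
F = np.kron(np.eye(4), np.array([[1., dt], [0., 1.]]))  # 8x8: one constant-velocity block per coordinate
H = np.kron(np.eye(4), np.array([[1., 0.]]))            # 4x8: observe the four positions only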
def find_best_pred(gt_boxes, pred_boxes):
    '''Find whether there is a predicted box for each ground box.

    Args:
      gt_boxes: (FloatTensor) [N, 6] zyxzyx
      pred_boxes: (FloatTensor) [M, 6] zyxzyx

    Returns:
      count: (ndarray) (tp, fn, fp)
    '''
    tp = 0
    fn = 0
    fp = 0
    distance = box_distance(gt_boxes, pred_boxes)
    iou = box_iou(gt_boxes, pred_boxes)
    min_dists, min_ids = distance.min(1)
    best_ious, best_ids = iou.max(0)  # find the best gt for each prediction
    gt_boxes = change_box_order(gt_boxes, order="zyxzyx2zyxdhw")
    for i in range(gt_boxes.size(0)):
        gt = gt_boxes[i, :]
        diameter = math.sqrt(gt[3]**2 + gt[4]**2 + gt[5]**2)
        radius = diameter / 2 + 10.
        if min_dists[i] <= radius:
            tp += 1
        else:
            fn += 1
    fp = pred_boxes.size(0) - tp
    return np.array([tp, fn, fp]), best_ious
def loop_body(b, ignore_mask):
    true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                               object_mask_bool[b, ..., 0])
    iou = box_iou(pred_box[b], true_box)
    best_iou = K.max(iou, axis=-1)
    ignore_mask = ignore_mask.write(
        b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
    return b + 1, ignore_mask
def loop_body(b, ignore_mask):
    true_box = tf.boolean_mask(y_true[i][b, ..., 0:4],
                               object_mask_bool[b, ..., 0])
    iou = box_iou(pred_box[b], true_box)
    best_iou = tf.keras.backend.max(iou, axis=-1)
    ignore_mask = ignore_mask.write(
        b, tf.cast(best_iou < ignore_thresh, true_box.dtype))
    return b + 1, ignore_mask
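# Both loop_body variants above are the body of a per-image batch loop. A
# minimal sketch of the surrounding driver, assuming `m` holds the (dynamic)
# batch size, following the common keras-yolo3 loss pattern:
ignore_mask = tf.TensorArray(y_true[i].dtype, size=1, dynamic_size=True)
_, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body, [0, ignore_mask])
ignore_mask = ignore_mask.stack()              # [batch, grid_h, grid_w, anchors]
ignore_mask = tf.expand_dims(ignore_mask, -1)  # add a channel dim for broadcasting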
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels into YOLOv2 format.

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax) in range [0,1], sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int) model input size.

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [5,4,fmsize,fmsize].
      cls_targets: (tensor) encoded class labels, sized [5,20,fmsize,fmsize].
      box_targets: (tensor) truth boxes, sized [#obj,4].
    '''
    num_boxes = len(boxes)
    # input_size -> fmsize
    # 320->10, 352->11, 384->12, 416->13, ..., 608->19
    fmsize = (input_size - 320) // 32 + 10  # integer division keeps fmsize an int
    grid_size = input_size / fmsize

    boxes *= input_size  # scale [0,1] -> [0,input_size]
    bx = (boxes[:, 0] + boxes[:, 2]) * 0.5 / grid_size  # in [0,fmsize]
    by = (boxes[:, 1] + boxes[:, 3]) * 0.5 / grid_size  # in [0,fmsize]
    bw = (boxes[:, 2] - boxes[:, 0]) / grid_size        # in [0,fmsize]
    bh = (boxes[:, 3] - boxes[:, 1]) / grid_size        # in [0,fmsize]

    tx = bx - bx.floor()
    ty = by - by.floor()

    xy = meshgrid(fmsize, swap_dims=True) + 0.5  # grid center, [fmsize*fmsize,2]
    wh = torch.Tensor(self.anchors)  # [5,2]

    xy = xy.view(fmsize, fmsize, 1, 2).expand(fmsize, fmsize, 5, 2)
    wh = wh.view(1, 1, 5, 2).expand(fmsize, fmsize, 5, 2)
    anchor_boxes = torch.cat([xy - wh / 2, xy + wh / 2], 3)  # [fmsize,fmsize,5,4]

    ious = box_iou(anchor_boxes.view(-1, 4), boxes / grid_size)  # [fmsize*fmsize*5,N]
    ious = ious.view(fmsize, fmsize, 5, num_boxes)  # [fmsize,fmsize,5,N]

    loc_targets = torch.zeros(5, 4, fmsize, fmsize)  # 5 boxes * 4 coords
    cls_targets = torch.zeros(5, 20, fmsize, fmsize)
    for i in range(num_boxes):
        cx = int(bx[i])
        cy = int(by[i])
        _, max_idx = ious[cy, cx, :, i].max(0)
        j = max_idx.item()
        cls_targets[j, labels[i], cy, cx] = 1

        tw = bw[i] / self.anchors[j][0]
        th = bh[i] / self.anchors[j][1]
        loc_targets[j, :, cy, cx] = torch.Tensor([tx[i], ty[i], tw, th])
    return loc_targets, cls_targets, boxes / grid_size
def encode(self, gt_quad_boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    TextBoxes++ quad_box encoder:
        tx_n = (x_n - anchor_x) / anchor_w
        ty_n = (y_n - anchor_y) / anchor_h

    Args:
      gt_quad_boxes: (tensor) bounding boxes of (xyxyxyxy), sized [#obj, 8].
      labels: (tensor) object class labels, sized [#obj, ].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,8].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_rect_boxes = self._get_anchor_boxes(input_size)  # (num_anchors, 4)
    anchor_quad_boxes = change_box_order(anchor_rect_boxes, "xywh2quad")  # (num_anchors, 8)

    gt_rect_boxes = change_box_order(gt_quad_boxes, "quad2xyxy")
    ious = box_iou(anchor_rect_boxes, gt_rect_boxes)
    max_ious, max_ids = ious.max(1)  # match each anchor to the gt box with the largest iou

    gt_quad_boxes = gt_quad_boxes[max_ids]  # (num_anchors, 8)
    gt_rect_boxes = gt_rect_boxes[max_ids]  # (num_anchors, 4)

    # for rectangle boxes -> used in TextBoxes
    # gt_rect_boxes = change_box_order(gt_rect_boxes, "xyxy2xywh")
    # loc_rect_yx = (gt_rect_boxes[:, :2] - anchor_rect_boxes[:, :2]) / anchor_rect_boxes[:, 2:]
    # loc_rect_hw = torch.log(gt_rect_boxes[:, 2:] / anchor_rect_boxes[:, 2:])

    # for quad boxes -> used in TextBoxes++
    anchor_boxes_hw = anchor_rect_boxes[:, 2:4].repeat(1, 4)
    loc_quad_yx = (gt_quad_boxes - anchor_quad_boxes) / anchor_boxes_hw

    # loc_targets = torch.cat([loc_rect_yx, loc_rect_hw, loc_quad_yx], dim=1)  # (num_anchors, 12)
    loc_targets = loc_quad_yx
    cls_targets = labels[max_ids]

    cls_targets[max_ious < 0.5] = -1  # ignore     (0.4~0.5): -1
    cls_targets[max_ious < 0.4] = 0   # background (0.0~0.4):  0
                                      # positive   (0.5~1.0):  1
    return loc_targets, cls_targets
def encode(self, labels, boxes, input_size=None, test=False):
    '''Encode the objects parsed from an XML annotation into bounding-box
    regression targets:
        tx = (x - anchor_x) / anchor_w
        ty = (y - anchor_y) / anchor_h
        tw = log(w / anchor_w)
        th = log(h / anchor_h)
    Note: this method takes the objects of a single image, so it must be
    called one image at a time.

    args:
        labels: tensor, label of each ground-truth box, sized [#box,]
        boxes: tensor, ground truth bounding boxes,
            (xmin, ymin, xmax, ymax), sized [#box, 4]
        input_size: int/tuple, input image size
        test: used at test time
    returns:
        cls_targets: tensor, label assigned to each anchor, sized [#anchors,],
            where 0 is the background class, 1-k are the k object classes,
            and -1 marks ignored anchors
        loc_targets: tensor, regression target assigned to each anchor, sized
            [#anchors, 4], where #anchors counts the anchors over all
            feature maps
    '''
    if input_size is None:
        input_size = self.input_size
        anchor_boxes = self.anchor_boxes
    else:
        if len(input_size) != 2:
            raise ValueError("the TCT input_size is not a fixed 1920x1200, "
                             "so it cannot be None")
        input_size = torch.tensor(input_size, dtype=torch.float)
        anchor_boxes = self._get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')
    # compute the IoU between every anchor and every ground-truth box,
    # which drives the label assignment
    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]
    if test:
        _, orders = max_ious.sort(0, True)
        loc_targets = change_box_order(anchor_boxes, 'xywh2xyxy')[orders]
    else:
        # compute the offsets, i.e. the bounding-box regression targets
        loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
        loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
        loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]  # +1 leaves 0 free for the background class
    # assign the background class and the ignored anchors
    cls_targets[max_ious < self.iou_thre] = 0
    ignore = (max_ious > self.ignore_thres[0]) & \
        (max_ious < self.ignore_thres[1])
    cls_targets[ignore] = -1  # these anchors are not used
    if test:
        cls_targets = cls_targets[orders]
    return cls_targets, loc_targets
def random_crop(self, img, boxes, labels):
    '''Randomly crop the image and adjust the bbox locations.

    Args:
      img: (PIL.Image) image.
      boxes: (tensor) bbox locations, sized [#obj, 4].
      labels: (tensor) bbox labels, sized [#obj,].

    Returns:
      img: (PIL.Image) cropped image.
      selected_boxes: (tensor) selected bbox locations.
      labels: (tensor) selected bbox labels.
    '''
    imw, imh = img.size
    while True:
        min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])
        if min_iou is None:
            return img, boxes, labels

        for _ in range(100):
            w = random.randrange(int(0.1 * imw), imw)
            h = random.randrange(int(0.1 * imh), imh)

            if h > 2 * w or w > 2 * h:
                continue

            x = random.randrange(imw - w)
            y = random.randrange(imh - h)
            roi = torch.Tensor([[x, y, x + w, y + h]])

            center = (boxes[:, :2] + boxes[:, 2:]) / 2  # [N,2]
            roi2 = roi.expand(len(center), 4)  # [N,4]
            mask = (center > roi2[:, :2]) & (center < roi2[:, 2:])  # [N,2]
            mask = mask[:, 0] & mask[:, 1]  # [N,]
            if not mask.any():
                continue

            selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1))

            ious = box_iou(selected_boxes, roi)
            if ious.min() < min_iou:
                continue

            img = img.crop((x, y, x + w, y + h))
            selected_boxes[:, 0].add_(-x).clamp_(min=0, max=w)
            selected_boxes[:, 1].add_(-y).clamp_(min=0, max=h)
            selected_boxes[:, 2].add_(-x).clamp_(min=0, max=w)
            selected_boxes[:, 3].add_(-y).clamp_(min=0, max=h)
            return img, selected_boxes, labels[mask]
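# Hedged usage sketch for random_crop, assuming it lives on a Dataset class
# whose __getitem__ holds a PIL image plus box/label tensors (names assumed):
#
#     img, boxes, labels = self.random_crop(img, boxes, labels)
#     img = img.resize((300, 300))  # then resize/encode as the pipeline requires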
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    input_size = torch.tensor([input_size, input_size], dtype=torch.float32) if isinstance(input_size, int) \
        else torch.tensor(input_size, dtype=torch.float32)
    anchor_boxes = self._get_anchor_boxes(input_size)

    if boxes.numel() == 0:
        # no objects: every anchor is background (class 0)
        cls_targets = torch.zeros(anchor_boxes.size(0), dtype=torch.int64)
        loc_targets = torch.zeros_like(anchor_boxes, dtype=torch.float32)
    else:
        boxes = change_box_order(boxes, 'xyxy2xywh')
        ious = box_iou(anchor_boxes, boxes, order='xywh')
        max_ious, max_ids = ious.max(1)
        boxes = boxes[max_ids]

        loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
        loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
        loc_targets = torch.cat([loc_xy, loc_wh], 1)
        cls_targets = labels[max_ids]

        cls_targets[max_ious < 0.5] = 0  # 0 is background class
        ignore = (max_ious > 0.4) & (max_ious < 0.5)  # ignore ious between (0.4, 0.5)
        cls_targets[ignore] = -1  # for now just mark ignored to -1
    return loc_targets, cls_targets
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
    '''
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_boxes = self._get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')

    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]

    cls_targets[max_ious < 0.5] = 0
    ignore = (max_ious > 0.4) & (max_ious < 0.5)  # ignore ious between (0.4, 0.5)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    return cls_targets, loc_targets
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)
    Then we scale [tx,ty,tw,th] by [10,10,5,5] times to make loc_loss larger.

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (input_height, input_width).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].

    Reference:
      https://github.com/tensorflow/models/blob/master/object_detection/box_coders/faster_rcnn_box_coder.py
    '''
    scale_factor = torch.Tensor([10, 10, 5, 5])  # scale [tx,ty,tw,th]
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_boxes = self._get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')

    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1) * scale_factor
    cls_targets = 1 + labels[max_ids]

    cls_targets[max_ious < 0.4] = 0
    ignore = (max_ious >= 0.4) & (max_ious < 0.5)  # ignore ious in [0.4, 0.5)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    return loc_targets, cls_targets
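# Because the targets above are scaled, any matching decode must divide the
# network outputs by the same factors before inverting the coder. A minimal
# sketch (variable names assumed, not from the source):
loc_preds = loc_preds / torch.Tensor([10, 10, 5, 5])  # undo the [10,10,5,5] scaling
xy = loc_preds[:, :2] * anchor_boxes[:, 2:] + anchor_boxes[:, :2]
wh = loc_preds[:, 2:].exp() * anchor_boxes[:, 2:]
boxes = torch.cat([xy - wh / 2, xy + wh / 2], 1)  # (xmin, ymin, xmax, ymax)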
def statistics_result(pred_boxes, label_boxes, iou_thresh=0.5):
    correct_num = 0
    error_num = 0
    miss_num = 0
    for pbox in pred_boxes:
        is_exist = False
        for lbox in label_boxes:
            if lbox[4] == 0:  # this label box has already been matched
                continue
            iou = box_iou(pbox, lbox)
            if iou > iou_thresh:
                is_exist = True
                lbox[4] = 0  # mark the label box as matched
                break
        if is_exist:
            correct_num += 1
        else:
            error_num += 1
    miss_num = len(label_boxes) - correct_num
    return correct_num, error_num, miss_num
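# Hypothetical usage: turn the returned counts into precision/recall. The
# max(..., 1) guards are assumptions to avoid division by zero.
correct, error, miss = statistics_result(pred_boxes, label_boxes, iou_thresh=0.5)
precision = correct / max(correct + error, 1)
recall = correct / max(correct + miss, 1)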
def encode(self, boxes, labels, input_size):
    if isinstance(input_size, int):
        input_size = torch.Tensor([input_size, input_size])
    else:
        input_size = torch.Tensor(input_size)
    anchor_boxes = self.get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')
    boxes = boxes.float()
    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]
    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]
    cls_targets[max_ious < 0.4] = 0
    cls_targets[(max_ious >= 0.4) & (max_ious < 0.5)] = -1
    return loc_targets, cls_targets
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_boxes = self._get_anchor_boxes(input_size)  # e.g. [49104, 4]
    boxes = change_box_order(boxes, 'xyxy2xywh')

    ious = box_iou(anchor_boxes, boxes, order='xywh')  # [#anchors, #obj]
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]  # [#anchors]

    cls_targets[max_ious < 0.5] = 0
    ignore = (max_ious > 0.4) & (max_ious < 0.5)  # ignore ious between (0.4, 0.5)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    return loc_targets, cls_targets
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_boxes = self._get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')

    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]

    cls_targets[max_ious < 0.5] = 0
    ignore = (max_ious > 0.4) & (max_ious < 0.5)  # ignore ious between (0.4, 0.5)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    return loc_targets, cls_targets
def encode(self, boxes, labels, input_size): """ Encode target bounding boxes and class labels. we obey the Faster RCNN box coder: tx = (x - anchor_x) / anchor_w ty = (y - anchor_y) / anchor_h tw = log(w / anchor_w) th = log(h / anchor_h) :param boxes: (tensor) bounding boxes of (xmin, ymin, xmax, ymax), sized [#obj, 4]. :param labels: (tensor) object class labels, sized [#obj,]. :param input_size: (int/tuple) input size of the original image :return: loc_targets: (tensor) encoded bounding boxes, sized [#anchors, 4]. cls_targets: (tensor) encoded class labels, sized [#anchors,]. """ input_size = torch.Tensor([input_size, input_size]) if isinstance( input_size, int) else torch.Tensor(input_size) anchor_boxes = self._get_anchor_boxes(input_size) boxes = change_box_order(boxes, 'xyxy2xywh') ious = box_iou(anchor_boxes, boxes, order='xywh') max_ious, max_ids = ious.max(1) boxes = boxes[max_ids] loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:] loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:]) loc_targets = torch.cat([loc_xy, loc_wh], 1) loc_targets = loc_targets / torch.Tensor([[0.1, 0.1, 0.2, 0.2]]) cls_targets = labels[max_ids] cls_targets[max_ious < 0.4] = 0 ignore = (max_ious >= 0.4) & (max_ious < 0.5 ) # ignore ious between [0:q.4, 0.5] cls_targets[ignore] = -1 return loc_targets, cls_targets
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    NEG = 10
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_boxes = self._get_anchor_boxes(input_size)
    boxes = change_box_order(boxes, 'xyxy2xywh')

    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)

    # here we set the positive:negative sampling ratio to 1:3
    cls_targets = 1 + labels[max_ids]  # class = label + 1; start with everything positive
    cls_targets[max_ious < 0.1] = 0
    ignore = (max_ious > 0.05) & (max_ious < 0.1)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    '''
    # alternative: random negative subsampling, keeping at most NEG negatives
    # per positive and marking the rest as ignored (-1)
    cls_targets[max_ious < 0.1] = 0
    pos = cls_targets > 0
    n_pos = pos.data.float().sum().item()
    n_neg = int(NEG * n_pos) if n_pos != 0 else NEG
    max_ious = max_ious.numpy().astype(np.float)
    neg_index = np.where(max_ious < 0.1)[0]
    if neg_index.shape[0] > n_neg:
        disable_index = np.random.choice(
            neg_index, size=(len(neg_index) - n_neg), replace=False)
        cls_targets[disable_index] = -1
    '''
    return loc_targets, cls_targets
def target_label_generate(labels, gta, mc):
    """Generate the target label matrix."""
    RF = mc.receptive_field
    (W, H) = (mc.IMAGE_WIDTH, mc.IMAGE_HEIGHT)
    Anchors = mc.Anchors
    target_matrix = np.zeros((W // RF, H // RF, Anchors))

    # load anchor boxes
    anchor_box = mc.Anchor_box
    # only valid anchors can be kept
    bbox_xy = utils.bboxtransform(anchor_box)
    _allowed_border = mc._allowed_border
    inds_inside = np.where((bbox_xy[:, 0] >= -_allowed_border) &
                           (bbox_xy[:, 1] >= -_allowed_border) &
                           (bbox_xy[:, 2] < W + _allowed_border) &  # width
                           (bbox_xy[:, 3] < H + _allowed_border)    # height
                           )[0]
    # an anchor is outside as soon as any coordinate crosses the border
    out_inside = np.where((bbox_xy[:, 0] < -_allowed_border) |
                          (bbox_xy[:, 1] < -_allowed_border) |
                          (bbox_xy[:, 2] >= W + _allowed_border) |  # width
                          (bbox_xy[:, 3] >= H + _allowed_border)    # height
                          )[0]
    valid_anchors = anchor_box[inds_inside]

    anchors = utils.coord2box(valid_anchors)
    groundtruth = utils.coord2box(gta)
    num_of_anchors = len(anchors)
    num_of_gta = len(groundtruth)
    overlaps_table = np.zeros((num_of_anchors, num_of_gta))
    for i in range(num_of_anchors):
        for j in range(num_of_gta):
            overlaps_table[i, j] = utils.box_iou(anchors[i], groundtruth[j])

    # argmax overlaps for each groundtruth
    gt_argmax_overlaps = overlaps_table.argmax(axis=0)
    argmax_overlaps = overlaps_table.argmax(axis=1)
    # max overlap of each groundtruth
    gt_max_overlaps = overlaps_table[gt_argmax_overlaps,
                                     np.arange(overlaps_table.shape[1])]
    gt_argmax_overlaps = np.where(overlaps_table == gt_max_overlaps)[0]
    # used to select positive / negative / don't-care samples
    max_overlaps = overlaps_table[np.arange(len(valid_anchors)), argmax_overlaps]
    target_labels = pick_samples(max_overlaps, gt_argmax_overlaps, mc)
    # subsampling; the default method is random sampling
    target_labels = subsampling(target_labels, mc)
    # bbox delta labels
    target_delta, bbox_in_w, bbox_out_w = target_bbox(out_inside, valid_anchors,
                                                      gta[argmax_overlaps, :],
                                                      target_labels, mc)
    # unmap to the original feature map
    num_anchor_box_per_grid = mc.Anchors
    total_anchors = num_anchor_box_per_grid * (H // RF) * (W // RF)
    labels = unmap2original(target_labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = unmap2original(target_delta, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = unmap2original(bbox_in_w, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = unmap2original(bbox_out_w, total_anchors, inds_inside, fill=0)

    rpn_labels = labels.reshape(
        (mc.IMAGE_HEIGHT // RF, mc.IMAGE_WIDTH // RF, mc.Anchors))
    rpn_bbox_targets = bbox_targets.reshape(
        (mc.IMAGE_HEIGHT // RF, mc.IMAGE_WIDTH // RF, mc.Anchors * 4))
    rpn_bbox_inside_weights = bbox_inside_weights.reshape(
        (mc.IMAGE_HEIGHT // RF, mc.IMAGE_WIDTH // RF, mc.Anchors * 4))
    rpn_bbox_outside_weights = bbox_outside_weights.reshape(
        (mc.IMAGE_HEIGHT // RF, mc.IMAGE_WIDTH // RF, mc.Anchors * 4))
    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
def encode(self, boxes, center_points, labels, colls_with, dimensions, bins,
           sines, coses, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      colls_with: (tensor) whether the vehicle collides with the player agent,
        sized [#obj] (binary).
      dimensions: (tensor), sized [#obj, 3].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    bins = bins.squeeze(1)
    sines = sines.squeeze(1)
    coses = coses.squeeze(1)
    input_size = torch.Tensor([input_size, input_size]) if isinstance(input_size, int) \
        else torch.Tensor(input_size)
    anchor_boxes = self._get_anchor_boxes(input_size)
    try:
        boxes = change_box_order(boxes, 'xyxy2xywh')
    except Exception:
        print("a vehicle-free frame, which should be eliminated in a clean dataset")
        assert 0  # remove this assert to fall back to the dummy targets below
        boxes = torch.Tensor([[0., 0., 0., 0.]])
        colls_with = torch.Tensor([0.])
        dimensions = torch.Tensor([[0., 0., 0.]])
        sines = torch.Tensor([0.])
        coses = torch.Tensor([0.])
        bins = torch.Tensor([0.])
        labels = torch.Tensor([0.])

    colls_with = torch.Tensor(colls_with)
    ious = box_iou(anchor_boxes, boxes, order='xywh')
    max_ious, max_ids = ious.max(1)

    # select the matching instance for every anchor
    boxes = boxes[max_ids]
    center_points = center_points[max_ids]
    colls_with = colls_with[max_ids]
    dimensions = dimensions[max_ids]
    cls_targets = labels[max_ids]
    bins = bins[max_ids]
    sines = sines[max_ids]
    coses = coses[max_ids]

    # build offsets relative to the matched anchors
    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)  # sized [num_anchor, 4]

    center_xy = (center_points[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    center_depth = center_points[:, 2].unsqueeze(1)
    center_targets = torch.cat([center_xy, center_depth], 1)  # sized [num_anchor, 3]

    # filter invalid or negative instances
    sines[max_ious < 0.5] = 0
    coses[max_ious < 0.5] = 0
    bins[max_ious < 0.5] = 0
    cls_targets[max_ious < 0.5] = 0
    colls_with[max_ious < 0.5] = 0
    dimensions[max_ious < 0.5] = 0

    # ignore instances that do not overlap enough
    ignore = (max_ious > 0.4) & (max_ious < 0.5)  # ignore ious between (0.4, 0.5)
    cls_targets[ignore] = -1  # for now just mark ignored to -1
    colls_with[ignore] = -1
    dimensions[ignore] = -1
    bins[ignore] = -1

    return loc_targets, cls_targets, center_targets, colls_with, dimensions, bins, sines, coses
def encode(self, boxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    Implement the Faster RCNN box coder in a 3D image:
      tz = (z - anchor_z) / anchor_d
      ty = (y - anchor_y) / anchor_h
      tx = (x - anchor_x) / anchor_w
      td = log(d / anchor_d)
      th = log(h / anchor_h)
      tw = log(w / anchor_w)

    Args:
      boxes: (tensor) bounding boxes of (zmin, ymin, xmin, zmax, ymax, xmax), sized [#obj, 6]
      labels: (tensor) object class labels, sized [#obj,]
      input_size: (int/tuple) model input size of (d, h, w)

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors, 6]
      cls_targets: (tensor) encoded class labels, sized [#anchors,]
    '''
    if isinstance(input_size, int):
        input_size = torch.Tensor([input_size, input_size, input_size])
    else:
        input_size = torch.Tensor(input_size)
    anchor_boxes = self.get_anchor_boxes(input_size)  # (z, y, x, d, h, w)
    boxes = change_box_order(boxes, 'zyxzyx2zyxdhw')

    ious = box_iou(anchor_boxes, boxes, order="zyxdhw")  # num_anchors x objects
    # find the best object for each anchor: iou value and object index
    max_ious, max_ids = ious.max(1)
    # find the best anchor for each object: iou value and anchor index
    best_ious, best_ids = ious.max(0)
    boxes = boxes[max_ids]

    loc_zyx = (boxes[:, :3] - anchor_boxes[:, :3]) / anchor_boxes[:, 3:]
    loc_dhw = torch.log(boxes[:, 3:] / anchor_boxes[:, 3:])
    loc_targets = torch.cat([loc_zyx, loc_dhw], 1)
    cls_targets = 1 + labels[max_ids]  # the background class is 0, so +1 for object classes

    cls_targets[max_ious < 0.4] = 0
    # make sure every object keeps at least its best-matching anchor
    for i in range(best_ids.size(0)):
        cls_targets[best_ids[i]] = 1 + labels[i]

    # randomly mark all but ~100 negatives as ignored (-1)
    ig_num = cls_targets.size(0) - 100
    cls_targets_array = cls_targets.numpy()
    neg_idx = np.where(cls_targets_array == 0)
    if ig_num > len(neg_idx[0]):
        ig_num -= (ig_num - len(neg_idx[0]))
    ig_idx = np.random.choice(neg_idx[0], ig_num, replace=False)
    cls_targets_array[ig_idx] = -1
    cls_targets = torch.from_numpy(cls_targets_array)
    '''
    # alternative: ignore by iou band instead of random subsampling
    ignore = (max_ious > 0.15) & (max_ious < 0.4)
    cls_targets[ignore] = -1
    for i in range(best_ids.size(0)):
        cls_targets[best_ids[i]] = 1 + labels[i]
    '''
    return loc_targets, cls_targets
def mAP(true_cls, true_loc, pred_cls, pred_loc, iou_thre=0.5, num_class=2,
        ap_func=compute_ap):
    '''Compute mAP, following
    https://github.com/fizyr/keras-retinanet/blob/master/keras_retinanet/utils/eval.py

    args:
        true_cls: list of tensors, each sized (#obj_i,) with values 0, 1, ...,
            the true classes of the objects in each image;
        true_loc: list of tensors, each sized (#obj, 4), mode=xyxy, the ground
            truth bounding box coordinates for each image;
        pred_cls: list of tensors, each sized (#anchor_remain, #class), the
            per-class scores of the predicted boxes in each image, where
            anchor_remain counts the boxes left after score thresholding,
            NMS, etc.;
        pred_loc: list of tensors, each sized (#anchor_remain, 4), mode=xyxy,
            the predicted box locations; note the length of the list is the
            number of images;
        iou_thre: IoU threshold, default 0.5, used to match predicted boxes
            to ground truth;
        num_class: number of classes;
        ap_func: function that computes AP from precision and recall; defaults
            to compute_ap, which gives slightly better results than
            sklearn.metrics.auc.
    returns:
        APs: the AP of each class;
        mAP: a float scalar, the mean AP over all classes.
    '''
    # move the ground truth onto the same device as the predictions
    device = pred_cls[0].device
    true_cls = [tc.to(device) for tc in true_cls]
    true_loc = [tl.to(device) for tl in true_loc]
    # store the AP of each class
    aps = []
    num_imgs = len(true_cls)
    # the inputs are per-class scores, so take the max over classes to get
    # the predicted class and its score
    pred_score = []
    pred_class = []
    for t in pred_cls:
        # an empty pred_cls (no predicted boxes) would make .max() raise,
        # so handle it separately
        if len(t) == 0:
            pred_score.append(t.new_empty(0))
            pred_class.append(torch.zeros(0, dtype=torch.long, device=t.device))
        else:
            t_s, t_c = t.max(dim=1)
            pred_score.append(t_s)
            pred_class.append(t_c)
    # compute AP class by class
    for c in range(num_class):
        # ndarray recording whether each prediction is a true positive
        tp = np.zeros((0, ))
        # number of ground-truth objects of this class, used for recall
        num_true_objs = 0.0
        # collect the reordered scores for the later recall/precision computation
        all_scores_c_order = []
        for i in range(num_imgs):
            # ground-truth boxes of this class in one image
            true_c_mask = true_cls[i] == c
            num_true_objs += true_c_mask.sum()  # total positives, for recall
            true_loc_i_c = true_loc[i][true_c_mask]
            # predicted boxes of this class in one image
            pred_c_mask = pred_class[i] == c
            # sort this image's predictions by score so that higher-scoring
            # boxes get matched to ground truth first; scores, classes and
            # locs must all be reordered together
            pred_score_i_c, i_c_order = pred_score[i][pred_c_mask].sort(
                dim=0, descending=True)
            pred_loc_i_c = pred_loc[i][pred_c_mask][i_c_order]
            all_scores_c_order.append(pred_score_i_c)
            # indices of ground-truth boxes already matched; reset per image
            detected_true_boxes = []
            # if this class has no predictions, the loop below never runs and
            # tp/fp keep length 0; with N predictions, tp/fp grow to length N
            for d in pred_loc_i_c:
                # with no ground truth of this class in the image, every
                # prediction of this class counts as a false positive
                if true_loc_i_c.size(0) == 0:
                    tp = np.append(tp, 0)
                    continue
                # IoU between this prediction and all ground-truth boxes;
                # the largest one is its candidate match
                ious = box_iou(d.unsqueeze(0), true_loc_i_c).squeeze(0)
                max_iou, max_idx = ious.max(dim=0)
                # if the best IoU passes the threshold, the prediction targets
                # exactly this ground-truth box and counts as a true positive.
                # (Strictly, a true positive also needs its score to pass a
                # threshold, but since that threshold is swept later to trace
                # recall/precision, it is enough to mark matched boxes 1 now:
                # when the threshold moves, low-score entries flip tp to 0 and
                # fp to 1, while never-matched boxes stay 0.)
                # Ground-truth boxes already matched in this image are skipped,
                # so each one is used at most once. (Small caveat: a matched
                # low-score box can occupy a ground-truth box and turn a
                # better-fitting, higher-score box into a false positive,
                # which can lower the AP.)
                if max_iou >= iou_thre and max_idx not in detected_true_boxes:
                    tp = np.append(tp, 1)
                    detected_true_boxes.append(max_idx)
                else:
                    tp = np.append(tp, 0)
        # if no image contains ground truth of this class, define its AP as 0
        if num_true_objs == 0.0:
            aps.append(0.)
            continue
        # re-sort globally by score, then compute recall/precision for this class
        _, order = torch.cat(all_scores_c_order, dim=0).sort(dim=0, descending=True)
        order = order.cpu().numpy()
        tp = tp[order]
        fp = 1 - tp
        # cumulative fp/tp counts over the first n elements: equivalent to
        # sweeping a score threshold below which everything is predicted 0,
        # so only the prefix contributes positives (true or false)
        fp = fp.cumsum()
        tp = tp.cumsum()
        # compute recall and precision
        recall = tp / num_true_objs.item()
        # tp + fp can be 0 (e.g. with a very high threshold, when nothing is
        # predicted positive), so add eps to avoid dividing by zero
        precision = tp / np.maximum((tp + fp), np.finfo(np.float64).eps)
        aps.append(ap_func(recall, precision))
    return aps, np.mean(aps)
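# For completeness, the compute_ap default referenced in the docstring follows
# keras-retinanet's _compute_ap; a minimal sketch:
import numpy as np

def compute_ap(recall, precision):
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))
    # make the precision curve monotonically decreasing (its upper envelope)
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    # integrate precision over the points where recall changes
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])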
def boxes_to_y(true_boxes, anchors, num_classes, image_wh, num_anchors=3):
    """Transfer true boxes to the yolo y format.

    Arguments:
        true_boxes: bbox absolute values in image_wh of one image, each row
            (xmin, ymin, xmax, ymax, class), shape (?, 5).
        anchors: anchor box size array, shape (num_anchors, 2).
        num_classes: total class num.
        image_wh: true input image size of (w, h).
    Returns:
        y_true: list of yolo feature map format arrays, each shaped
            (1, grid_h, grid_w, num_anchors, 5 + num_classes), with box xywh
            info normalized to (0, 1).
    """
    num_layers = anchors.shape[0] // num_anchors
    box_class = true_boxes[:, 4].astype(np.int32)
    xymin, xymax = true_boxes[:, 0:2], true_boxes[:, 2:4]
    input_size = np.array([416, 416])

    # calculate box center xy and wh, range (0, 416)
    boxes_wh = xymax - xymin
    boxes_xy = xymin + boxes_wh / 2
    # normalize to range (0, 1)
    boxes_xy = boxes_xy / image_wh

    # grid shapes, e.g. [input_size//32, input_size//16, input_size//8]
    #   -> [[13, 13], [26, 26], [52, 52]]
    grid_wh = [input_size // (2**(5 - i)) for i in range(num_layers)]
    grid_boxes_xy = [boxes_xy * grid_wh[i] for i in range(num_layers)]  # grid scale, range (0, grid_wh)
    grid_index = [np.floor(grid_boxes_xy[i]) for i in range(num_layers)]
    # boxes_xy = [(boxes_xy[i] - grid_index[i]) for i in range(num_layers)]  # size w.r.t. one grid cell, range (0, 1)

    # xy min/max coordinates of boxes and anchors, centered at the origin
    # so that box_iou compares shapes only
    anchor_xymax = anchors / 2
    anchor_xymin = -anchor_xymax
    box_xymax = boxes_wh / 2
    box_xymin = -box_xymax

    # create y_true
    y_true = [
        np.zeros((1, grid_wh[i][1], grid_wh[i][0], num_anchors, 5 + num_classes),
                 dtype='float32') for i in range(num_layers)
    ]

    # iterate over each box
    num_boxes = true_boxes.shape[0]
    for box_index in range(num_boxes):
        # calculate iou between this box and every anchor
        box1 = np.concatenate([box_xymin[box_index],
                               box_xymax[box_index]]).reshape(1, -1)
        box2 = np.concatenate([anchor_xymin, anchor_xymax], axis=-1)
        iou = box_iou(box1, box2)

        # select the best anchor
        anchor_index = np.argmax(iou)
        layer_index = num_layers - 1 - anchor_index // num_anchors
        layer_anchor_index = anchor_index % num_anchors

        box_xy = boxes_xy[box_index]  # shape (2,)
        # box_wh = boxes_wh[box_index] / anchors[anchor_index]  # shape (2,)
        box_wh = boxes_wh[box_index] / image_wh  # shape (2,), range (0, 1)

        # fill in y_true
        w = grid_index[layer_index][box_index, 0].astype('int32')
        h = grid_index[layer_index][box_index, 1].astype('int32')
        y_true[layer_index][0, h, w, layer_anchor_index, :2] = box_xy
        y_true[layer_index][0, h, w, layer_anchor_index, 2:4] = box_wh
        y_true[layer_index][0, h, w, layer_anchor_index, 4:5] = 1
        y_true[layer_index][0, h, w, layer_anchor_index,
                            5 + box_class[box_index]] = 1
    return y_true
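# Hypothetical usage of boxes_to_y; the anchor values are the standard YOLOv3
# anchors, while the box itself is made up:
import numpy as np

anchors = np.array([[10, 13], [16, 30], [33, 23],
                    [30, 61], [62, 45], [59, 119],
                    [116, 90], [156, 198], [373, 326]])
true_boxes = np.array([[50, 60, 200, 180, 2]])  # xmin, ymin, xmax, ymax, class
y_true = boxes_to_y(true_boxes, anchors, num_classes=20,
                    image_wh=np.array([416, 416]))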
def forward(self, preds, loc_targets, cls_targets, box_targets):
    '''
    Args:
      preds: (tensor) model outputs, sized [batch_size,150,fmsize,fmsize].
      loc_targets: (tensor) loc targets, sized [batch_size,5,4,fmsize,fmsize].
      cls_targets: (tensor) conf targets, sized [batch_size,5,20,fmsize,fmsize].
      box_targets: (list) box targets, each sized [#obj,4].

    Returns:
      (tensor) loss = SmoothL1Loss(loc) + SmoothL1Loss(iou) + SmoothL1Loss(cls)
    '''
    batch_size, _, fmsize, _ = preds.size()
    preds = preds.view(batch_size, 5, 4 + 1 + 20, fmsize, fmsize)

    ### loc_loss
    xy = preds[:, :, :2, :, :].sigmoid()  # x->sigmoid(x), y->sigmoid(y)
    wh = preds[:, :, 2:4, :, :].exp()
    loc_preds = torch.cat([xy, wh], 2)  # [N,5,4,13,13]

    pos = cls_targets.max(2)[0].squeeze() > 0  # [N,5,13,13]
    num_pos = pos.long().sum().item()
    mask = pos.unsqueeze(2).expand_as(loc_preds)  # [N,5,13,13] -> [N,5,1,13,13] -> [N,5,4,13,13]
    loc_loss = F.smooth_l1_loss(loc_preds[mask], loc_targets[mask],
                                reduction='sum')

    ### iou_loss
    iou_preds = preds[:, :, 4, :, :].sigmoid()  # [N,5,13,13]
    iou_targets = torch.zeros_like(iou_preds)   # [N,5,13,13]
    box_preds = self.decode_loc(preds[:, :, :4, :, :])  # [N,5,4,13,13]
    box_preds = box_preds.permute(0, 1, 3, 4, 2).contiguous().view(batch_size, -1, 4)  # [N,5*13*13,4]
    for i in range(batch_size):
        box_pred = box_preds[i]      # [5*13*13,4]
        box_target = box_targets[i]  # [#obj, 4]
        iou_target = box_iou(box_pred, box_target)  # [5*13*13, #obj]
        iou_targets[i] = iou_target.max(1)[0].view(5, fmsize, fmsize)  # [5,13,13]

    mask = torch.full_like(iou_preds, 0.1)  # [N,5,13,13]
    mask[pos] = 1
    iou_loss = F.smooth_l1_loss(iou_preds * mask, iou_targets * mask,
                                reduction='sum')

    ### cls_loss
    cls_preds = preds[:, :, 5:, :, :]  # [N,5,20,13,13]
    cls_preds = cls_preds.permute(0, 1, 3, 4, 2).contiguous().view(-1, 20)  # -> [N*5*13*13,20]
    cls_preds = F.softmax(cls_preds, dim=1)  # [N*5*13*13,20]
    cls_preds = cls_preds.view(batch_size, 5, fmsize, fmsize, 20).permute(
        0, 1, 4, 2, 3)  # -> [N,5,20,13,13]

    pos = cls_targets > 0
    cls_loss = F.smooth_l1_loss(cls_preds[pos], cls_targets[pos],
                                reduction='sum')

    print('%f %f %f' % (loc_loss.item() / num_pos, iou_loss.item() / num_pos,
                        cls_loss.item() / num_pos), end=' ')
    return (loc_loss + iou_loss + cls_loss) / num_pos