def bbox_iou(bbox1, bbox2):
    """
    :param bbox1: [13, 13, 5, 4] / [x, y, w, h]
    :param bbox2: [13, 13, 5, 4] / [x, y, w, h]
    :return: [13, 13, 5]
    """
    bbox1_area = bbox1[..., 2] * bbox1[..., 3]
    bbox2_area = bbox2[..., 2] * bbox2[..., 3]
    # assert bbox1.shape == bbox2.shape
    bbox2 = xywh2xyxy(bbox2)
    bbox1 = xywh2xyxy(bbox1)

    # [13, 13, 5] & [13, 13, 5] -> [13, 13, 5]
    intersection_xmin = torch.max(bbox1[..., 0], bbox2[..., 0])
    intersection_ymin = torch.max(bbox1[..., 1], bbox2[..., 1])
    intersection_xmax = torch.min(bbox1[..., 2], bbox2[..., 2])
    intersection_ymax = torch.min(bbox1[..., 3], bbox2[..., 3])

    # [13, 13, 5] & [13, 13, 5] -> [13, 13, 5]; clamp negative overlaps to zero
    # (opt.device is assumed to be defined at module level)
    intersection_w = torch.max(intersection_xmax - intersection_xmin,
                               torch.tensor(0., device=opt.device))
    intersection_h = torch.max(intersection_ymax - intersection_ymin,
                               torch.tensor(0., device=opt.device))
    intersection_area = intersection_w * intersection_h

    # IoU = intersection / union; the epsilon guards against division by zero
    ious = intersection_area / (bbox1_area + bbox2_area - intersection_area + 1e-10)
    # ious shape: [13, 13, 5]
    return ious
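# None of the snippets here define xywh2xyxy itself. Below is a minimal sketch
# of the conversion they assume in the common case: boxes given as
# (center_x, center_y, w, h). Some snippets use project-specific variants
# (e.g. a (boxes, cell_size, index) signature for grid-relative YOLOv1 boxes,
# or top-left-corner x/y from cv2.boundingRect), so treat this as an
# illustration, not the exact helper used everywhere; it is named with a
# _sketch suffix to avoid shadowing the real one.
def xywh2xyxy_sketch(boxes):
    """Convert [..., (cx, cy, w, h)] boxes to [..., (x1, y1, x2, y2)]."""
    out = boxes.clone() if hasattr(boxes, 'clone') else boxes.copy()  # torch or numpy
    out[..., 0] = boxes[..., 0] - boxes[..., 2] / 2  # x1 = cx - w/2
    out[..., 1] = boxes[..., 1] - boxes[..., 3] / 2  # y1 = cy - h/2
    out[..., 2] = boxes[..., 0] + boxes[..., 2] / 2  # x2 = cx + w/2
    out[..., 3] = boxes[..., 1] + boxes[..., 3] / 2  # y2 = cy + h/2
    return out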
def dump_labelme(df, output_dir):
    os.makedirs(output_dir, exist_ok=True)  # directly save results to output folder
    for _, row in df.iterrows():
        file_name = row['file_name']
        bboxes = row['bboxes']
        img_width = row['width']
        img_height = row['height']
        _template = {
            "version": "1.0.0",
            "flags": {},
            "shapes": [],
            "imagePath": file_name,
            "imageData": None,
            "imageHeight": img_height,
            "imageWidth": img_width,
        }
        for bbox in bboxes:
            x1, y1, x2, y2 = xywh2xyxy(bbox)
            _template['shapes'].append({
                'label': 'person',
                'points': [[x1, y1], [x2, y2]],
                "group_id": None,
                "shape_type": "rectangle",
                "flags": {},
            })
        # splitext, not rstrip('.jpg'): rstrip strips characters, so stems
        # ending in 'j', 'p' or 'g' would be mangled
        output_filename = join(output_dir, os.path.splitext(file_name)[0] + '.json')
        with open(output_filename, 'w') as json_fp:
            json_fp.write(json.dumps(_template))
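# Hypothetical usage sketch for dump_labelme: a one-row DataFrame with the
# columns it expects. The box format is assumed to be (cx, cy, w, h) in
# pixels, matching the xywh2xyxy call above; file name and sizes are made up.
import numpy as np
import pandas as pd

df = pd.DataFrame([{
    'file_name': 'frame_000.jpg',
    'bboxes': [np.array([320., 240., 100., 80.])],  # one person box, (cx, cy, w, h)
    'width': 640,
    'height': 480,
}])
dump_labelme(df, 'labelme_out')  # writes labelme_out/frame_000.json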
def extract_boxes(path='../coco128/'):  # from utils.datasets import *; extract_boxes('../coco128')
    # Convert detection dataset into classification dataset, with one directory per class
    path = Path(path)  # images dir
    shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None  # remove existing
    files = list(path.rglob('*.*'))
    n = len(files)  # number of files
    for im_file in tqdm(files, total=n):
        if im_file.suffix[1:] in img_formats:
            # image
            im = cv2.imread(str(im_file))[..., ::-1]  # BGR to RGB (note: cv2.imwrite below expects BGR)
            h, w = im.shape[:2]

            # labels
            lb_file = Path(img2label_paths([str(im_file)])[0])
            if Path(lb_file).exists():
                with open(lb_file, 'r') as f:
                    lb = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)  # labels

                for j, x in enumerate(lb):
                    c = int(x[0])  # class
                    f = (path / 'classifier') / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg'  # new filename
                    if not f.parent.is_dir():
                        f.parent.mkdir(parents=True)

                    b = x[1:] * [w, h, w, h]  # box
                    # b[2:] = b[2:].max()  # rectangle to square
                    b[2:] = b[2:] * 1.2 + 3  # pad
                    b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(int)  # np.int is removed in recent NumPy

                    b[[0, 2]] = np.clip(b[[0, 2]], 0, w)  # clip boxes outside of image
                    b[[1, 3]] = np.clip(b[[1, 3]], 0, h)
                    assert cv2.imwrite(str(f), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {f}'
def compute_loss(proc_pred, annotations_gt, targets, iou_th=0.5, giou_ratio=0.5):
    # proc_pred = process_preds(model_out[0], int(np.sqrt(out.shape[1])), 256, 56)
    boxloss = torch.tensor([0.]).float()
    closs = torch.tensor([0.]).float()
    objloss = torch.tensor([0.]).float()
    for j in range(len(proc_pred)):
        for i, gt in enumerate(annotations_gt[j]):
            # get ious
            ious = bbox_iou(gt.float(), xywh2xyxy(proc_pred[j, :, :4]).float())
            # get relevant predictions
            pertinent = torch.where(ious > iou_th)[0]
            if len(pertinent):
                # map the argmax over the pertinent subset back to a global box index
                best_id = pertinent[torch.max(ious[pertinent], 0)[1]]
                best_bb = proc_pred[j, best_id, :]
                closs += pred_criterion(best_bb[5:].unsqueeze(0),
                                        torch.tensor(targets[i]))
                boxloss += (1 - ious[pertinent]).mean()
            trgt_objectness = (1 - giou_ratio) + giou_ratio * ious.detach().clamp(0)
            objloss += obj_criterion(proc_pred[j, ..., 4], trgt_objectness)
    loss = 2 * boxloss + closs + 2 * objloss
    loss_print = dict(box=boxloss.detach(), pred=closs.detach(), obj=objloss.detach())
    return loss, loss_print
def _iou(gt_boxes, base_anchors):
    """
    :param gt_boxes: [M, 4] / [ctr_x, ctr_y, w, h]
    :param base_anchors: [N, 2] / [w, h]
    :return: [M, N]
    """
    # Center both boxes and anchors at the origin so only w/h drive the IoU
    dummy_anchors = np.zeros(shape=[len(base_anchors), 4])
    dummy_gt_boxes = np.zeros(shape=[len(gt_boxes), 4])
    dummy_anchors[:, 2:] = base_anchors
    dummy_gt_boxes[:, 2:] = gt_boxes[:, 2:]
    dummy_anchors = xywh2xyxy(dummy_anchors)
    dummy_gt_boxes = xywh2xyxy(dummy_gt_boxes)
    ious = iou_general(dummy_gt_boxes[:, None, :], dummy_anchors)
    # fig, ax = plt.subplots(1)
    # plot_boxes(dummy_anchors, ax, 'r')
    # plot_boxes(dummy_gt_boxes, ax, 'b')
    # plot_boxes(dummy_anchors[ious.argmax(axis=-1)], ax, 'g')
    # plt.show()
    return ious
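# Hypothetical usage of _iou: assign each ground-truth box to its
# best-matching anchor by width/height IoU (both centered at the origin).
# The gt_boxes and base_anchors values are made up for illustration.
import numpy as np

gt_boxes = np.array([[50., 60., 20., 40.],
                     [10., 10., 80., 80.]])        # (ctr_x, ctr_y, w, h)
base_anchors = np.array([[16., 32.], [64., 64.]])  # (w, h)

ious = _iou(gt_boxes, base_anchors)  # [M, N]
best_anchor = ious.argmax(axis=-1)   # best anchor index per ground-truth box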
def decode(self, preds, img_size=None):
    '''
    Decode the self.S x self.S x self.pred_c prediction tensor into object
    classes and coordinates.
    args:
        preds: self.S x self.S x self.pred_c network output;
        img_size: image size; if not None, the returned coordinates are in
            real pixel units, otherwise they are normalized to [0, 1];
    returns:
        res_c: predicted classes;
        res_s: scores used for NMS, i.e. the class probability conditioned on
            objectness multiplied by the IoU with the true object;
        res_l: predicted box coordinates;
        (all returned values are tensors)
    '''
    confidence = preds[..., [4, 9]]
    mask1 = confidence > self.conf_thre
    mask2 = confidence == confidence.max()
    mask = (mask1 + mask2).gt(0)
    if mask.sum() == 0:
        return None
    indx = mask.nonzero()[:, :2][:, [1, 0]].float()
    # lt = indx * self.cell_size
    # indx[:, 2] = indx[:, 2] *
    p_shape = list(preds.shape)
    preds_locs_conf = preds[:, :, :(self.B * 5)].view(*p_shape[:-1], self.B, 5)
    preds_locs = preds_locs_conf[..., :4]
    preds_conf = preds_locs_conf[..., 4]
    # Compute the expected class dimensions; repeat so each of the B bboxes
    # shares the same class predictions
    pcs = list(preds.shape[:2]) + [self.B, self.C]
    preds_class = preds[:, :, (self.B * 5):].unsqueeze(2).expand(*pcs)
    remain_locs = xywh2xyxy(preds_locs[mask], self.cell_size, indx)
    remain_conf = preds_conf[mask]
    remain_class = preds_class[mask]
    # NMS, scored by the max class probability * confidence,
    # i.e. score = pr(class_i) * IoU
    probs, cls_index = remain_class.max(1)
    scores = probs * remain_conf
    keep = nms(remain_locs, scores, self.nms_thre)
    res_c, res_s, res_l = cls_index[keep], scores[keep], remain_locs[keep]
    if img_size is not None:
        res_l = res_l * torch.tensor([list(img_size) * 2],
                                     dtype=torch.float, device=res_l.device)
    return res_c, res_s, res_l
def forward(self, x):
    """
    Args
        x: (Tensor) detection feature map, with size [bs, num_bboxes, 5 + nC]
    Returns
        detections: (Tensor) detection result with size
            [num_bboxes, [image_batch_idx, 4 offsets, p_obj, max_conf, cls_idx]]
    """
    bs, num_bboxes, num_attrs = x.size()
    detections = torch.Tensor().cuda()
    for idx in range(bs):
        pred = x[idx]
        try:
            non_zero_pred = pred[pred[:, 4] > self.conf_thresh]
            non_zero_pred[:, :4] = xywh2xyxy(non_zero_pred[:, :4])
            max_score, max_idx = torch.max(non_zero_pred[:, 5:], 1)
            max_idx = max_idx.float().unsqueeze(1)
            max_score = max_score.float().unsqueeze(1)
            non_zero_pred = torch.cat((non_zero_pred[:, :5], max_score, max_idx), 1)
            classes = torch.unique(non_zero_pred[:, -1])
        except Exception:  # no object detected
            continue
        for cls in classes:
            cls_pred = non_zero_pred[non_zero_pred[:, -1] == cls]
            conf_sort_idx = torch.sort(cls_pred[:, 5], descending=True)[1]
            cls_pred = cls_pred[conf_sort_idx]
            max_preds = []
            while cls_pred.size(0) > 0:
                # Greedy NMS: keep the highest-confidence box, drop overlaps
                max_preds.append(cls_pred[0].unsqueeze(0))
                ious = IoU(max_preds[-1], cls_pred)
                cls_pred = cls_pred[ious < self.nms_thresh]
            if len(max_preds) > 0:
                max_preds = torch.cat(max_preds).data
                batch_idx = max_preds.new(max_preds.size(0), 1).fill_(idx)
                seq = (batch_idx, max_preds)
                if detections.size(0) == 0:
                    detections = torch.cat(seq, 1)
                else:
                    detections = torch.cat((detections, torch.cat(seq, 1)))
    return detections
def loss(self, predictions, targets, stats):
    assert type(predictions) == list
    loss = {}
    for i, (p, t) in enumerate(zip(predictions, targets)):
        assert p.shape == t.shape
        l = {}
        batch_size = t.shape[0]
        t = t.permute(0, 2, 3, 1)
        p = p.permute(0, 2, 3, 1)
        t = t.contiguous().view(batch_size, -1, self.num_features)
        p = p.contiguous().view(batch_size, -1, self.num_features)

        # Offset x/y per image so boxes from different images in the batch
        # cannot overlap once IoUs are computed over the flattened batch
        img_idx = torch.arange(batch_size, dtype=torch.float, device=self.device)
        img_idx = img_idx.reshape(-1, 1) * p.shape[2]
        t[:, :, 0] += 2. * img_idx
        p[:, :, 0] += 2. * img_idx
        img_idx = torch.arange(batch_size, dtype=torch.float, device=self.device)
        img_idx = img_idx.reshape(-1, 1) * p.shape[1]
        t[:, :, 1] += 2. * img_idx
        p[:, :, 1] += 2. * img_idx

        t = t.contiguous().view(-1, self.num_features)
        p = p.contiguous().view(-1, self.num_features)
        obj_mask = torch.nonzero(t[:, 4]).flatten()
        num_obj = len(obj_mask)
        if obj_mask.numel() > 0:
            p_xyxy = xywh2xyxy(p[:, :4].detach())
            t_xyxy = xywh2xyxy(t[obj_mask, :4])
            all_ious = jaccard(p_xyxy, t_xyxy)
            ious, _ = torch.max(all_ious, dim=1)
            stats['avg_obj_iou'].append(all_ious[obj_mask].diag().mean().item())
            # Predictions overlapping any target enough are not treated as no-object
            mask = torch.nonzero(ious > self.noobj_iou_threshold).squeeze()
            t[mask, 4] = 1.
            noobj_mask = torch.nonzero(t[:, 4] == 0.).squeeze()

            l['coord'] = nn.MSELoss(reduction='sum')(p[obj_mask, 0], t[obj_mask, 0])
            l['coord'] += nn.MSELoss(reduction='sum')(p[obj_mask, 1], t[obj_mask, 1])
            l['coord'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[obj_mask, 2]),
                                                      torch.sqrt(t[obj_mask, 2]))
            l['coord'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[obj_mask, 3]),
                                                      torch.sqrt(t[obj_mask, 3]))
            l['coord'] *= LAMBDA_COORD / batch_size
            if self.iteration * self.batch_size < 12800:
                l['bias'] = nn.MSELoss(reduction='sum')(p[noobj_mask, 0], t[noobj_mask, 0])
                l['bias'] += nn.MSELoss(reduction='sum')(p[noobj_mask, 1], t[noobj_mask, 1])
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[noobj_mask, 2]),
                                                         torch.sqrt(t[noobj_mask, 2]))
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[noobj_mask, 3]),
                                                         torch.sqrt(t[noobj_mask, 3]))
                l['bias'] *= 0.1 / batch_size
            p[obj_mask, 5:] = F.log_softmax(p[obj_mask, 5:], dim=-1)
            t_long = torch.argmax(t[obj_mask, 5:], dim=1)
            if USE_CROSS_ENTROPY:
                l['class'] = nn.NLLLoss(reduction='sum')(p[obj_mask, 5:], t_long)
            else:
                l['class'] = nn.MSELoss(reduction='sum')(torch.exp(p[obj_mask, 5:]),
                                                         t[obj_mask, 5:])
            l['class'] *= LAMBDA_CLASS / batch_size
            stats['avg_class'].append(torch.exp(p[obj_mask, 5 + t_long]).mean().item())
            # l['object'] = nn.MSELoss(reduction='sum')(p[obj_mask, 4],
            #     all_ious[obj_mask, torch.arange(num_obj)].detach())
            l['object'] = nn.MSELoss(reduction='sum')(p[obj_mask, 4], t[obj_mask, 4])
            l['object'] *= LAMBDA_OBJ / batch_size
            stats['avg_pobj'].append(p[obj_mask, 4].mean().item())
            l['no_object'] = nn.MSELoss(reduction='sum')(p[noobj_mask, 4], t[noobj_mask, 4])
            l['no_object'] *= LAMBDA_NOOBJ / batch_size
            stats['avg_pnoobj'].append(p[noobj_mask, 4].mean().item())
        else:
            l['object'] = torch.tensor([0.], device=self.device)
            l['coord'] = torch.tensor([0.], device=self.device)
            l['class'] = torch.tensor([0.], device=self.device)
            l['no_object'] = LAMBDA_NOOBJ / batch_size * nn.MSELoss(reduction='sum')(p[:, 4], t[:, 4])
            if self.iteration * self.batch_size < 12800:
                l['bias'] = nn.MSELoss(reduction='sum')(p[:, 0], t[:, 0])
                l['bias'] += nn.MSELoss(reduction='sum')(p[:, 1], t[:, 1])
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[:, 2]), torch.sqrt(t[:, 2]))
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[:, 3]), torch.sqrt(t[:, 3]))
                l['bias'] *= 0.1 / batch_size

        l['total'] = l['coord'] + l['class'] + l['object'] + l['no_object']
        for k, v in l.items():
            try:
                loss[k] = loss[k] + v
            except KeyError:
                loss[k] = v
    return loss, stats
def __init__(self, path, img_size=416, batch_size=16, augment=False, hyp=None,
             rect=False, image_weights=False, cache_images=False,
             single_cls=False, pad=0.0):
    try:
        path = str(Path(path))  # os-agnostic
        parent = str(Path(path).parent) + os.sep
        if os.path.isfile(path):  # file
            with open(path, 'r') as f:
                f = f.read().splitlines()
                f = [x.replace('./', parent) if x.startswith('./') else x
                     for x in f]  # local to global path
        elif os.path.isdir(path):  # folder
            f = glob.iglob(path + os.sep + '*.*')
        else:
            raise Exception('%s does not exist' % path)
        self.img_files = [x.replace('/', os.sep) for x in f
                          if os.path.splitext(x)[-1].lower() in img_formats]
    except:
        raise Exception('Error loading data from %s. See %s' % (path, help_url))

    n = len(self.img_files)
    assert n > 0, 'No images found in %s. See %s' % (path, help_url)
    bi = np.floor(np.arange(n) / batch_size).astype(int)  # batch index (np.int is removed in recent NumPy)
    nb = bi[-1] + 1  # number of batches

    self.n = n  # number of images
    self.batch = bi  # batch index of image
    self.img_size = img_size
    self.augment = augment
    self.hyp = hyp
    self.image_weights = image_weights
    self.rect = False if image_weights else rect
    self.mosaic = self.augment and not self.rect  # load 4 images at a time into a mosaic (only during training)

    # Define labels
    self.label_files = [x.replace('images', 'labels').replace(os.path.splitext(x)[-1], '.txt')
                        for x in self.img_files]

    # Read image shapes (wh)
    sp = path.replace('.txt', '') + '.shapes'  # shapefile path
    try:
        with open(sp, 'r') as f:  # read existing shapefile
            s = [x.split() for x in f.read().splitlines()]
            assert len(s) == n, 'Shapefile out of sync'
    except:
        s = [exif_size(Image.open(f))
             for f in tqdm(self.img_files, desc='Reading image shapes')]
        np.savetxt(sp, s, fmt='%g')  # overwrites existing (if any)

    self.shapes = np.array(s, dtype=np.float64)

    # Rectangular Training  https://github.com/ultralytics/yolov3/issues/232
    if self.rect:
        # Sort by aspect ratio
        s = self.shapes  # wh
        ar = s[:, 1] / s[:, 0]  # aspect ratio
        irect = ar.argsort()
        self.img_files = [self.img_files[i] for i in irect]
        self.label_files = [self.label_files[i] for i in irect]
        self.shapes = s[irect]  # wh
        ar = ar[irect]

        # Set training image shapes
        shapes = [[1, 1]] * nb
        for i in range(nb):
            ari = ar[bi == i]
            mini, maxi = ari.min(), ari.max()
            if maxi < 1:
                shapes[i] = [maxi, 1]
            elif mini > 1:
                shapes[i] = [1, 1 / mini]

        self.batch_shapes = np.ceil(np.array(shapes) * img_size / 32. + pad).astype(int) * 32

    # Cache labels
    self.imgs = [None] * n
    self.labels = [np.zeros((0, 5), dtype=np.float32)] * n
    create_datasubset, extract_bounding_boxes, labels_loaded = False, False, False
    nm, nf, ne, ns, nd = 0, 0, 0, 0, 0  # number missing, found, empty, datasubset, duplicate
    np_labels_path = str(Path(self.label_files[0]).parent) + '.npy'  # saved labels in *.npy file
    if os.path.isfile(np_labels_path):
        s = np_labels_path  # print string
        x = np.load(np_labels_path, allow_pickle=True)
        if len(x) == n:
            self.labels = x
            labels_loaded = True
    else:
        s = path.replace('images', 'labels')

    pbar = tqdm(self.label_files)
    for i, file in enumerate(pbar):
        if labels_loaded:
            l = self.labels[i]
            # np.savetxt(file, l, '%g')  # save *.txt from *.npy file
        else:
            try:
                with open(file, 'r') as f:
                    l = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
            except:
                nm += 1  # print('missing labels for image %s' % self.img_files[i])  # file missing
                continue

        if l.shape[0]:
            assert l.shape[1] == 5, '> 5 label columns: %s' % file
            assert (l >= 0).all(), 'negative labels: %s' % file
            assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels: %s' % file
            if np.unique(l, axis=0).shape[0] < l.shape[0]:  # duplicate rows
                nd += 1  # print('WARNING: duplicate rows in %s' % self.label_files[i])
            if single_cls:
                l[:, 0] = 0  # force dataset into single-class mode
            self.labels[i] = l
            nf += 1  # file found

            # Create subdataset (a smaller dataset)
            if create_datasubset and ns < 1E4:
                if ns == 0:
                    create_folder(path='./datasubset')
                    os.makedirs('./datasubset/images')
                exclude_classes = 43
                if exclude_classes not in l[:, 0]:
                    ns += 1
                    # shutil.copy(src=self.img_files[i], dst='./datasubset/images/')  # copy image
                    with open('./datasubset/images.txt', 'a') as f:
                        f.write(self.img_files[i] + '\n')

            # Extract object detection boxes for a second stage classifier
            if extract_bounding_boxes:
                p = Path(self.img_files[i])
                img = cv2.imread(str(p))
                h, w = img.shape[:2]
                for j, x in enumerate(l):
                    f = '%s%sclassifier%s%g_%g_%s' % (p.parent.parent, os.sep, os.sep, x[0], j, p.name)
                    if not os.path.exists(Path(f).parent):
                        os.makedirs(Path(f).parent)  # make new output folder

                    b = x[1:] * [w, h, w, h]  # box
                    b[2:] = b[2:].max()  # rectangle to square
                    b[2:] = b[2:] * 1.3 + 30  # pad
                    b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(int)

                    b[[0, 2]] = np.clip(b[[0, 2]], 0, w)  # clip boxes outside of image
                    b[[1, 3]] = np.clip(b[[1, 3]], 0, h)
                    assert cv2.imwrite(f, img[b[1]:b[3], b[0]:b[2]]), 'Failure extracting classifier boxes'
        else:
            ne += 1  # print('empty labels for image %s' % self.img_files[i])  # file empty
            # os.system("rm '%s' '%s'" % (self.img_files[i], self.label_files[i]))  # remove

        pbar.desc = 'Caching labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
            s, nf, nm, ne, nd, n)
    assert nf > 0 or n == 20288, 'No labels found in %s. See %s' % (os.path.dirname(file) + os.sep, help_url)
    if not labels_loaded and n > 1000:
        print('Saving labels to %s for faster future loading' % np_labels_path)
        np.save(np_labels_path, self.labels)  # save for next time

    # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM)
    if cache_images:  # if training
        gb = 0  # Gigabytes of cached images
        pbar = tqdm(range(len(self.img_files)), desc='Caching images')
        self.img_hw0, self.img_hw = [None] * n, [None] * n
        for i in pbar:  # max 10k images
            self.imgs[i], self.img_hw0[i], self.img_hw[i] = load_image(self, i)  # img, hw_original, hw_resized
            gb += self.imgs[i].nbytes
            pbar.desc = 'Caching images (%.1fGB)' % (gb / 1E9)

    # Detect corrupted images  https://medium.com/joelthchao/programmatically-detect-corrupted-image-8c1b2006c3d3
    detect_corrupted_images = False
    if detect_corrupted_images:
        from skimage import io  # conda install -c conda-forge scikit-image
        for file in tqdm(self.img_files, desc='Detecting corrupted images'):
            try:
                _ = io.imread(file)
            except:
                print('Corrupted image detected: %s' % file)
def preprocess_img(self, img, crop_bb):
    crop = ut.xywh2xyxy(crop_bb)
    img = img.crop(crop)  # PIL crop expects an (x1, y1, x2, y2) box
    img = self.transform(img)
    return img
def val_model(self, val_dataloader, iou_thres=0.5, conf_thres=0.5, nms_thres=0.5):
    print("validating...")
    self.model.eval()
    cls_list = []
    metrics_list = []  # list of tuples (tp, confs, pred)
    for batch_idx, (imgs, labels) in enumerate(tqdm.tqdm(val_dataloader, desc="Detecting objects")):
        # check the input data format
        imgs = imgs.to(torch.float32)
        labels = labels.to(torch.float32)
        if self.cuda:
            imgs = imgs.to(self.device)
            labels = labels.to(self.device)
        # extract cls_name
        # labels: [detection_num, 6] - 6: (1) img_id (corresponding to batch_idx), (1) cls_name, (4) boxes
        # every item in outputs: [detection_num, 7] - (x1, y1, x2, y2, conf_score, cls_score, cls_pred)
        cls_list += labels[:, 1].tolist()
        # rescale labels from xywh to xyxy in pixel units
        img_h = imgs.size(2)
        img_w = imgs.size(3)
        labels[:, 2:] = xywh2xyxy(labels[:, 2:])
        labels[:, 2] *= img_w
        labels[:, 4] *= img_w
        labels[:, 3] *= img_h
        labels[:, 5] *= img_h
        with torch.no_grad():
            outputs, _, __ = self.model(imgs)
            outputs = non_max_suppression(outputs, conf_thres, nms_thres)
            # before the outputs are fed into compute_batch_info,
            # they are supposed to be rescaled.
        # metrics_list: tp, pred_conf, pred_cls
        """
        To check compute_batch_info we can build a fake outputs tensor from
        the labels; in theory this yields a perfect result:

        FloatTensor = torch.cuda.FloatTensor if self.cuda else torch.FloatTensor
        fake_outputs = [None for idx in range(len(outputs))]
        for i in range(len(outputs)):
            label = labels[labels[:, 0] == i]
            if len(label) > 0:
                fake_output = FloatTensor(np.zeros((len(label), 7)))
                fake_output[:, :4] = label[:, 2:6]
                fake_output[:, 4] = 0.8
                fake_output[:, 5] = 0.8
                fake_output[:, 6] = label[:, 1]
                fake_outputs[i] = fake_output
        outputs = fake_outputs
        """
        metrics_list += compute_batch_info(outputs, labels, iou_thres)
        # for debug
        if batch_idx == 107:
            break
    # concatenate sample statistics
    tp, pred_conf, pred_cls = [np.concatenate(x, 0) for x in list(zip(*metrics_list))]
    precision, recall, ap, f1, ap_cls = ap_per_cls(tp, pred_conf, pred_cls, cls_list)
    self.model.train()
    return precision, recall, ap, f1, ap_cls
def detect(self, frame):
    '''
    input: frame
    output: dst, box (xyxy (bbox) or 4x2 (min_area)), center (xy)
    '''
    if frame is None or len(frame) == 0:  # `frame is []` is always False; check emptiness instead
        raise TypeError('No frame input')

    # Resize the frame
    shape = self.shape
    img0 = np.copy(frame)
    H, W, C = frame.shape
    scale_factor = np.array([shape[0] / W, shape[1] / H])
    frame = cv2.resize(frame, shape)  # 300, 400, 3

    # Gaussian blur
    frame_gaussian = cv2.GaussianBlur(frame, (7, 7), 0)

    # BGR to HSV
    frame_hsv = cv2.cvtColor(frame_gaussian, cv2.COLOR_BGR2HSV)

    # Get mask according to HSV thresholds
    hsv_thres_values = self.get_trackbar_value() if self.debug else list(self.icol)
    mask = cv2.inRange(frame_hsv, np.array(hsv_thres_values[:3]), np.array(hsv_thres_values[3:]))

    # Median filter
    mask_f = cv2.medianBlur(mask, 5)

    # Morphology (close then open)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    mask_m = cv2.morphologyEx(mask_f, cv2.MORPH_CLOSE, kernel)
    mask_m = cv2.morphologyEx(mask_m, cv2.MORPH_OPEN, kernel)

    # Get contours of the mask (OpenCV 3 signature; OpenCV 4 returns two values)
    box = None  # xyxy
    center = None  # xy
    _, contours, _ = cv2.findContours(mask_m, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cnt_list = [cnt for cnt in contours if self.contours_thres[0] < len(cnt) < self.contours_thres[1]]
    if cnt_list:
        # multiple contours: choose the biggest one (from the filtered list;
        # the original searched the unfiltered `contours`, defeating the filter)
        cnt = max(cnt_list, key=lambda x: x.shape[0])
        if self.box_type == 'bbox':
            # Get bounding box (cv2.boundingRect returns top-left x, y, w, h)
            box = np.int0(scale_bbox(xywh2xyxy(cv2.boundingRect(cnt)), 1 / scale_factor))  # xyxy
            center = np.int0(np.array([(box[0] + box[2]) / 2, (box[1] + box[3]) / 2]))
        elif self.box_type == 'min_area':
            # Get minimum area box
            rect = cv2.minAreaRect(cnt)  # center (x, y), (width, height), angle of rotation
            box = cv2.boxPoints(rect)  # (4, 2)
            # scale box
            box = box / scale_factor
            center = np.sum(box, axis=0) / 4
            box, center = np.int0(box), np.int0(center)
        else:
            raise TypeError('unsupported box type %s' % self.box_type)

    # Result
    dst = self.plot_img(img0, box)

    # view result
    if self.view_result:
        show_img(self.window_name, cv2.resize(dst, shape))
        # show_img(self.window_name, dst)

    return dst, box, center
def evaluate(model, path, iou_thres, conf_thres, nms_thres, image_size, batch_size, num_workers, device):
    # Put the model into evaluation mode
    model.eval()

    # Set up the dataset and dataloader
    dataset = datasets.ListDataset(path, image_size, augment=False, multiscale=False)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             collate_fn=dataset.collate_fn)

    labels = []
    sample_metrics = []  # List[Tuple] -> [(TP, confs, pred)]
    entire_time = 0
    for _, images, targets in tqdm.tqdm(dataloader, desc='Evaluate method', leave=False):
        if targets is None:
            continue

        # Extract labels
        labels.extend(targets[:, 1].tolist())

        # Rescale targets
        targets[:, 2:] = utils.xywh2xyxy(targets[:, 2:])
        targets[:, 2:] *= image_size

        # Predict objects
        start_time = time.time()
        with torch.no_grad():
            images = images.to(device)
            outputs = model(images)
            outputs = utils.NMS(outputs, conf_thres, nms_thres)
        entire_time += time.time() - start_time

        # Compute true positives, predicted scores and predicted labels per batch
        sample_metrics.extend(utils.get_batch_statistics(outputs, targets, iou_thres))

    # Concatenate sample statistics
    if len(sample_metrics) == 0:
        true_positives, pred_scores, pred_labels = np.array([]), np.array([]), np.array([])
    else:
        true_positives, pred_scores, pred_labels = [np.concatenate(x, 0)
                                                    for x in list(zip(*sample_metrics))]

    # Compute AP
    precision, recall, AP, f1, ap_class = utils.ap_per_class(true_positives, pred_scores, pred_labels, labels)

    # Compute inference time and FPS
    inference_time = entire_time / dataset.__len__()
    fps = 1 / inference_time

    # Convert inference time to milliseconds
    inference_time *= 1000

    return precision, recall, AP, f1, ap_class, inference_time, fps
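# Hypothetical driver for evaluate(): assumes a model has already been built
# and loaded, and that 'data/val.txt' lists the validation images (both are
# made-up names). The thresholds mirror the defaults used elsewhere here.
precision, recall, AP, f1, ap_class, inference_time, fps = evaluate(
    model, 'data/val.txt', iou_thres=0.5, conf_thres=0.5, nms_thres=0.5,
    image_size=416, batch_size=8, num_workers=4, device='cuda')
print('mAP: %.4f | inference: %.1f ms (%.1f FPS)' % (AP.mean(), inference_time, fps))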
def forward(self, pred, target):
    '''
    args:
        pred: predicted tensor, batch x self.S x self.S x self.pred_c
        target: encoded ground-truth labels, batch x self.S x self.S x self.pred_c
    '''
    # Batch size and the device pred lives on
    N = target.size(0)
    device = pred.device
    # Masks for cells matched to a ground-truth box and for unmatched cells
    coo_mask = target[..., 4] > 0   # batch x S x S
    noo_mask = target[..., 4] == 0  # batch x S x S

    # ----- positive samples -----
    # Split the predictions into their parts
    # batch*matched_cell_num x pred_c
    coo_pred = pred[coo_mask].view(-1, self.cell_channel)
    # batch*matched_cell_num*B x 5
    box_pred = coo_pred[:, :self.no_class].reshape(-1, 5)
    # batch*matched_cell_num x C
    class_pred = coo_pred[:, self.no_class:]
    # Split the targets the same way
    # batch*matched_cell_num x pred_c
    coo_target = target[coo_mask].view(-1, self.cell_channel)
    # batch*matched_cell_num*B x 5
    box_target = coo_target[:, :self.no_class].reshape(-1, 5)
    # batch*matched_cell_num x C
    class_target = coo_target[:, self.no_class:]

    # ----- negative samples -----
    noo_pred_c = pred[noo_mask].view(-1, self.cell_channel)[:, [4, 9]]
    noo_target_c = target[noo_mask].view(-1, self.cell_channel)[:, [4, 9]]

    # ----- negative-sample loss -----
    noo_loss = self.mse(noo_pred_c, noo_target_c)

    # ----- positive-sample loss -----
    coo_response_index = []
    coo_not_response_index = []
    boxes_target_iou = []
    for i in range(0, box_target.size(0), self.B):
        # after the [N, 5] reshape, each cell contributes B (here 2) bboxes
        box1 = box_pred[i:i + self.B]
        # in the target the B bboxes are identical, so considering one is enough
        box2 = box_target[i].view(-1, 5)
        iou = boxes_iou(xywh2xyxy(box1[:, :4], self.cell_size),
                        xywh2xyxy(box2[:, :4], self.cell_size))  # [2, 1]
        max_iou, max_index = iou.max(0)
        coo_response_index.append(i + max_index.item())
        for bb in range(self.B):
            if bb != max_index:
                coo_not_response_index.append(i + bb)
        boxes_target_iou.append(max_iou)

    # 1. Confidence loss for positive samples, in two parts.
    #    Each cell predicts several bboxes but only the one with the highest
    #    IoU is "responsible"; its confidence target is IoU * 1, while all
    #    other bboxes get a target of 0.
    box_pred_response = box_pred[coo_response_index]
    box_pred_not_response = box_pred[coo_not_response_index]
    contain_loss = self.mse(
        box_pred_response[:, 4],
        torch.tensor(boxes_target_iou).to(device),
    )
    not_contain_loss = self.mse(
        box_pred_not_response[:, 4],
        torch.zeros_like(box_pred_not_response[:, 4], device=device),
    )

    # 2. Localization loss, computed only on the responsible bbox
    #    (the one of the B bboxes with the highest IoU against the gt box)
    box_target_response = box_target[coo_response_index]
    loc_loss = self.mse(
        box_pred_response[:, :2],
        box_target_response[:, :2],
    ) + self.mse(
        box_pred_response[:, 2:4].sqrt(),
        box_target_response[:, 2:4].sqrt(),
    )

    # 3. Classification loss, computed over the cells that contain an object
    class_loss = self.mse(class_pred, class_target)

    # ----- sum all losses and divide by the batch size -----
    # Why is contain_loss weighted by 2? (open question in the original)
    all_loss = (self.l_coord * loc_loss + 2 * contain_loss + not_contain_loss +
                self.l_noobj * noo_loss + class_loss) / N
    return all_loss
def get_mosaic(self, n, cross_x, cross_y, tensor_img, boxes):
    t_height = tensor_img.shape[1]
    t_width = tensor_img.shape[2]
    xyxy_bboxes = utils.xywh2xyxy(boxes[:, 1:])

    relative_cross_x = cross_x / self.img_size
    relative_cross_y = cross_y / self.img_size

    # CALCULATING TARGET WIDTH AND HEIGHT OF PICTURE
    if n == 0:
        width_of_nth_pic = cross_x
        height_of_nth_pic = cross_y
    elif n == 1:
        width_of_nth_pic = self.img_size - cross_x
        height_of_nth_pic = cross_y
    elif n == 2:
        width_of_nth_pic = cross_x
        height_of_nth_pic = self.img_size - cross_y
    elif n == 3:
        width_of_nth_pic = self.img_size - cross_x
        height_of_nth_pic = self.img_size - cross_y

    # CHOOSING TOP LEFT CORNER (offset so bboxes keep more than a few pixels)
    cut_x1 = random.randint(0, int(t_width * 0.33))
    cut_y1 = random.randint(0, int(t_height * 0.33))

    # Decide which axis to randomly enlarge by comparing the ratios;
    # cross_x is essentially the width of the top-left picture
    if (t_width - cut_x1) / width_of_nth_pic < (t_height - cut_y1) / height_of_nth_pic:
        cut_x2 = random.randint(cut_x1 + int(t_width * 0.67), t_width)
        cut_y2 = int(cut_y1 + (cut_x2 - cut_x1) / width_of_nth_pic * height_of_nth_pic)
    else:
        cut_y2 = random.randint(cut_y1 + int(t_height * 0.67), t_height)
        cut_x2 = int(cut_x1 + (cut_y2 - cut_y1) / height_of_nth_pic * width_of_nth_pic)

    # RESIZING AND INSERTING (2D interpolation wants 4 dims, so add one with None and squeeze it away)
    tensor_img = F.interpolate(tensor_img[:, cut_y1:cut_y2, cut_x1:cut_x2][None],
                               (height_of_nth_pic, width_of_nth_pic)).squeeze()

    # BBOX
    relative_cut_x1 = cut_x1 / t_width
    relative_cut_y1 = cut_y1 / t_height
    relative_cropped_width = (cut_x2 - cut_x1) / t_width
    relative_cropped_height = (cut_y2 - cut_y1) / t_height

    # SHIFTING TO THE CROPPED IMAGE SO X1, Y1 BECOME 0
    xyxy_bboxes[:, 0] = xyxy_bboxes[:, 0] - relative_cut_x1
    xyxy_bboxes[:, 1] = xyxy_bboxes[:, 1] - relative_cut_y1
    xyxy_bboxes[:, 2] = xyxy_bboxes[:, 2] - relative_cut_x1
    xyxy_bboxes[:, 3] = xyxy_bboxes[:, 3] - relative_cut_y1

    # RESIZING TO THE CROPPED IMAGE SO X2 BECOMES 1
    xyxy_bboxes[:, 0] /= relative_cropped_width
    xyxy_bboxes[:, 1] /= relative_cropped_height
    xyxy_bboxes[:, 2] /= relative_cropped_width
    xyxy_bboxes[:, 3] /= relative_cropped_height

    # CLAMPING BOUNDING BOXES SO THEY DO NOT REACH OUTSIDE THE IMAGE
    xyxy_bboxes[:, 0].clamp_(0, 1)
    xyxy_bboxes[:, 1].clamp_(0, 1)
    xyxy_bboxes[:, 2].clamp_(0, 1)
    xyxy_bboxes[:, 3].clamp_(0, 1)

    # FILTER TO THROW OUT ALL SMALL BBOXES
    filter_minbbox = (xyxy_bboxes[:, 2] - xyxy_bboxes[:, 0] > self.bbox_minsize) & \
                     (xyxy_bboxes[:, 3] - xyxy_bboxes[:, 1] > self.bbox_minsize)

    # RESIZING TO MOSAIC
    if n == 0:
        xyxy_bboxes[:, 0] *= relative_cross_x
        xyxy_bboxes[:, 1] *= relative_cross_y
        xyxy_bboxes[:, 2] *= relative_cross_x
        xyxy_bboxes[:, 3] *= relative_cross_y
    elif n == 1:
        xyxy_bboxes[:, 0] *= (1 - relative_cross_x)
        xyxy_bboxes[:, 1] *= relative_cross_y
        xyxy_bboxes[:, 2] *= (1 - relative_cross_x)
        xyxy_bboxes[:, 3] *= relative_cross_y
    elif n == 2:
        xyxy_bboxes[:, 0] *= relative_cross_x
        xyxy_bboxes[:, 1] *= (1 - relative_cross_y)
        xyxy_bboxes[:, 2] *= relative_cross_x
        xyxy_bboxes[:, 3] *= (1 - relative_cross_y)
    elif n == 3:
        xyxy_bboxes[:, 0] *= (1 - relative_cross_x)
        xyxy_bboxes[:, 1] *= (1 - relative_cross_y)
        xyxy_bboxes[:, 2] *= (1 - relative_cross_x)
        xyxy_bboxes[:, 3] *= (1 - relative_cross_y)

    # SHIFTING TO MOSAIC POSITION (the second block labelled "RESIZING TO
    # MOSAIC" in the original actually translates boxes into their quadrant)
    if n == 0:
        pass  # top-left fragment needs no shift
    elif n == 1:
        xyxy_bboxes[:, 0] += relative_cross_x
        xyxy_bboxes[:, 2] += relative_cross_x
    elif n == 2:
        xyxy_bboxes[:, 1] += relative_cross_y
        xyxy_bboxes[:, 3] += relative_cross_y
    elif n == 3:
        xyxy_bboxes[:, 0] += relative_cross_x
        xyxy_bboxes[:, 1] += relative_cross_y
        xyxy_bboxes[:, 2] += relative_cross_x
        xyxy_bboxes[:, 3] += relative_cross_y

    boxes = boxes[filter_minbbox]
    boxes[:, 1:] = utils.xyxy2xywh(xyxy_bboxes)[filter_minbbox]
    return tensor_img, boxes
def forward(self, x, y_true=None):
    """
    Transform feature map into a 2-D tensor. The transformation includes
    1. Re-organize tensor to make each row correspond to a bbox
    2. Transform center coordinates
        bx = sigmoid(tx) + cx
        by = sigmoid(ty) + cy
    3. Transform width and height
        bw = pw * exp(tw)
        bh = ph * exp(th)
    4. Activation

    @Args
        x: (Tensor) feature map with size [bs, (5+nC)*nA, gs, gs]
            5 => [4 offsets (xc, yc, w, h), objectness]

    @Returns
        detections: (Tensor) feature map with size [bs, nA, gs, gs, 5+nC]
    """
    bs, _, gs, _ = x.size()
    stride = self.reso // gs  # no pooling used, stride is the only downsample
    num_attrs = 5 + self.num_classes  # tx, ty, tw, th, p0
    nA = len(self.anchors)
    scaled_anchors = torch.Tensor([(a_w / stride, a_h / stride)
                                   for a_w, a_h in self.anchors]).cuda()

    # Re-organize [bs, (5+nC)*nA, gs, gs] => [bs, nA, gs, gs, 5+nC]
    x = x.view(bs, nA, num_attrs, gs, gs).permute(0, 1, 3, 4, 2).contiguous()

    pred = torch.Tensor(bs, nA, gs, gs, num_attrs).cuda()
    pred_tx = torch.sigmoid(x[..., 0]).cuda()
    pred_ty = torch.sigmoid(x[..., 1]).cuda()
    pred_tw = x[..., 2].cuda()
    pred_th = x[..., 3].cuda()
    pred_conf = torch.sigmoid(x[..., 4]).cuda()
    if self.training:
        pred_cls = x[..., 5:].cuda()  # softmax is applied inside cross entropy
    else:
        pred_cls = F.softmax(x[..., 5:], dim=-1).cuda()  # class

    grid_x = torch.arange(gs).repeat(gs, 1).view([1, 1, gs, gs]).float().cuda()
    grid_y = torch.arange(gs).repeat(gs, 1).t().view([1, 1, gs, gs]).float().cuda()
    anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
    anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))

    pred[..., 0] = pred_tx + grid_x
    pred[..., 1] = pred_ty + grid_y
    pred[..., 2] = torch.exp(pred_tw) * anchor_w
    pred[..., 3] = torch.exp(pred_th) * anchor_h
    pred[..., 4] = pred_conf
    pred[..., 5:] = pred_cls

    if not self.training:
        pred[..., :4] *= stride
        return pred.view(bs, -1, num_attrs)
    else:
        gt_tx = torch.zeros(bs, nA, gs, gs, requires_grad=False).cuda()
        gt_ty = torch.zeros(bs, nA, gs, gs, requires_grad=False).cuda()
        gt_tw = torch.zeros(bs, nA, gs, gs, requires_grad=False).cuda()
        gt_th = torch.zeros(bs, nA, gs, gs, requires_grad=False).cuda()
        gt_conf = torch.zeros(bs, nA, gs, gs, requires_grad=False).cuda()
        gt_cls = torch.zeros(bs, nA, gs, gs, requires_grad=False).cuda()
        obj_mask = torch.zeros(bs, nA, gs, gs, requires_grad=False).cuda()

        for idx in range(bs):
            for y_true_one in y_true[idx]:
                y_true_one = y_true_one.cuda()
                gt_bbox = y_true_one[:4] * gs
                gt_cls_label = int(y_true_one[4])

                gt_xc, gt_yc, gt_w, gt_h = gt_bbox[0:4]
                gt_i = gt_xc.long().cuda()
                gt_j = gt_yc.long().cuda()

                # Pick the anchor at the gt cell with the best IoU
                pred_bbox = pred[idx, :, gt_j, gt_i, :4]
                ious = IoU(xywh2xyxy(pred_bbox), xywh2xyxy(gt_bbox))
                best_iou, best_a = torch.max(ious, 0)

                w, h = scaled_anchors[best_a]
                gt_tw[idx, best_a, gt_j, gt_i] = torch.log(gt_w / w)
                gt_th[idx, best_a, gt_j, gt_i] = torch.log(gt_h / h)
                gt_tx[idx, best_a, gt_j, gt_i] = gt_xc - gt_i.float()
                gt_ty[idx, best_a, gt_j, gt_i] = gt_yc - gt_j.float()

                gt_conf[idx, best_a, gt_j, gt_i] = best_iou
                gt_cls[idx, best_a, gt_j, gt_i] = gt_cls_label
                obj_mask[idx, best_a, gt_j, gt_i] = 1

        MSELoss = nn.MSELoss(reduction='sum')
        BCELoss = nn.BCELoss(reduction='sum')
        CELoss = nn.CrossEntropyLoss(reduction='sum')

        loss = dict()
        loss['x'] = MSELoss(pred_tx * obj_mask, gt_tx * obj_mask)
        loss['y'] = MSELoss(pred_ty * obj_mask, gt_ty * obj_mask)
        loss['w'] = MSELoss(pred_tw * obj_mask, gt_tw * obj_mask)
        loss['h'] = MSELoss(pred_th * obj_mask, gt_th * obj_mask)
        # loss['cls'] = BCELoss(pred_cls * obj_mask, cls_mask * obj_mask)
        loss['cls'] = CELoss((pred_cls * obj_mask.unsqueeze(-1)).view(-1, self.num_classes),
                             (gt_cls * obj_mask).view(-1).long())
        # The no-object term originally compared pred_conf against itself,
        # which is identically zero; the target is presumably gt_conf, which
        # is zero off-object
        loss['conf'] = MSELoss(pred_conf * obj_mask * 5, gt_conf * obj_mask * 5) + \
                       MSELoss(pred_conf * (1 - obj_mask), gt_conf * (1 - obj_mask))
        pprint(loss)
        return loss
print('\n---- Evaluating Model ----')

# Evaluate the model on the validation set
model.eval()
labels = []
sample_metrics = []  # List of tuples (TP, confs, pred)
for ind, (imgs, targets) in enumerate(val_loader):
    imgs = imgs.to(device)
    targets = targets.to(device)

    # Extract labels
    labels += targets[:, 1].tolist()
    # Rescale target
    targets[:, 2:] = utils.xywh2xyxy(targets[:, 2:])
    targets[:, 2:] *= args.img_size

    with torch.no_grad():
        outputs, _ = model(imgs)
        outputs = utils.non_max_suppression(outputs,
                                            conf_thresh=args.conf_thresh,
                                            nms_thresh=args.nms_thresh)

    sample_metrics += utils.get_batch_statistics(outputs, targets,
                                                 iou_thresh=args.map_thresh)

if len(sample_metrics) == 0:
    print('---- mAP is NULL')
else:
    # The source snippet ends here; a typical continuation concatenates the
    # statistics and computes AP per class, as in the evaluate() above
    true_positives, pred_scores, pred_labels = [np.concatenate(x, 0)
                                                for x in list(zip(*sample_metrics))]
    precision, recall, AP, f1, ap_class = utils.ap_per_class(
        true_positives, pred_scores, pred_labels, labels)
    print('---- mAP %.4f' % AP.mean())
def __getitem__(self, index):
    img_path = self.img_files[index % len(self.img_files)].rstrip()
    label_path = self.label_files[index % len(self.img_files)].rstrip()

    # Getting image
    img = Image.open(img_path).convert('RGB')
    width, height = img.size
    if os.path.exists(label_path):
        boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5))

    # RESIZING
    if width > height:
        ratio = height / width
        t_width = self.img_size
        t_height = int(ratio * self.img_size)
    else:
        ratio = width / height
        t_width = int(ratio * self.img_size)
        t_height = self.img_size
    img = transforms.functional.resize(img, (t_height, t_width))

    # IF TRAIN, APPLY BRIGHTNESS / CONTRAST / HUE / SATURATION
    if self.train:
        brightness_rnd = random.uniform(1 - self.brightness_range, 1 + self.brightness_range)
        contrast_rnd = random.uniform(1 - self.contrast_range, 1 + self.contrast_range)
        hue_rnd = random.uniform(-self.hue_range, self.hue_range)
        saturation_rnd = random.uniform(1 - self.saturation_range, 1 + self.saturation_range)

        img = transforms.functional.adjust_brightness(img, brightness_rnd)
        img = transforms.functional.adjust_contrast(img, contrast_rnd)
        img = transforms.functional.adjust_hue(img, hue_rnd)
        img = transforms.functional.adjust_saturation(img, saturation_rnd)

    # CONVERTING TO TENSOR
    tensor_img = transforms.functional.to_tensor(img)

    # Handle grayscale images (to_tensor always returns C x H x W, so check
    # the channel count; the original checked the number of dims and expanded
    # with the PIL image's nonexistent .shape)
    if tensor_img.shape[0] != 3:
        tensor_img = tensor_img.expand(3, *tensor_img.shape[1:])

    # !!! WARNING: IN PIL IT'S WIDTH x HEIGHT, WHILE IN PYTORCH IT IS HEIGHT x WIDTH

    # Apply augmentations; for train it is mosaic
    if self.train:
        mosaic_img = torch.zeros(3, self.img_size, self.img_size)

        # FINDING CROSS POINT
        cross_x = int(random.uniform(self.img_size * self.cross_offset,
                                     self.img_size * (1 - self.cross_offset)))
        cross_y = int(random.uniform(self.img_size * self.cross_offset,
                                     self.img_size * (1 - self.cross_offset)))

        fragment_img, fragment_bbox = self.get_mosaic(0, cross_x, cross_y, tensor_img, boxes)
        mosaic_img[:, 0:cross_y, 0:cross_x] = fragment_img
        boxes = fragment_bbox
        for n in range(1, 4):
            raw_fragment_img, raw_fragment_bbox = self.get_img_for_mosaic(
                brightness_rnd, contrast_rnd, hue_rnd, saturation_rnd)
            fragment_img, fragment_bbox = self.get_mosaic(
                n, cross_x, cross_y, raw_fragment_img, raw_fragment_bbox)
            boxes = torch.cat([boxes, fragment_bbox])
            if n == 1:
                mosaic_img[:, 0:cross_y, cross_x:self.img_size] = fragment_img
            elif n == 2:
                mosaic_img[:, cross_y:self.img_size, 0:cross_x] = fragment_img
            elif n == 3:
                mosaic_img[:, cross_y:self.img_size, cross_x:self.img_size] = fragment_img

        # Set mosaic as the returned tensor
        tensor_img = mosaic_img
    # For validation it is letterbox
    else:
        xyxy_bboxes = utils.xywh2xyxy(boxes[:, 1:])

        # IMG
        padding = abs(t_width - t_height) // 2
        padded_img = torch.zeros(3, self.img_size, self.img_size)
        if t_width > t_height:
            padded_img[:, padding:padding + t_height] = tensor_img
        else:
            padded_img[:, :, padding:padding + t_width] = tensor_img
        tensor_img = padded_img

        relative_padding = padding / self.img_size

        # BOXES
        if t_width > t_height:
            # Change y's relative position
            xyxy_bboxes[:, 1] *= ratio
            xyxy_bboxes[:, 3] *= ratio
            xyxy_bboxes[:, 1] += relative_padding
            xyxy_bboxes[:, 3] += relative_padding
        else:
            # x's
            xyxy_bboxes[:, 0] *= ratio
            xyxy_bboxes[:, 2] *= ratio
            xyxy_bboxes[:, 0] += relative_padding
            xyxy_bboxes[:, 2] += relative_padding
        boxes[:, 1:] = utils.xyxy2xywh(xyxy_bboxes)

    targets = torch.zeros((len(boxes), 6))
    targets[:, 1:] = boxes
    return img_path, tensor_img, targets
def process_bboxes(self, predictions, image_info, confidence_threshold=0.01,
                   overlap_threshold=0.5, nms=True):
    image_idx_ = []
    bboxes_ = []
    classes_ = []
    conf_ = []
    for i, predictions_ in enumerate(predictions):
        if i not in [0, 1, 2]:  # use this for specifying only a subset of detectors
            continue
        predictions_ = predictions_.permute(0, 2, 3, 1)
        for j, prediction in enumerate(predictions_):
            prediction = prediction.contiguous().view(-1, self.num_features)
            prediction[:, 5:] = F.softmax(prediction[:, 5:], dim=-1)
            classes = torch.argmax(prediction[:, 5:], dim=-1)
            idx = torch.arange(0, len(prediction))
            confidence = prediction[:, 4] * prediction[idx, 5 + classes]
            mask = confidence > confidence_threshold
            if mask.sum() == 0:
                continue
            bboxes = prediction[mask, :4].clone()
            bboxes[:, ::2] *= self.strides[i]
            bboxes[:, 1::2] *= self.strides[i]
            bboxes = xywh2xyxy(bboxes)
            confidence = confidence[mask]
            classes = classes[mask]
            # Clamp boxes to the unpadded image region
            bboxes[:, ::2] = torch.clamp(bboxes[:, ::2],
                                         min=image_info['padding'][0][j] + 1,
                                         max=self.image_size[0] - image_info['padding'][2][j])
            bboxes[:, 1::2] = torch.clamp(bboxes[:, 1::2],
                                          min=image_info['padding'][1][j] + 1,
                                          max=self.image_size[1] - image_info['padding'][3][j])
            image_idx_.append(j)
            bboxes_.append(bboxes)
            classes_.append(classes)
            conf_.append(confidence)

    # Group detections by image index
    bboxes_ = [torch.cat([bboxes_[ii] for ii, k in enumerate(image_idx_) if k == idx])
               for idx in np.unique(image_idx_)]
    classes_ = [torch.cat([classes_[ii] for ii, k in enumerate(image_idx_) if k == idx])
                for idx in np.unique(image_idx_)]
    conf_ = [torch.cat([conf_[ii] for ii, k in enumerate(image_idx_) if k == idx])
             for idx in np.unique(image_idx_)]

    image_idx = []
    bboxes = []
    confidence = []
    classes = []
    for i, idx in enumerate(np.unique(image_idx_)):
        if nms:
            # Class-wise NMS
            cls = torch.unique(classes_[i])
            for c in cls:
                cls_mask = (classes_[i] == c).nonzero().flatten()
                mask = non_maximum_suppression(bboxes_[i][cls_mask],
                                               conf_[i][cls_mask],
                                               overlap=overlap_threshold)
                bboxes.append(bboxes_[i][cls_mask][mask])
                classes.append(classes_[i][cls_mask][mask])
                confidence.append(conf_[i][cls_mask][mask])
                image_idx.append([image_info['id'][idx]] * len(bboxes_[i][cls_mask][mask]))
        else:
            bboxes.append(bboxes_[i])
            confidence.append(conf_[i])
            classes.append(classes_[i])
            image_idx.append([image_info['id'][idx]] * len(bboxes_[i]))
    if len(bboxes) > 0:
        bboxes = torch.cat(bboxes).view(-1, 4)
        classes = torch.cat(classes).flatten()
        confidence = torch.cat(confidence).flatten()
        image_idx = [item for sublist in image_idx for item in sublist]
        return bboxes, classes, confidence, image_idx
    else:
        return torch.tensor([], device=self.device), \
               torch.tensor([], dtype=torch.long, device=self.device), \
               torch.tensor([], device=self.device), \
               []