def track_vot(model, video, hp=None, mask_enable=False, refine_enable=False, device='cpu'):
    regions = []  # result and states [1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']
    start_frame, end_frame, lost_times, toc = 0, len(image_files), 0, 0

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, model, hp, device)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, mask_enable, refine_enable, device, args.debug)  # track
            if mask_enable:
                location = state['ploygon'].flatten()
                mask = state['mask']
            else:
                location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                mask = []
            if 'VOT' in args.dataset:
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                if mask_enable:
                    pred_polygon = ((location[0], location[1]), (location[2], location[3]),
                                    (location[4], location[5]), (location[6], location[7]))
                else:
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2], location[1]),
                                    (location[0] + location[2], location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                b_overlap = vot_overlap(gt_polygon, pred_polygon, (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1
            if b_overlap:
                regions.append(location)
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frames)
            im_show = im.copy()
            if f == 0:
                cv2.destroyAllWindows()
            if gt.shape[0] > f:
                if len(gt[f]) == 8:
                    cv2.polylines(im_show, [np.array(gt[f], int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]), (0, 255, 0), 3)
            if len(location) == 8:
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :, 2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.int0(location)
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))], True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(im_show, (location[0], location[1]),
                              (location[0] + location[2], location[1] + location[3]),
                              (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show, str(state['score']) if 'score' in state else '', (40, 120),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # save result: the folder name encodes arch, mask, refine and resume info
    name = args.arch.split('.')[0] + '_' + ('mask_' if mask_enable else '') + \
           ('refine_' if refine_enable else '') + args.resume.split('/')[-1].split('.')[0]
    if 'VOT' in args.dataset:
        video_path = join('test', args.dataset, name, 'baseline', video['name'])
        if not isdir(video_path):
            makedirs(video_path)
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        with open(result_path, 'w') as fout:
            for x in regions:
                if isinstance(x, int):  # state marker (1 init / 2 lost / 0 skip)
                    fout.write('{:d}\n'.format(x))
                else:  # polygon / rectangle line
                    fout.write(','.join([vot_float2str('%.4f', i) for i in x]) + '\n')
    else:  # OTB
        video_path = join('test', args.dataset, name)
        if not isdir(video_path):
            makedirs(video_path)
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        with open(result_path, 'w') as fout:
            for x in regions:
                fout.write(','.join([str(i) for i in x]) + '\n')
    logger.info('({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        v_id, video['name'], toc, f / toc, lost_times))
    return lost_times, f / toc
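
# Worked sketch of the pred_polygon construction above: an axis-aligned
# rectangle (x, y, w, h) is expanded into its four corners, clockwise from the
# top-left, which is the point format vot_overlap expects for both polygons.
# The helper name is ours, for illustration only.
def rect_to_polygon(x, y, w, h):
    return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))

assert rect_to_polygon(10, 20, 50, 40) == ((10, 20), (60, 20), (60, 60), (10, 60))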
def tune(param):
    regions = []  # result and states [1 init / 2 lost / 0 skip]

    # save result
    benchmark_result_path = join('result', param['dataset'])
    tracker_path = join(benchmark_result_path,
                        (param['network_name'] +
                         '_r{}'.format(param['hp']['instance_size']) +
                         '_penalty_k_{:.3f}'.format(param['hp']['penalty_k']) +
                         '_window_influence_{:.3f}'.format(param['hp']['window_influence']) +
                         '_lr_{:.3f}'.format(param['hp']['lr'])).replace('.', '_'))  # no '.' in path
    if param['dataset'].startswith('VOT'):
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, param['video'])
        result_path = join(video_path, param['video'] + '_001.txt')
    elif param['dataset'].startswith('OTB') or param['dataset'].startswith('DAVIS'):
        video_path = tracker_path
        result_path = join(video_path, param['video'] + '.txt')
    # (assumes the dataset name starts with VOT, OTB or DAVIS)
    if isfile(result_path):
        return
    try:
        if not isdir(video_path):
            makedirs(video_path)
    except OSError as err:
        print(err)
    with open(result_path, 'w') as f:  # occupy the result file so parallel workers skip it
        f.write('Occ')

    global ims, gt, image_files
    if ims is None:
        print(param['video'] + ': loading images once, on first use')
        ims = [cv2.imread(x) for x in image_files]

    start_frame, lost_times, toc = 0, 0, 0
    for f, im in enumerate(ims):
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, param['network'], param['hp'],
                                 device=device)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            if param['dataset'].startswith('VOT'):
                regions.append(1)
            elif param['dataset'].startswith('OTB') or param['dataset'].startswith('DAVIS'):
                regions.append(gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, args.mask, args.refine, device=device)
            if args.mask:
                location = state['ploygon'].flatten()
            else:
                location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            if param['dataset'].startswith('VOT'):
                if 'VOT' in args.dataset:  # redundant re-check kept from the original
                    gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                                  (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                    if args.mask:
                        pred_polygon = ((location[0], location[1]), (location[2], location[3]),
                                        (location[4], location[5]), (location[6], location[7]))
                    else:
                        pred_polygon = ((location[0], location[1]),
                                        (location[0] + location[2], location[1]),
                                        (location[0] + location[2], location[1] + location[3]),
                                        (location[0], location[1] + location[3]))
                    b_overlap = vot_overlap(gt_polygon, pred_polygon, (im.shape[1], im.shape[0]))
                else:
                    b_overlap = 1
                if b_overlap:  # continue to track
                    regions.append(location)
                else:  # lost
                    regions.append(2)
                    lost_times += 1
                    start_frame = f + 5  # skip 5 frames
            else:
                regions.append(location)
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frames)
            if f == 0:
                cv2.destroyAllWindows()
            if len(gt[f]) == 8:
                cv2.polylines(im, [np.array(gt[f], int).reshape((-1, 1, 2))], True, (0, 255, 0), 3)
            else:
                cv2.rectangle(im, (gt[f, 0], gt[f, 1]),
                              (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]), (0, 255, 0), 3)
            if len(location) == 8:
                location = np.int0(location)
                cv2.polylines(im, [location.reshape((-1, 1, 2))], True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]  # OpenCV needs plain ints
                cv2.rectangle(im, (location[0], location[1]),
                              (location[0] + location[2], location[1] + location[3]),
                              (0, 255, 255), 3)
            cv2.putText(im, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)  # frame id
            cv2.putText(im, str(lost_times), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)  # lost count
            cv2.imshow(param['video'], im)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        param['video'], toc, f / toc, lost_times))
    with open(result_path, 'w') as fout:
        for x in regions:
            if isinstance(x, int):
                fout.write('{:d}\n'.format(x))
            else:
                fout.write(','.join([vot_float2str('%.4f', i) for i in x]) + '\n')
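
# Illustrative example of the folder name tune() builds: the hyper-parameters
# are baked into the path and dots are replaced with underscores, so each
# (penalty_k, window_influence, lr) combination gets its own result folder.
# The network name and values below are assumptions, shown only for the format.
hp = {'instance_size': 255, 'penalty_k': 0.04, 'window_influence': 0.4, 'lr': 1.0}
name = ('SiamMask' + '_r{}'.format(hp['instance_size']) +
        '_penalty_k_{:.3f}'.format(hp['penalty_k']) +
        '_window_influence_{:.3f}'.format(hp['window_influence']) +
        '_lr_{:.3f}'.format(hp['lr'])).replace('.', '_')
# -> 'SiamMask_r255_penalty_k_0_040_window_influence_0_400_lr_1_000'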
def siamese_track(state, im, mask_enable=False, refine_enable=False, device='cpu', debug=False):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    wc_x = target_sz[1] + p.context_amount * sum(target_sz)
    hc_x = target_sz[0] + p.context_amount * sum(target_sz)
    s_x = np.sqrt(wc_x * hc_x)
    scale_x = p.exemplar_size / s_x
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad
    crop_box = [target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
                round(s_x), round(s_x)]

    if debug:
        im_debug = im.copy()
        crop_box_int = np.int0(crop_box)
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2], crop_box_int[1] + crop_box_int[3]),
                      (255, 0, 0), 2)
        cv2.imshow('search area', im_debug)
        cv2.waitKey(0)

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                                             avg_chans).unsqueeze(0))

    if mask_enable:
        score, delta, mask = net.track_mask(x_crop.to(device))
    else:
        score, delta = net.track(x_crop.to(device))

    delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
    score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()

    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty

    penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
    pscore = penalty * score

    # cos window (motion model)
    pscore = pscore * (1 - p.window_influence) + window * p.window_influence
    best_pscore_id = np.argmax(pscore)

    pred_in_crop = delta[:, best_pscore_id] / scale_x
    lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB

    res_x = pred_in_crop[0] + target_pos[0]
    res_y = pred_in_crop[1] + target_pos[1]
    res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr

    target_pos = np.array([res_x, res_y])
    target_sz = np.array([res_w, res_h])

    # for Mask Branch
    if mask_enable:
        best_pscore_id_mask = np.unravel_index(best_pscore_id, (5, p.score_size, p.score_size))
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

        if refine_enable:
            mask = net.track_refine((delta_y, delta_x)).to(device).sigmoid().squeeze().view(
                p.out_size, p.out_size).cpu().data.numpy()
        else:
            mask = mask[0, :, delta_y, delta_x].sigmoid().\
                squeeze().view(p.out_size, p.out_size).cpu().data.numpy()

        def crop_back(image, bbox, out_sz, padding=-1):
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c],
                                [0, b, d]]).astype(float)
            crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop

        s = crop_box[2] / p.instance_size
        sub_box = [crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
                   crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
                   s * p.exemplar_size, s * p.exemplar_size]
        s = p.out_size / sub_box[2]
        back_box = [-sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s, state['im_h'] * s]
        mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))

        target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
        # findContours returns (contours, hierarchy) in OpenCV 4 and
        # (image, contours, hierarchy) in OpenCV 3
        if cv2.__version__[-5] == '4':
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        else:
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        if len(contours) != 0 and np.max(cnt_area) > 100:
            contour = contours[np.argmax(cnt_area)]  # use max area polygon
            polygon = contour.reshape(-1, 2)
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle
            # box_in_img = pbox
            rbox_in_img = prbox
        else:  # empty mask
            location = cxy_wh_2_rect(target_pos, target_sz)
            rbox_in_img = np.array([[location[0], location[1]],
                                    [location[0] + location[2], location[1]],
                                    [location[0] + location[2], location[1] + location[3]],
                                    [location[0], location[1] + location[3]]])

    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))

    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state
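
# A worked toy example (assumed values) of the size/ratio penalty above:
# candidates whose equivalent side length or aspect ratio drift away from the
# previous target are down-weighted exponentially by penalty_k.
import numpy as np

def _change(r):
    return np.maximum(r, 1. / r)

def _sz(w, h):
    pad = (w + h) * 0.5
    return np.sqrt((w + pad) * (h + pad))

prev_w, prev_h = 40.0, 20.0   # previous target size, in crop scale
cand_w, cand_h = 60.0, 20.0   # one candidate box from the RPN
s_c = _change(_sz(cand_w, cand_h) / _sz(prev_w, prev_h))   # scale drift >= 1 (~1.31)
r_c = _change((prev_w / prev_h) / (cand_w / cand_h))       # ratio drift >= 1 (1.5)
penalty = np.exp(-(r_c * s_c - 1) * 0.04)                  # penalty_k = 0.04
print(penalty)  # ~0.96: this candidate's score is shrunk before the argmax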
def tune(param):
    regions = []  # result and states [1 init / 2 lost / 0 skip]

    # save result
    benchmark_result_path = join('result', param['dataset'])
    tracker_path = join(benchmark_result_path,
                        (param['network_name'] +
                         ('_refine' if args.refine else '') +
                         '_r{}'.format(param['hp']['instance_size']) +
                         '_penalty_k_{:.3f}'.format(param['hp']['penalty_k']) +
                         '_window_influence_{:.3f}'.format(param['hp']['window_influence']) +
                         '_lr_{:.3f}'.format(param['hp']['lr'])).replace('.', '_'))  # no '.' in path
    video_path = tracker_path
    result_path = join(video_path, param['video'] + '.txt')
    if isfile(result_path):
        return
    try:
        if not isdir(video_path):
            makedirs(video_path)
    except OSError as err:
        print(err)
    with open(result_path, 'w') as f:  # occupy the result file so parallel workers skip it
        f.write('Occ')

    global ims, gt, annos, image_files, anno_files
    if ims is None:
        print(param['video'] + ': loading images once, on first use')
        ims = [cv2.imread(x) for x in image_files]
        annos = [np.array(Image.open(x)) for x in anno_files]

    iou = IouMeter(thrs, len(ims) - 2)
    start_frame, end_frame, toc = 0, len(ims) - 1, 0
    for f, (im, anno) in enumerate(zip(ims, annos)):
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            target_pos = np.array([gt[f, 0] + gt[f, 2] / 2, gt[f, 1] + gt[f, 3] / 2])
            target_sz = np.array([gt[f, 2], gt[f, 3]])
            state = siamese_init(im, target_pos, target_sz, param['network'], param['hp'])  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, args.mask, args.refine)  # track
            location = state['ploygon'].flatten()
            mask = state['mask']
            regions.append(location)
        if start_frame < f < end_frame:
            iou.add(mask, anno)
        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frames)
            im_show = im.copy()
            if f == 0:
                cv2.destroyAllWindows()
            if len(gt[f]) == 8:
                cv2.polylines(im_show, [np.array(gt[f], int).reshape((-1, 1, 2))],
                              True, (0, 255, 0), 3)
            else:
                cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                              (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]), (0, 255, 0), 3)
            if len(location) == 8:
                im_show[:, :, 2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                cv2.polylines(im_show, [np.int0(location).reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]  # OpenCV needs plain ints
                cv2.rectangle(im_show, (location[0], location[1]),
                              (location[0] + location[2], location[1] + location[3]),
                              (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)  # frame id
            cv2.imshow(param['video'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    iou_list = iou.value('mean')
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps IOU: {:.3f}'.format(
        param['video'], toc, f / toc, iou_list.max()))
    with open(result_path, 'w') as fout:
        fout.write(','.join(['%.5f' % i for i in iou_list]) + '\n')
    return iou_list
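
# IouMeter and thrs come from the surrounding tuning script. As a rough sketch
# of the metric it accumulates (an assumption about its internals, shown only
# to make the number concrete): per-frame mask IoU at one threshold.
import numpy as np

def mask_iou_sketch(pred, anno, thr=0.35):
    p = pred > thr   # binarize the predicted soft mask
    g = anno > 0     # ground-truth annotation mask
    inter = np.logical_and(p, g).sum()
    union = np.logical_or(p, g).sum()
    return inter / union if union > 0 else 0.0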
def siamese_track(state, im):
    refine_enable = True
    mask_enable = True
    device = 'cpu'
    debug = True

    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    detector = state["detector"]
    custom_objects = detector.CustomObjects(car=True, person=True)
    targets = state["targets"]
    zf_lists = []

    # s_z = [round(np.sqrt(target["target_sz"][1] + 0.123 * sum(target["target_sz"]) *
    #              target["target_sz"][0] + 0.123 * sum(target["target_sz"])))
    #        for target in targets]
    # s_z = np.array(s_z)
    # scale_x = p.exemplar_size / s_z
    # d_search = (p.instance_size - p.exemplar_size) / 2
    BLUE = [255, 255, 255]

    # per-target search-region geometry
    for i, target in enumerate(targets):
        wc_x = target["target_sz"][1] + p.context_amount * sum(target["target_sz"])
        hc_x = target["target_sz"][0] + p.context_amount * sum(target["target_sz"])
        target["s_z"] = np.sqrt(wc_x * hc_x)
        target["scale_x"] = p.exemplar_size / target["s_z"]
        d_search = (p.instance_size - p.exemplar_size) / 2
        pad = d_search / target["scale_x"]
        target["s_z"] = target["s_z"] + 2 * pad
        target["crop_box"] = [target["target_pos"][0] - round(target["s_z"]) / 2,
                              target["target_pos"][1] - round(target["s_z"]) / 2,
                              round(target["s_z"]), round(target["s_z"])]
        zf_lists.append(target["zf"])
        crop_box = target["crop_box"]
        # if debug:
        #     im_debug = im.copy()
        #     crop_box_int = np.int0(crop_box)
        #     cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
        #                   (crop_box_int[0] + crop_box_int[2], crop_box_int[1] + crop_box_int[3]),
        #                   (255, 0, 0), 2)
        #     cv2.imshow('search area', im_debug)
        #     cv2.waitKey(1)

    # extract scaled crops for search region x at previous target position
    targets = get_subwindow_tracking(im, p.instance_size, avg_chans, targets=targets)
    # x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x), avg_chans).unsqueeze(0))

    tracking_data_list = []
    tracking_data = dict()
    for target, zf in zip(targets, zf_lists):
        target["x_crop"] = Variable(target["im_to_torch"].unsqueeze(0))
        target["x_crop"] = target["x_crop"].to(device)
        tracking_data_list.append({"x_crop": target["x_crop"], "zf": zf})

    if mask_enable:
        results = net.track_mask(search=targets[0]["x_crop"], lists=tracking_data_list)
        # score, delta, mask = net.track_mask(search=targets[0]["x_crop"], lists=tracking_data_list)
    else:
        score, delta = net.track(x_crop.to(device))  # single-target leftover, unused in this variant

    # decode every target's RPN output in place
    for result in results:
        delta = result["rpn_pred_loc"]
        score = result["rpn_pred_cls"]
        delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
        score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                          dim=1).data[:, 1].cpu().numpy()
        delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
        delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]
        result["rpn_pred_loc"] = delta
        result["rpn_pred_cls"] = score

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    count = 0
    for target, result in zip(targets, results):
        delta = result["rpn_pred_loc"]
        score = result["rpn_pred_cls"]
        crop_box = target["crop_box"]
        target_sz_in_crop = target["target_sz"] * target["scale_x"]
        s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz_in_crop)))  # scale penalty
        r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty

        penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
        pscore = penalty * score
        pscore = pscore * (1 - p.window_influence) + window * p.window_influence
        best_pscore_id = np.argmax(pscore)

        pred_in_crop = delta[:, best_pscore_id] / target["scale_x"]
        lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB

        res_x = pred_in_crop[0] + target["target_pos"][0]
        res_y = pred_in_crop[1] + target["target_pos"][1]
        res_w = target["target_sz"][0] * (1 - lr) + pred_in_crop[2] * lr
        res_h = target["target_sz"][1] * (1 - lr) + pred_in_crop[3] * lr

        target["target_pos"] = np.array([res_x, res_y])
        target["target_sz"] = np.array([res_w, res_h])

        if mask_enable:
            best_pscore_id_mask = np.unravel_index(best_pscore_id, (5, p.score_size, p.score_size))
            delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

            if refine_enable:
                mask = net.track_refine((delta_y, delta_x), index=count).to(device).sigmoid().squeeze().view(
                    p.out_size, p.out_size).cpu().data.numpy()
            else:
                mask = mask[0, :, delta_y, delta_x].sigmoid().\
                    squeeze().view(p.out_size, p.out_size).cpu().data.numpy()
            count += 1

            def crop_back(image, bbox, out_sz, padding=-1):
                a = (out_sz[0] - 1) / bbox[2]
                b = (out_sz[1] - 1) / bbox[3]
                c = -a * bbox[0]
                d = -b * bbox[1]
                mapping = np.array([[a, 0, c],
                                    [0, b, d]]).astype(float)
                crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                      flags=cv2.INTER_LINEAR,
                                      borderMode=cv2.BORDER_CONSTANT,
                                      borderValue=padding)
                return crop

            s = crop_box[2] / p.instance_size
            sub_box = [crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
                       crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
                       s * p.exemplar_size, s * p.exemplar_size]
            s = p.out_size / sub_box[2]
            back_box = [-sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s, state['im_h'] * s]
            mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))

            target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
            if cv2.__version__[-5] == '4':
                contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            else:
                _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            cnt_area = [cv2.contourArea(cnt) for cnt in contours]
            if len(contours) != 0 and np.max(cnt_area) > 100:
                contour = contours[np.argmax(cnt_area)]  # use max area polygon
                polygon = contour.reshape(-1, 2)
                # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
                prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle
                # box_in_img = pbox
                rbox_in_img = prbox
            else:  # empty mask
                location = cxy_wh_2_rect(target["target_pos"], target["target_sz"])
                rbox_in_img = np.array([[location[0], location[1]],
                                        [location[0] + location[2], location[1]],
                                        [location[0] + location[2], location[1] + location[3]],
                                        [location[0], location[1] + location[3]]])

        target["target_pos"][0] = max(0, min(state['im_w'], target["target_pos"][0]))
        target["target_pos"][1] = max(0, min(state['im_h'], target["target_pos"][1]))
        target["target_sz"][0] = max(10, min(state['im_w'], target["target_sz"][0]))
        target["target_sz"][1] = max(10, min(state['im_h'], target["target_sz"][1]))
        # print("new targetPos {} and targetsize {} \n".format(target["target_pos"], target["target_sz"]))
        target["mask"] = mask_in_img if mask_enable else []
        target['ploygon'] = rbox_in_img if mask_enable else []
        target["score"] = score[best_pscore_id]

    state["targets"] = targets
    # state['target_pos'] = target_pos
    # state['target_sz'] = target_sz
    # state['score'] = score[best_pscore_id]
    # state['mask'] = mask_in_img if mask_enable else []
    # state['ploygon'] = rbox_in_img if mask_enable else []
    return state
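
# The best anchor index above is a flat offset into a
# (5, score_size, score_size) volume; np.unravel_index recovers
# (anchor, row, col), and (col, row) become (delta_x, delta_y) on the 25x25
# score map. Toy check with score_size = 25:
import numpy as np
anchor, row, col = np.unravel_index(1891, (5, 25, 25))
assert (anchor, row, col) == (3, 0, 16)  # 3*625 + 0*25 + 16 == 1891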
def track_vot(model, video, hp=None, mask_enable=False, refine_enable=False, device='cpu'):
    """
    Track the target through a video.
    :param model: trained model
    :param video: video data
    :param hp: hyper-parameters
    :param mask_enable: whether to produce a mask (default False)
    :param refine_enable: whether to use the refine-fused model
    :param device: device to run on
    :return: number of times the target was lost, fps
    """
    # record target boxes and their states
    regions = []  # result and states [1 init / 2 lost / 0 skip]
    # images to process and the ground truth
    image_files, gt = video['image_files'], video['gt']
    # init frame, end frame, lost count, timer
    start_frame, end_frame, lost_times, toc = 0, len(image_files), 0, 0

    # iterate over the frames
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            # target region as center point, width, height
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])  # target position
            target_sz = np.array([w, h])     # target size
            state = siamese_init(im, target_pos, target_sz, model, hp, device)  # init tracker
            # convert the box to (top-left x, top-left y, w, h)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # for VOT append the init marker 1, otherwise append gt[f], the ground-truth box
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking: subsequent frames
            state = siamese_track(state, im, mask_enable, refine_enable, device, args.debug)  # track
            if mask_enable:
                location = state['ploygon'].flatten()  # flatten the polygon result
                mask = state['mask']                   # segmentation mask
            else:
                # convert the box to (top-left x, top-left y, w, h)
                location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                mask = []  # no mask
            # for VOT compute the overlap; other datasets default to 1
            if 'VOT' in args.dataset:
                # ground-truth polygon
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                if mask_enable:
                    # predicted polygon from the mask branch
                    pred_polygon = ((location[0], location[1]), (location[2], location[3]),
                                    (location[4], location[5]), (location[6], location[7]))
                else:
                    # predicted polygon from the axis-aligned box
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2], location[1]),
                                    (location[0] + location[2], location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                # overlap between prediction and ground truth
                b_overlap = vot_overlap(gt_polygon, pred_polygon, (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1
            # if the tracked box overlaps the ground truth, keep the result
            if b_overlap:
                regions.append(location)
            # otherwise record the loss and re-initialize five frames later
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip other frames (e.g. frames before the init frame)
            regions.append(0)
        # accumulate tracking time
        toc += cv2.getTickCount() - tic

        # when visualizing, skip lost frames
        if args.visualization and f >= start_frame:  # visualization (skip lost frames)
            im_show = im.copy()  # work on a copy of the frame
            if f == 0:  # destroy stale windows on the first frame
                cv2.destroyAllWindows()
            # if the annotation contains frame f, draw the ground truth
            if gt.shape[0] > f:
                if len(gt[f]) == 8:
                    cv2.polylines(im_show, [np.array(gt[f], int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]), (0, 255, 0), 3)
            # draw the tracking result
            if len(location) == 8:
                # overlay the mask if enabled
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :, 2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.int0(location)
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))], True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(im_show, (location[0], location[1]),
                              (location[0] + location[2], location[1] + location[3]),
                              (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show, str(state['score']) if 'score' in state else '', (40, 120),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # save result: the folder name encodes arch, mask, refine and resume info
    name = args.arch.split('.')[0] + '_' + ('mask_' if mask_enable else '') + \
           ('refine_' if refine_enable else '') + args.resume.split('/')[-1].split('.')[0]
    # VOT dataset
    if 'VOT' in args.dataset:
        # build the result directory, creating it if missing
        video_path = join('test', args.dataset, name, 'baseline', video['name'])
        if not isdir(video_path):
            makedirs(video_path)
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        # writing the tracking results is disabled in this variant:
        # with open(result_path, "w") as fin:
        #     for x in regions:
        #         fin.write("{:d}\n".format(x)) if isinstance(x, int) else \
        #             fin.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
    # OTB dataset
    else:  # OTB
        video_path = join('test', args.dataset, name)
        if not isdir(video_path):
            makedirs(video_path)
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        # write the tracking results to the text file
        with open(result_path, 'w') as fout:
            for x in regions:
                fout.write(','.join([str(i) for i in x]) + '\n')
    # log the summary
    logger.info('({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        v_id, video['name'], toc, f / toc, lost_times))
    # return lost count and fps
    return lost_times, f / toc
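
# For reference, a minimal sketch of the box conversion this function leans on.
# The real helper lives in the SiamMask utils; this assumes the common
# center/size -> top-left/size definition, so treat it as illustrative only.
import numpy as np

def cxy_wh_2_rect_sketch(pos, sz):
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

# cxy_wh_2_rect_sketch(np.array([50, 40]), np.array([20, 10])) -> [40, 35, 20, 10]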
def siamese_track(state, im, mask_enable=False, refine_enable=False, device='cpu', debug=False):
    global arrendatario
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']
    print(im.shape)

    wc_x = target_sz[1] + p.context_amount * sum(target_sz)
    hc_x = target_sz[0] + p.context_amount * sum(target_sz)
    s_x = np.sqrt(wc_x * hc_x)
    scale_x = p.exemplar_size / s_x  # p.exemplar_size = 127, always the same
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad
    crop_box = [target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
                round(s_x), round(s_x)]

    debug = True  # force debug on in this variant
    if debug:
        im_debug = im.copy()
        crop_box_int = np.int0(crop_box)
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2], crop_box_int[1] + crop_box_int[3]),
                      (255, 255, 0), 2)
        # cv2.imwrite('/data/Ponc/tracking/results/windows-seagulls-debug/' + 'search_' + str(arrendatario) + '.jpeg', im_debug)
        cv2.waitKey(0)

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                                             avg_chans).unsqueeze(0))

    # in DAVIS we have 5 anchors
    if mask_enable:
        score, delta, mask = net.track_mask(x_crop.to(device))
        # score: (1, 10, 25, 25), delta: (1, 20 (5 boxes * 4 coords), 25, 25), mask: (1, 63*63, 25, 25)
    else:
        score, delta = net.track(x_crop.to(device))

    delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
    # softmax over (3125, 2), where the columns are BG, FG
    score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()

    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty

    penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
    pscore = penalty * score

    # cos window (motion model)
    N = 39
    bboxes = np.zeros((6, N), dtype=np.float64)
    # bboxes has shape (6, N): 0 = res_x, 1 = res_y, 2 = res_w, 3 = res_h,
    # 4 = score, 5 = best_pscore_id_tmp
    pscore = pscore * (1 - p.window_influence) + window * p.window_influence

    attmap = score.reshape(5, 25, 25)
    attmap = np.amax(attmap, axis=0)
    # np.save('/data/Ponc/tracking/results/mevasa/' + str(arrendatario) + '.npy', attmap)
    best_score_threshold = 1.1

    for idx in range(0, N):
        if idx == 0:
            best_pscore_id = np.argmax(pscore)
        best_pscore_id_tmp = np.argmax(pscore)
        pred_in_crop = delta[:, best_pscore_id_tmp] / scale_x
        lr = penalty[best_pscore_id_tmp] * score[best_pscore_id_tmp] * p.lr  # lr for OTB

        res_x = pred_in_crop[0] + target_pos[0]
        res_y = pred_in_crop[1] + target_pos[1]
        res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
        res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr

        target_pos = np.array([res_x, res_y])
        target_sz = np.array([res_w, res_h])

        bboxes[0, idx] = target_pos[0]
        bboxes[1, idx] = target_pos[1]
        bboxes[2, idx] = target_sz[0]
        bboxes[3, idx] = target_sz[1]
        bboxes[4, idx] = pscore[best_pscore_id_tmp]  # BUG (original note): should this be pscore[best_...]?
        bboxes[5, idx] = best_pscore_id_tmp
        if pscore[best_pscore_id] > best_score_threshold:
            break
        pscore[best_pscore_id_tmp] = 0.0  # suppress this candidate; the next argmax yields the runner-up

    target_pos = np.array([bboxes[0, 0], bboxes[1, 0]])
    target_sz = np.array([bboxes[2, 0], bboxes[3, 0]])

    # for Mask Branch
    rboxes = []
    deltas = []
    for idx in range(0, N):
        if mask_enable:
            best_pscore_id_mask = np.unravel_index(int(bboxes[5, idx]),
                                                   (5, p.score_size, p.score_size))
            # delta_x and delta_y are the selected coordinates in the volume
            delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]
            if (delta_x, delta_y) not in deltas:
                print("delta: (", delta_x, ", ", delta_y, ")")
                deltas.append((delta_x, delta_y))
                if refine_enable:
                    mask = net.track_refine((delta_y, delta_x)).to(device).sigmoid().squeeze().view(
                        p.out_size, p.out_size).cpu().data.numpy()
                else:
                    mask = mask[0, :, delta_y, delta_x].sigmoid().\
                        squeeze().view(p.out_size, p.out_size).cpu().data.numpy()

                def crop_back(image, bbox, out_sz, padding=-1):
                    a = (out_sz[0] - 1) / bbox[2]
                    b = (out_sz[1] - 1) / bbox[3]
                    c = -a * bbox[0]
                    d = -b * bbox[1]
                    mapping = np.array([[a, 0, c],
                                        [0, b, d]]).astype(float)
                    crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                          flags=cv2.INTER_LINEAR,
                                          borderMode=cv2.BORDER_CONSTANT,
                                          borderValue=padding)
                    return crop

                s = crop_box[2] / p.instance_size
                sub_box = [crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
                           crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
                           s * p.exemplar_size, s * p.exemplar_size]
                s = p.out_size / sub_box[2]
                back_box = [-sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s, state['im_h'] * s]
                mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))

                target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
                if cv2.__version__[-5] == '4':
                    contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
                else:
                    _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
                cnt_area = [cv2.contourArea(cnt) for cnt in contours]
                if len(contours) != 0 and np.max(cnt_area) > 100:
                    contour = contours[np.argmax(cnt_area)]  # use max area polygon
                    polygon = contour.reshape(-1, 2)
                    # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
                    prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle
                    # box_in_img = pbox
                    rbox_in_img = prbox
                    box_score = bboxes[4, idx]
                    rboxes.append([rbox_in_img, box_score])
                    # if len(deltas) == 1:
                    #     attmap[delta_x, delta_y] = 3.5
                    # else:
                    #     attmap[delta_x, delta_y] = 1.5
                    if debug:
                        im_debug_overlay = im_debug.copy()
                        im_debug_overlay[:, :, :] = np.array([0.0, 0.0, 0.0])
                        torch_data = np.float64(im_debug_overlay[:, :, 0].copy())
                        patch_size = crop_box_int[0]
                        num_deltas = 25  # this does not change
                        patch_ratio = int(patch_size / num_deltas)  # this varies
                        resized_img_h, resized_img_w = im_debug.shape[0] / 5, im_debug.shape[1] / 5
                        torch_data_delta_size = np.zeros((int(resized_img_h), int(resized_img_w)))
                        offset_x_deltas = int(crop_box_int[0] / 5)
                        offset_y_deltas = int(crop_box_int[1] / 5)
                        length_x = int((crop_box_int[2]) / 5)
                        length_y = int((crop_box_int[3]) / 5)
                        for i in range(25):
                            for j in range(25):
                                # step_x = crop_box_int[0] + i * length_x
                                # step_y = crop_box_int[1] + j * length_y
                                # im_debug_overlay[step_y: step_y + length_y, step_x: step_x + length_x, :] = np.array([0.0, 0.0, 0.0])
                                # im_debug_overlay[step_y: step_y + length_y, step_x: step_x + length_x, :] = np.uint8(attmap[j, i] * np.array([0, 165, 255]))
                                # torch_data[step_y: step_y + length_y, step_x: step_x + length_x] = attmap[j, i] * 1.0
                                # now for the resized map
                                torch_data_delta_size[offset_y_deltas + j, offset_x_deltas + i] = attmap[j, i] * 1.0
                        if pscore[best_pscore_id] > best_score_threshold:
                            torch_data_delta_size[offset_y_deltas + delta_y, offset_x_deltas + delta_x] = 3.0
                        if pscore[best_pscore_id] > best_score_threshold:
                            sma = torch.nn.Softmax()
                            torch_data_delta_size = sma(torch.from_numpy(np.exp(torch_data_delta_size))).numpy()
                        # im_debug_overlay[step_x: step_x + length_x, step_y: step_y + length_y, :] = attmap[j, i]
                        overlay_result = cv2.addWeighted(im_debug, 0.70, im_debug_overlay, 0.3, 0.0)
                        # cv2.imwrite('/data/Ponc/tracking/results/windows-seagulls-debug/' + 'search_' + str(arrendatario) + '.jpeg', overlay_result)
                        # np.save('/data/Ponc/tracking/torch_data/resized/' + "{:05d}".format(arrendatario) + '.npy', torch_data_delta_size)
                        # np.save('/data/Ponc/tracking/results/mevasa/' + "{:05d}".format(arrendatario) + '.npy', attmap)
                else:  # empty mask
                    location = cxy_wh_2_rect(target_pos, target_sz)
                    rbox_in_img = np.array([[location[0], location[1]],
                                            [location[0] + location[2], location[1]],
                                            [location[0] + location[2], location[1] + location[3]],
                                            [location[0], location[1] + location[3]]])
        if pscore[best_pscore_id] > best_score_threshold:
            break

    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))

    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    new_score = bboxes[4, 0]
    # state['score'] = score[best_pscore_id]
    state['score'] = new_score
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state, bboxes, rboxes
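
# The N-candidate loop above is a simple "argmax, record, zero out, repeat"
# enumeration. Stripped of the tracking state it reduces to this (toy scores):
import numpy as np
pscore_demo = np.array([0.2, 0.9, 0.5, 0.7])
order = []
for _ in range(3):
    i = int(np.argmax(pscore_demo))
    order.append(i)
    pscore_demo[i] = 0.0  # suppress so the next argmax yields the runner-up
assert order == [1, 3, 2]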
def siamese_track(state, im, mask_enable=False, refine_enable=False, device='cpu', debug=False):
    """
    Track the target for one frame.
    :param state: target state
    :param im: current frame
    :param mask_enable: whether to predict a mask
    :param refine_enable: whether to fuse features with the refine module
    :param device: device to run on
    :param debug: whether to show debug output
    :return: updated target state dict
    """
    # unpack the target state
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    # width, height and equivalent side length of the context-padded box
    wc_x = target_sz[1] + p.context_amount * sum(target_sz)
    hc_x = target_sz[0] + p.context_amount * sum(target_sz)
    s_x = np.sqrt(wc_x * hc_x)
    # ratio of the template input size to the tracking box
    scale_x = p.exemplar_size / s_x
    # derive the search region with the same scale as the template branch
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad
    # expand the crop to include context
    crop_box = [target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
                round(s_x), round(s_x)]

    # debug display: draw the crop box on a copy of the frame
    if debug:
        im_debug = im.copy()
        crop_box_int = np.int0(crop_box)
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2], crop_box_int[1] + crop_box_int[3]),
                      (255, 0, 0), 2)
        cv2.imshow('search area', im_debug)
        cv2.waitKey(0)

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                                             avg_chans).unsqueeze(0))

    # run the network
    if mask_enable:
        # tracking with segmentation
        score, delta, mask = net.track_mask(x_crop.to(device))
    else:
        # tracking only, no segmentation
        score, delta = net.track(x_crop.to(device))

    # box regression output, reshaped to (4, ...)
    delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
    # classification output, reshaped to (2, ...) and softmaxed; keep the FG column
    score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()

    # decode center (delta[0], delta[1]), width (delta[2]) and height (delta[3])
    # against the anchors
    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]

    def change(r):
        """Elementwise max of r and 1/r."""
        return np.maximum(r, 1. / r)

    def sz(w, h):
        """Equivalent side length from width and height."""
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        """Equivalent side length from a (w, h) array."""
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty

    # p.penalty_k is a hyper-parameter; penalize the classification scores
    penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
    pscore = penalty * score

    # cos window (motion model): blend in the window with a fixed weight
    pscore = pscore * (1 - p.window_influence) + window * p.window_influence
    # index of the best penalized score
    best_pscore_id = np.argmax(pscore)

    # map the best prediction back to the original image scale
    pred_in_crop = delta[:, best_pscore_id] / scale_x
    # adaptive interpolation rate
    lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB

    # new position and size from the predicted offsets
    res_x = pred_in_crop[0] + target_pos[0]
    res_y = pred_in_crop[1] + target_pos[1]
    res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr

    target_pos = np.array([res_x, res_y])
    target_sz = np.array([res_w, res_h])

    # for Mask Branch
    if mask_enable:
        # np.unravel_index converts the flat best index into (anchor, row, col)
        best_pscore_id_mask = np.unravel_index(best_pscore_id, (5, p.score_size, p.score_size))
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

        if refine_enable:
            # track_refine runs the Refine module: a 1x1x256 feature vector from the
            # correlation map plus the pre-downsampling detection features yield the mask
            mask = net.track_refine((delta_y, delta_x)).to(device).sigmoid().squeeze().view(
                p.out_size, p.out_size).cpu().data.numpy()
        else:
            # without fusion, read the mask straight from the mask branch
            mask = mask[0, :, delta_y, delta_x].sigmoid().\
                squeeze().view(p.out_size, p.out_size).cpu().data.numpy()

        def crop_back(image, bbox, out_sz, padding=-1):
            """
            Warp the mask back to image coordinates with an affine transform.
            :param image: input image (the mask)
            :param bbox: source box
            :param out_sz: output size
            :param padding: border fill value
            :return: warped image
            """
            # build the transform: a, b are scale factors; c, d are translations
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c],
                                [0, b, d]]).astype(float)
            # apply the affine warp
            crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop

        # scale factor: crop size over the network input size
        s = crop_box[2] / p.instance_size
        # predicted template-region box
        sub_box = [crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
                   crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
                   s * p.exemplar_size, s * p.exemplar_size]
        # scale factor for back-projection
        s = p.out_size / sub_box[2]
        # back-projection box
        back_box = [-sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s, state['im_h'] * s]
        # affine warp back into the full frame
        mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))
        # binarize the mask
        target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)

        # findContours returns two values in OpenCV 4 and three in OpenCV 3
        if cv2.__version__[-5] == '4':
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        else:
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # contour areas
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        if len(contours) != 0 and np.max(cnt_area) > 100:
            # largest contour
            contour = contours[np.argmax(cnt_area)]  # use max area polygon
            # reshape to (n, 2)
            polygon = contour.reshape(-1, 2)
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            # four corners of the minimum-area rotated rectangle
            prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle
            # box_in_img = pbox
            rbox_in_img = prbox
        else:  # empty mask
            # fall back to the predicted position and size
            location = cxy_wh_2_rect(target_pos, target_sz)
            # four corners of the axis-aligned box
            rbox_in_img = np.array([[location[0], location[1]],
                                    [location[0] + location[2], location[1]],
                                    [location[0] + location[2], location[1] + location[3]],
                                    [location[0], location[1] + location[3]]])

    # clamp position and size to the image
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))

    # update the state
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state
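
# Sanity check for the affine map crop_back builds: with a = (W_out - 1)/bbox_w
# and c = -a * bbox_x, the bbox corners land exactly on the output corners.
# Toy numbers: map bbox (x=10, y=5, w=50, h=25) onto a 100x50 output.
out_w, out_h = 100, 50
bx, by, bw, bh = 10.0, 5.0, 50.0, 25.0
a, b = (out_w - 1) / bw, (out_h - 1) / bh
c, d = -a * bx, -b * by
assert a * bx + c == 0 and a * (bx + bw) + c == out_w - 1
assert b * by + d == 0 and b * (by + bh) + d == out_h - 1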
locations_dict[key].append(locations)
target_sz_dict[key].append(target_sz)
target_pos_dict[key].append(target_pos)

# TODO: DRAW
location = state['ploygon'].flatten()
centroids1 = compute_centroid(location)
mask = state['mask'] > state['p'].seg_thr
im[:, :, 2] = (mask > 0) * 255 + (mask == 0) * im[:, :, 2]
cv2.polylines(im, [np.int0(location).reshape((-1, 1, 2))], True, col, 3)
cv2.circle(im, (int(centroids1[0]), int(centroids1[1])), 3, col, 2)

location2 = cxy_wh_2_rect(target_pos, target_sz)
rbox_in_img = np.array([[location2[0], location2[1]],
                        [location2[0] + location2[2], location2[1]],
                        [location2[0] + location2[2], location2[1] + location2[3]],
                        [location2[0], location2[1] + location2[3]]])
location2 = rbox_in_img.flatten()
cv2.polylines(im, [np.int0(location2).reshape((-1, 1, 2))], True, col, 1)
cv2.circle(im, (int(target_pos[0]), int(target_pos[1])), 3, col, 1)

# TODO: Decide winner with Dynamics AND UPDATE TRACKER IF REQUIRED
tracker = dynamics[key]
def TrackingDoing(model, state, im, mask_enable=False, device='cpu'):
    avg_chans = state['avg_chans']
    # type(state['avg_chans']) -- <class 'numpy.ndarray'>
    # (Pdb) state['avg_chans'].shape -- (3,)
    window = state['window']
    # (Pdb) state['window'] -- array([0., 0., 0., ..., 0., 0., 0.])
    # (Pdb) state['window'].shape -- (3125,)
    target_pos = state['target_pos']
    # (Pdb) state['target_pos'] -- array([390., 240.])
    # (Pdb) state['target_pos'].shape -- (2,)
    target_size = state['target_size']
    # (Pdb) state['target_size'] -- array([180, 280])
    # (Pdb) state['target_size'].shape -- (2,)

    # mask_enable = True
    s_x = get_scale_size(target_size[0], target_size[1])
    scale_x = model.template_size / s_x
    # s_x -- 457.27, scale_x -- 0.2777325006938416
    # p.instance_size -- 255, p.exemplar_size -- 127
    d_search = (model.instance_size - model.template_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad

    crop_box = [target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
                round(s_x), round(s_x)]
    # (Pdb) crop_box -- [-69.0, -219.0, 918, 918]

    # def get_subwindow_tracking(im, pos, model_sz, original_sz, avg_chans):
    x_crop = get_subwindow_tracking(im, target_pos, model.instance_size, round(s_x),
                                    avg_chans).unsqueeze(0)
    # (Pdb) pp x_crop.shape -- torch.Size([1, 3, 255, 255])

    score, delta, mask = model.track_mask(x_crop.to(device))
    # (Pdb) pp score.size() -- torch.Size([1, 10, 25, 25])
    # delta.size() -- torch.Size([1, 20, 25, 25])
    # mask.size() -- torch.Size([1, 3969, 25, 25])

    delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
    score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()
    # delta.shape -- (4, 3125)
    # score.shape -- (3125,)

    delta[0, :] = delta[0, :] * model.anchor[:, 2] + model.anchor[:, 0]
    delta[1, :] = delta[1, :] * model.anchor[:, 3] + model.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * model.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * model.anchor[:, 3]

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_size * scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty

    # p.penalty_k -- 0.04
    penalty = np.exp(-(r_c * s_c - 1) * model.penalty_k)
    pscore = penalty * score

    # cos window (motion model)
    # pp p.window_influence -- 0.4
    window_influence = 0.4
    pscore = pscore * (1 - window_influence) + window * window_influence
    best_pscore_id = np.argmax(pscore)

    pred_in_crop = delta[:, best_pscore_id] / scale_x
    lr = penalty[best_pscore_id] * score[best_pscore_id]  # lr for OTB

    res_x = pred_in_crop[0] + target_pos[0]
    res_y = pred_in_crop[1] + target_pos[1]
    res_w = target_size[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_size[1] * (1 - lr) + pred_in_crop[3] * lr

    target_pos = np.array([res_x, res_y])
    target_size = np.array([res_w, res_h])

    # for Mask Branch
    # pp mask_enable -- True
    if mask_enable:
        best_pscore_id_mask = np.unravel_index(best_pscore_id,
                                               (5, model.score_size, model.score_size))
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

        # pp model.template_size -- 127
        mask = model.track_refine((delta_y, delta_x)).to(device).sigmoid().squeeze().view(
            model.template_size, model.template_size).cpu().data.numpy()

        def crop_back(image, bbox, out_sz, padding=-1):
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c],
                                [0, b, d]]).astype(float)
            crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop

        # pp p.instance_size -- 255
        s = crop_box[2] / model.instance_size
        # pp p.base_size -- 8
        # (Pdb) pp p.total_stride -- 8
        # pp p.exemplar_size -- 127
        sub_box = [crop_box[0] + (delta_x - model.anchors["base_size"] / 2) * model.anchors["stride"] * s,
                   crop_box[1] + (delta_y - model.anchors["base_size"] / 2) * model.anchors["stride"] * s,
                   s * model.template_size, s * model.template_size]
        s = model.template_size / sub_box[2]
        back_box = [-sub_box[0] * s, -sub_box[1] * s,
                    state['image_width'] * s, state['image_height'] * s]
        mask_in_img = crop_back(mask, back_box, (state['image_width'], state['image_height']))
        # mask.shape -- (127, 127)
        # (Pdb) back_box -- [-44.833333333333336, -3.1666666666666683, 237.22222222222223, 133.33333333333334]
        # (Pdb) mask_in_img.shape -- (480 -- state['image_height'], 854 -- width)

        # pp p.segment_threshold -- 0.35
        target_mask = (mask_in_img > model.segment_threshold).astype(np.uint8)
        # cv2.__version__ -- '4.4.0' ==> cv2.__version__[-5] == '4'
        if cv2.__version__[-5] == '4':
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        else:
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        if len(contours) != 0 and np.max(cnt_area) > 100:
            contour = contours[np.argmax(cnt_area)]  # use max area polygon
            polygon = contour.reshape(-1, 2)
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle
            rbox_in_img = prbox
        else:  # empty mask
            location = cxy_wh_2_rect(target_pos, target_size)
            rbox_in_img = np.array([[location[0], location[1]],
                                    [location[0] + location[2], location[1]],
                                    [location[0] + location[2], location[1] + location[3]],
                                    [location[0], location[1] + location[3]]])

    # type(state['image_width']) -- <class 'int'>
    target_pos[0] = max(0, min(state['image_width'], target_pos[0]))
    target_pos[1] = max(0, min(state['image_height'], target_pos[1]))
    target_size[0] = max(10, min(state['image_width'], target_size[0]))
    target_size[1] = max(10, min(state['image_height'], target_size[1]))

    state['target_pos'] = target_pos
    state['target_size'] = target_size
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img
    state['ploygon'] = rbox_in_img
    return state
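
# Worked example (toy numbers) of the anchor decoding used above: the network
# regresses (dx, dy, dw, dh) relative to an anchor (ax, ay, aw, ah); centers
# shift linearly with anchor size, widths and heights scale exponentially.
import numpy as np
ax, ay, aw, ah = 0.0, 0.0, 64.0, 32.0   # one anchor: center + size
dx, dy, dw, dh = 0.1, -0.2, 0.05, 0.0   # raw regression outputs
cx = dx * aw + ax                       # 6.4
cy = dy * ah + ay                       # -6.4
w = np.exp(dw) * aw                     # ~67.3
h = np.exp(dh) * ah                     # 32.0
print(cx, cy, w, h)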
def track_vot(model, video, hp=None, mask_enable=False, refine_enable=False, device='cpu'):
    # regions records the target boxes and their states
    regions = []  # result and states [1 init / 2 lost / 0 skip]
    # gt is the ground truth, e.g. an array of shape (325, 8); image_files lists
    # every frame path of the video
    image_files, gt = video['image_files'], video['gt']
    # len(image_files) is the frame count, which differs per video
    start_frame, end_frame, lost_times, toc = 0, len(image_files), 0, 0

    # iterate over all frames: f is the index, image_file a single frame path;
    # initialization happens at the frame where the target appears
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)  # (h, w, 3)
        tic = cv2.getTickCount()  # current tick
        if f == start_frame:  # init on the first frame
            # convert the arbitrarily rotated gt rectangle to an axis-aligned one
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])  # center of the axis-aligned box
            target_sz = np.array([w, h])
            # init tracker: inputs are one frame, the gt (cx, cy) and (w, h), model, hp, device
            state = siamese_init(im, target_pos, target_sz, model, hp, device)
            # location is the axis-aligned box as top-left (x, y, w, h): ndarray<(4,), float64>
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking: all later frames; the box is read back from state
            if f == 3:  # debug leftover: abort after three frames
                exit()
            state = siamese_track(state, im, mask_enable, refine_enable, device, args.debug)  # track
            if mask_enable:
                location = state['ploygon'].flatten()
                mask = state['mask']  # mask has the same shape as the frame
            else:
                # decoded prediction as top-left (x, y), (w, h)
                location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                mask = []
            if 'VOT' in args.dataset:
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                if mask_enable:
                    pred_polygon = ((location[0], location[1]), (location[2], location[3]),
                                    (location[4], location[5]), (location[6], location[7]))
                else:
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2], location[1]),
                                    (location[0] + location[2], location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                # location is always the prediction; vot_overlap computes the overlap
                # between the predicted and ground-truth polygons
                b_overlap = vot_overlap(gt_polygon, pred_polygon, (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1
            if b_overlap:  # truthy: the boxes overlap
                regions.append(location)
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        # per-frame visualization
        if args.visualization and f >= start_frame:  # visualization (skip lost frames)
            im_show = im.copy()
            if f == 0:
                cv2.destroyAllWindows()
            if gt.shape[0] > f:
                if len(gt[f]) == 8:
                    cv2.polylines(im_show, [np.array(gt[f], int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]), (0, 255, 0), 3)
            if len(location) == 8:
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :, 2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.int0(location)
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))], True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(im_show, (location[0], location[1]),
                              (location[0] + location[2], location[1] + location[3]),
                              (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show, str(state['score']) if 'score' in state else '', (40, 120),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # post-processing: tracking is done, save the results to text files
    name = args.arch.split('.')[0] + '_' + ('mask_' if mask_enable else '') + \
           ('refine_' if refine_enable else '') + args.resume.split('/')[-1].split('.')[0]
    if 'VOT' in args.dataset:
        video_path = join('test', args.dataset, name, 'baseline', video['name'])
        if not isdir(video_path):
            makedirs(video_path)
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        with open(result_path, 'w') as fout:
            for x in regions:
                if isinstance(x, int):
                    fout.write('{:d}\n'.format(x))
                else:
                    fout.write(','.join([vot_float2str('%.4f', i) for i in x]) + '\n')
    else:  # OTB
        video_path = join('test', args.dataset, name)
        if not isdir(video_path):
            makedirs(video_path)
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        with open(result_path, 'w') as fout:
            for x in regions:
                fout.write(','.join([str(i) for i in x]) + '\n')
    logger.info('({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        v_id, video['name'], toc, f / toc, lost_times))
    return lost_times, f / toc
def siamese_track(state, im, mask_enable=False, refine_enable=False, device='cpu', debug=False):
    p = state['p']  # state['p'] is the TrackerConfig
    net = state['net']  # the model passed to siamese_init, i.e. the network
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']  # center point
    target_sz = state['target_sz']  # (w, h)

    # equivalent side length from the context-padded width/height; the search
    # region uses the same scale factor as the template branch
    wc_x = target_sz[1] + p.context_amount * sum(target_sz)  # h + p.context_amount * (w + h)
    hc_x = target_sz[0] + p.context_amount * sum(target_sz)  # w + p.context_amount * (w + h)
    # as the template size: a square of side s_x centered on the object is
    # cropped and resized to (127, 127); s_x is roughly 2x the box
    s_x = np.sqrt(wc_x * hc_x)
    scale_x = p.exemplar_size / s_x  # scale_x is the resize factor
    d_search = (p.instance_size - p.exemplar_size) / 2  # 64
    pad = d_search / scale_x  # pad = 64 * s_x / 127
    # as the search size: the square of side s_x is cropped and resized to
    # (255, 255); s_x is roughly 4x the box
    s_x = s_x + 2 * pad
    # (x_top, y_top, w, h): crop_box is the un-resized search region in the original image
    crop_box = [target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
                round(s_x), round(s_x)]

    if debug:
        im_debug = im.copy()
        crop_box_int = np.int0(crop_box)  # np.int0 truncates toward zero
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2], crop_box_int[1] + crop_box_int[3]),
                      (255, 0, 0), 2)
        cv2.imshow('search area', im_debug)
        cv2.waitKey(0)

    # extract scaled crops for search region x at previous target position:
    # the search window is built around the previous frame's target_pos, since
    # the object barely moves between frames
    x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                                             avg_chans).unsqueeze(0))
    # x_crop: (255, 255, 3) -> tensor<(1, 3, 255, 255), float32>

    # run the network
    if mask_enable:  # with the mask branch
        score, delta, mask = net.track_mask(x_crop.to(device))  # outputs of the three branches
        # score: tensor<(1, 10, 25, 25)>  2*k = 10
        # delta: tensor<(1, 20, 25, 25)>  4*k = 20, k = 5
        # mask:  tensor<(1, 3969, 25, 25)>
    else:
        score, delta = net.track(x_crop.to(device))

    # decode the predicted boxes and penalize scores by position, aspect ratio and
    # displacement to pick the best prediction. permute() reorders tensor dims and
    # contiguous() is required before view(); .data.cpu().numpy() moves
    # GPU tensor -> CPU tensor -> numpy; .data[:, 1] keeps the second (FG) column
    delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()  # ndarray<(4, 3125), float32>
    score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()  # ndarray<(3125,), float32>

    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]  # cx = dx * anchor_w + anchor_cx
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]  # cy = dy * anchor_h + anchor_cy
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]  # w = exp(dw) * anchor_w
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]  # h = exp(dh) * anchor_h

    # np.maximum(X, Y): elementwise max of X and Y
    def change(r):
        # e.g. [0.33, 0.5, 1, 2, 3] vs its reciprocals -> [3, 2, 1, 2, 3]
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x  # ndarray<(2,), float64>
    s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz_in_crop)))  # scale penalty, ndarray<(3125,), float32>
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty, ndarray<(3125,), float32>

    penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)  # ndarray<(3125,), float32>
    pscore = penalty * score  # ndarray<(3125,), float32>

    # cos window (motion model)
    pscore = pscore * (1 - p.window_influence) + window * p.window_influence  # ndarray<(3125,), float64>
    # pick the best-scoring position: the top class-score column selects both
    # the mask and the best box
    best_pscore_id = np.argmax(pscore)

    # offset of the best box inside the search region; pred_in_crop is
    # ndarray<(4,), float32> and is really an offset
    pred_in_crop = delta[:, best_pscore_id] / scale_x
    lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB

    res_x = pred_in_crop[0] + target_pos[0]  # x offset added to the previous x
    res_y = pred_in_crop[1] + target_pos[1]  # y offset added to the previous y
    res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr

    # from here on target_pos and target_sz are the updated values
    target_pos = np.array([res_x, res_y])
    target_sz = np.array([res_w, res_h])  # the single decoded box

    # for Mask Branch
    # np.unravel_index converts a flat index into a tuple of coordinates, so
    # best_pscore_id gives the position on the feature map. track_refine runs the
    # Refine module: a 1x1x256 feature vector from the correlation map plus the
    # pre-downsampling detection features yield the target mask. The mask tensor
    # above covers every position; we now extract the single column (RoW) we need.
    if mask_enable:  # mask enabled
        # (anchor, delta_y, delta_x): position in the mask volume matching pscore
        best_pscore_id_mask = np.unravel_index(best_pscore_id, (5, p.score_size, p.score_size))
        # the top-scoring column picked for mask generation, at (delta_y, delta_x)
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

        if refine_enable:  # use the sharp refine module
            mask = net.track_refine((delta_y, delta_x)).to(device).sigmoid().squeeze().view(
                p.out_size, p.out_size).cpu().data.numpy()
            # mask: tensor<(1, 16129)> -> ndarray<(127, 127), float32>
        else:  # without refine: take the top-scoring mask column directly
            mask = mask[0, :, delta_y, delta_x].sigmoid().\
                squeeze().view(p.out_size, p.out_size).cpu().data.numpy()
            # ndarray<(127, 127), float32> (p was updated from model.anchors)

        # the mask must now be mapped back to the original image; warpAffine()
        # applies an affine transform whose matrix mapping is built by hand:
        # a and b are scale factors, c and d are translations
        def crop_back(image, bbox, out_sz, padding=-1):
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c],
                                [0, b, d]]).astype(float)
            crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop

        # crop_box is the detection crop [x, y, width, height]; s is the scale
        # factor; sub_box is the predicted template-region box
        s = crop_box[2] / p.instance_size  # scalar: round(s_x) / 255
        sub_box = [crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,  # x_top + (delta_x - 4) * 8 * s
                   crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,  # y_top + (delta_y - 4) * 8 * s
                   s * p.exemplar_size, s * p.exemplar_size]  # four-element list: s*127, s*127
        s = p.out_size / sub_box[2]
        back_box = [-sub_box[0] * s, -sub_box[1] * s,
                    state['im_w'] * s, state['im_h'] * s]  # four-element list
        # input mask is ndarray<(127, 127), float32>
        mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))  # same size as the frame, float32
        target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)  # same size as the frame, 2-D uint8 of 0/1 values

        if cv2.__version__[-5] == '4':
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        else:  # this branch fires on OpenCV 3.x
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)  # (n, 1, 2) contour arrays
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        # there is a contour and the largest one exceeds 100 px, so it is not tiny
        if len(contours) != 0 and np.max(cnt_area) > 100:
            contour = contours[np.argmax(cnt_area)]  # use max area polygon; (n, 1, 2) ndarray of n points
            polygon = contour.reshape(-1, 2)  # flattened to (n, 2)
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            # cv2.minAreaRect(polygon) returns the minimum-area bounding rectangle
            # ((cx, cy), (w, h), angle); cv2.boxPoints returns its four corners
            # [[x0, y0], ..., [x3, y3]], so prbox is ndarray<(4, 2)>: the rotated box
            prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle
            # box_in_img = pbox
            rbox_in_img = prbox  # ndarray<(4, 2), float32>
        else:  # empty mask: the contour is too small
            location = cxy_wh_2_rect(target_pos, target_sz)  # top-left (x, y) and (w, h)
            rbox_in_img = np.array([[location[0], location[1]],
                                    [location[0] + location[2], location[1]],
                                    [location[0] + location[2], location[1] + location[3]],
                                    [location[0], location[1] + location[3]]])
        # at this point rbox_in_img holds the four corners of the target box,
        # derived either from the full-size mask contour or directly from the
        # predicted (target_pos, target_sz)

    # update the state with the results, clamped to the image
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))

    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state
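
# Toy demonstration of the mask -> rotated-box step at the end of the mask
# branch: threshold a mask, take the largest contour, and read off the four
# corners of its minimum-area (rotated) rectangle. The [-2] index picks the
# contour list under both OpenCV 3 (3-tuple) and OpenCV 4 (2-tuple).
import cv2
import numpy as np

demo_mask = np.zeros((60, 60), np.uint8)
cv2.ellipse(demo_mask, (30, 30), (20, 8), 30, 0, 360, 1, -1)  # filled, tilted blob
contours = cv2.findContours(demo_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
contour = max(contours, key=cv2.contourArea)
prbox = cv2.boxPoints(cv2.minAreaRect(contour.reshape(-1, 2)))
print(prbox)  # (4, 2) array of rotated-rectangle corners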