def test(dataset, tracker, opt, ctx):
    """Run the SiamRPN tracker over every video of ``dataset``.

    Each video's tracker is initialised on the first frame's ground-truth
    box and then run frame by frame.  Predicted boxes are written to
    ``<results_path>/<dataset>/<model name>/<video name>.txt`` (one
    comma-separated box per line) and per-video speed is printed.

    Parameters
    ----------
    dataset : iterable
        Yields video objects; each video yields ``(img, gt_bbox)`` pairs
        and exposes ``name``.
    tracker : object
        Tracker exposing ``init(img, bbox, ctx)`` and ``track(img, ctx)``.
    opt : namespace
        Parsed options (``video``, ``vis``, ``results_path``, ``dataset``,
        ``model_path``).
    ctx : mxnet.Context
        Device context, e.g. ``mx.cpu()`` or ``mx.gpu(0)``.
    """
    cv2 = try_import_cv2()
    for v_idx, video in enumerate(dataset):
        # When a single video is requested, skip all the others.
        if opt.video != '':
            if video.name != opt.video:
                continue
        toc = 0  # accumulated tracking time in cv2 ticks
        pred_bboxes = []
        scores = []
        track_times = []
        for idx, (img, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx == 0:
                # First frame: convert ground truth to an axis-aligned box
                # and initialise the tracker on it.
                # NOTE(review): get_axis_aligned_bbox appears to return the
                # box centre and size (so x_max/y_max/gt_t are really
                # cx/cy/height) -- confirm against its definition.
                x_max, y_max, gt_w, gt_t = get_axis_aligned_bbox(
                    np.array(gt_bbox))
                gt_bbox_ = [x_max - (gt_w - 1) / 2,
                            y_max - (gt_t - 1) / 2,
                            gt_w, gt_t]
                tracker.init(img, gt_bbox_, ctx)
                pred_bbox = gt_bbox_
                scores.append(None)  # no confidence for the init frame
                pred_bboxes.append(pred_bbox)
            else:
                outputs = tracker.track(img, ctx)
                pred_bbox = outputs['bbox']
                pred_bboxes.append(pred_bbox)
                scores.append(outputs['best_score'])
            toc += cv2.getTickCount() - tic
            track_times.append(
                (cv2.getTickCount() - tic) / cv2.getTickFrequency())
            if idx == 0:
                cv2.destroyAllWindows()
            if opt.vis and idx > 0:
                # Visualisation: ground truth in green, prediction in
                # yellow, frame index in the top-left corner.
                gt_bbox = list(map(int, gt_bbox))
                pred_bbox = list(map(int, pred_bbox))
                cv2.rectangle(
                    img, (gt_bbox[0], gt_bbox[1]),
                    (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                    (0, 255, 0), 3)
                cv2.rectangle(
                    img, (pred_bbox[0], pred_bbox[1]),
                    (pred_bbox[0] + pred_bbox[2],
                     pred_bbox[1] + pred_bbox[3]),
                    (0, 255, 255), 3)
                cv2.putText(img, str(idx), (40, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                cv2.imshow(video.name, img)
                cv2.waitKey(1)
        toc /= cv2.getTickFrequency()  # ticks -> seconds
        # Results directory is named after the model weight file (basename
        # without extension).
        model_path = os.path.join(
            opt.results_path, opt.dataset,
            opt.model_path.split('/')[-1].split('.')[0])
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        with open(os.path.join(model_path,
                               '{}.txt'.format(video.name)), 'w') as f_w:
            for per_pbbox in pred_bboxes:
                f_w.write(','.join([str(i) for i in per_pbbox]) + '\n')
        print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
            v_idx + 1, video.name, toc, len(video) / toc))
def __init__(self, name, root, video_dir, init_rect, img_names,
             gt_rect, attr, load_img=False):
    """Hold metadata (and optionally decoded frames) for one tracking video.

    Parameters
    ----------
    name : str
        Video name.
    root : str
        Directory that frame paths in ``img_names`` are relative to.
    video_dir : str
        Video sub-directory name.
    init_rect : list
        Initial ground-truth rectangle.
    img_names : list of str
        Relative frame file names.
    gt_rect : list
        Per-frame ground-truth trajectory.
    attr : list
        Video attribute tags.
    load_img : bool
        If True, decode every frame into memory up front.
    """
    self.name = name
    self.video_dir = video_dir
    self.init_rect = init_rect
    self.gt_traj = gt_rect
    self.attr = attr
    self.pred_trajs = {}
    self.img_names = [os.path.join(root, frame) for frame in img_names]
    self.imgs = None
    opencv = try_import_cv2()
    if load_img:
        # Eager mode: decode every frame now.
        self.imgs = [opencv.imread(path) for path in self.img_names]
        first = self.imgs[0]
    else:
        # Lazy mode: decode only the first frame to learn the resolution.
        first = opencv.imread(self.img_names[0])
        assert first is not None, self.img_names[0]
    self.width = first.shape[1]
    self.height = first.shape[0]
def __iter__(self):
    """Yield ``(frame, ground-truth box)`` pairs in frame order.

    Frames come from the in-memory cache when loaded, otherwise each one
    is decoded from disk as it is requested.
    """
    if self.imgs is not None:
        for i in range(len(self.img_names)):
            yield self.imgs[i], self.gt_traj[i]
    else:
        opencv = try_import_cv2()
        for i in range(len(self.img_names)):
            yield opencv.imread(self.img_names[i]), self.gt_traj[i]
def crop_xml(args, xml, sub_set_crop_path, instance_size=511):
    """Curate one annotation file's image into SiamFC-style crop pairs.

    Parameters
    ----------
    args : namespace
        Parsed options; ``download_dir`` is used as the output root.
    xml : str
        Path to the annotation xml file.
    sub_set_crop_path : str
        Destination directory for this subset's crops.
    instance_size : int
        Side length of the search-region crop.
    """
    cv2 = try_import_cv2()
    tree = ET.parse(xml)
    frame_crop_base_path = os.path.join(
        sub_set_crop_path, xml.split('/')[-1].split('.')[0])
    if not os.path.isdir(frame_crop_base_path):
        makedirs(frame_crop_base_path)
    # The image lives alongside the annotation with 'xml' -> 'JPEG' and
    # 'Annotations' -> 'Data' substituted in the path.
    img_path = xml.replace('xml', 'JPEG').replace('Annotations', 'Data')
    im = cv2.imread(img_path)
    avg_chans = np.mean(im, axis=(0, 1))  # per-channel mean used as padding
    for obj_id, obj in enumerate(tree.findall('object')):
        box_node = obj.find('bndbox')
        bbox = [int(box_node.find(tag).text)
                for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        z, x = crop_like_SiamFC(im, bbox, instance_size=instance_size,
                                padding=avg_chans)
        # NOTE(review): the directory was created at frame_crop_base_path
        # but crops are written under args.download_dir -- verify both
        # resolve to the same location.
        cv2.imwrite(os.path.join(args.download_dir, frame_crop_base_path,
                                 '{:06d}.{:02d}.z.jpg'.format(0, obj_id)), z)
        cv2.imwrite(os.path.join(args.download_dir, frame_crop_base_path,
                                 '{:06d}.{:02d}.x.jpg'.format(0, obj_id)), x)
def crop_video(args, sub_set, video, crop_path, ann_base_path):
    """Curate every annotated frame of one video into SiamFC-style crops.

    Parameters
    ----------
    args : namespace
        Parsed options; uses ``instance_size`` and ``download_dir``.
    sub_set : str
        Subset (split) directory name.
    video : str
        Video directory name.
    crop_path : str
        Root directory receiving the crops.
    ann_base_path : str
        Root directory of the xml annotations.
    """
    cv2 = try_import_cv2()
    video_crop_base_path = os.path.join(crop_path, sub_set, video)
    if not os.path.isdir(video_crop_base_path):
        makedirs(video_crop_base_path)
    sub_set_base_path = os.path.join(ann_base_path, sub_set)
    for xml in sorted(glob.glob(os.path.join(sub_set_base_path,
                                             video, '*.xml'))):
        tree = ET.parse(xml)
        filename = tree.findall('filename')[0].text
        # Matching image path: 'xml' -> 'JPEG', 'Annotations' -> 'Data'.
        im = cv2.imread(
            xml.replace('xml', 'JPEG').replace('Annotations', 'Data'))
        avg_chans = np.mean(im, axis=(0, 1))  # padding colour for the crops
        for obj in tree.findall('object'):
            track_id = int(obj.find('trackid').text)
            box_node = obj.find('bndbox')
            bbox = [int(box_node.find(tag).text)
                    for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
            z, x = crop_like_SiamFC(im, bbox,
                                    instance_size=args.instance_size,
                                    padding=avg_chans)
            cv2.imwrite(
                os.path.join(
                    args.download_dir, video_crop_base_path,
                    '{:06d}.{:02d}.z.jpg'.format(int(filename), track_id)),
                z)
            cv2.imwrite(
                os.path.join(
                    args.download_dir, video_crop_base_path,
                    '{:06d}.{:02d}.x.jpg'.format(int(filename), track_id)),
                x)
def __init__(self, shift, scale, blur, flip, color):
    """Bundle SiamRPN data-augmentation settings.

    Parameters
    ----------
    shift, scale, blur, flip, color :
        Magnitudes/probabilities of the corresponding augmentations,
        stored unchanged for later use by the augmentation routines.
    """
    self.cv2 = try_import_cv2()
    self.shift = shift
    self.scale = scale
    self.blur = blur
    self.flip = flip
    self.color = color
    # NOTE(review): presumably the eigen-decomposition used for PCA-style
    # colour jitter -- confirm against the augmentation code consuming it.
    self.rgbVar = np.array(
        [[-0.55919361, 0.98062831, -0.41940627],
         [1.72091413, 0.19879334, -1.82968581],
         [4.64467907, 4.73710203, 4.88324118]],
        dtype=np.float32)
def crop_img(img, anns, set_crop_base_path, set_img_base_path,
             instance_size=511):
    """Curate one COCO image and its annotations into SiamFC-style crops.

    Parameters
    ----------
    img : dict
        COCO image record (its ``file_name`` field is used).
    anns : list
        Annotation records, each carrying a ``bbox`` of [x, y, w, h].
    set_crop_base_path : str
        Destination directory for the crops.
    set_img_base_path : str
        Directory holding the original images.
    instance_size : int
        Side length of the search-region crop.
    """
    frame_crop_base_path = os.path.join(
        set_crop_base_path, img['file_name'].split('/')[-1].split('.')[0])
    if not os.path.isdir(frame_crop_base_path):
        makedirs(frame_crop_base_path)
    cv2 = try_import_cv2()
    im = cv2.imread('{}/{}'.format(set_img_base_path, img['file_name']))
    avg_chans = np.mean(im, axis=(0, 1))  # padding colour for the crops
    for track_id, ann in enumerate(anns):
        rect = ann['bbox']
        # Skip degenerate (zero/negative area) boxes.
        if rect[2] <= 0 or rect[3] <= 0:
            continue
        corners = [rect[0], rect[1], rect[0] + rect[2], rect[1] + rect[3]]
        z, x = crop_like_SiamFC(im, corners, instance_size=instance_size,
                                padding=avg_chans)
        # NOTE(review): relies on a module-level ``args`` for download_dir
        # -- confirm it is defined in this script's global scope.
        cv2.imwrite(
            os.path.join(args.download_dir, frame_crop_base_path,
                         '{:06d}.{:02d}.z.jpg'.format(0, track_id)), z)
        cv2.imwrite(
            os.path.join(args.download_dir, frame_crop_base_path,
                         '{:06d}.{:02d}.x.jpg'.format(0, track_id)), x)
def crop_hwc(image, bbox, out_sz, padding=(0, 0, 0)):
    """Crop ``bbox`` out of ``image`` and warp it to a square patch.

    Parameters
    ----------
    image : np.ndarray
        Source image in (H, W, C) layout.
    bbox : np.ndarray or list
        Crop rectangle as [xmin, ymin, xmax, ymax].
    out_sz : int
        Side length of the square output patch.
    padding : tuple
        Border colour used where the crop falls outside the image.

    Returns
    -------
    np.ndarray
        The (out_sz, out_sz) cropped patch.
    """
    # Affine map sending [xmin, xmax] x [ymin, ymax] onto [0, out_sz-1]^2.
    a = (out_sz - 1) / (bbox[2] - bbox[0])
    b = (out_sz - 1) / (bbox[3] - bbox[1])
    c = -a * bbox[0]
    d = -b * bbox[1]
    # Use the builtin ``float``: the ``np.float`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    mapping = np.array([[a, 0, c],
                        [0, b, d]]).astype(float)
    cv2 = try_import_cv2()
    crop = cv2.warpAffine(image, mapping, (out_sz, out_sz),
                          borderMode=cv2.BORDER_CONSTANT,
                          borderValue=padding)
    return crop
def __init__(
        self,
        data_path=os.path.expanduser('~/.mxnet/datasets'),
        dataset_names=('vid', 'yt_bb', 'coco', 'det'),
        # NOTE(review): "detaset" is a typo for "dataset", but these names
        # are part of the public signature (and the matching attributes
        # below), so they are kept for backward compatibility.
        detaset_root=('vid/crop511', 'yt_bb/crop511',
                      'coco/crop511', 'det/crop511'),
        detaset_anno=('vid/train.json', 'yt_bb/train.json',
                      'coco/train2017.json', 'det/train.json'),
        dataset_frame_range=(100, 3, 1, 1),
        dataset_num_use=(100000, -1, -1, -1),
        train_search_size=255,
        train_exemplar_size=127,
        anchor_stride=8,
        anchor_ratios=(0.33, 0.5, 1, 2, 3),
        train_base_size=0,
        train_output_size=17,
        template_shift=4,
        template_scale=0.05,
        template_blur=0,
        template_flip=0,
        template_color=1.0,
        search_shift=64,
        search_scale=0.18,
        search_blur=0,
        search_flip=0,
        search_color=1.0,
        videos_per_epoch=600000,
        train_epoch=50,
        train_thr_high=0.6,
        train_thr_low=0.3,
        train_pos_num=16,
        train_neg_num=16,
        train_total_num=64,
        gray=0.0,
        neg=0.05,
):
    """Concatenated SiamRPN training dataset over VID/YT-BB/COCO/DET crops.

    Stores every hyper-parameter as an attribute, validates that the
    anchor geometry implied by search/exemplar sizes matches
    ``train_output_size``, builds one ``SubDataset`` per source plus the
    template/search augmentations, and pre-computes the epoch sampling
    order via ``self.shuffle()``.
    """
    super(TrkDataset, self).__init__()
    self.train_search_size = train_search_size
    self.train_exemplar_size = train_exemplar_size
    self.anchor_stride = anchor_stride
    self.anchor_ratios = list(anchor_ratios)
    self.train_base_size = train_base_size
    self.train_output_size = train_output_size
    self.data_path = data_path
    self.dataset_names = list(dataset_names)
    self.detaset_root = list(detaset_root)
    self.detaset_anno = list(detaset_anno)
    self.dataset_frame_range = list(dataset_frame_range)
    self.dataset_num_use = list(dataset_num_use)
    self.template_shift = template_shift
    self.template_scale = template_scale
    self.template_blur = template_blur
    self.template_flip = template_flip
    self.template_color = template_color
    self.search_shift = search_shift
    self.search_scale = search_scale
    self.search_blur = search_blur
    self.search_flip = search_flip
    self.search_color = search_color
    self.videos_per_epoch = videos_per_epoch
    self.train_epoch = train_epoch
    self.train_thr_high = train_thr_high
    self.train_thr_low = train_thr_low
    self.train_pos_num = train_pos_num
    self.train_neg_num = train_neg_num
    self.train_total_num = train_total_num
    # NOTE(review): presumably the probabilities of grayscale conversion
    # and negative-pair sampling -- confirm against the sampling code.
    self.gray = gray
    self.neg = neg
    self.cv2 = try_import_cv2()
    self.logger = logging.getLogger()
    # Output size implied by the search/exemplar sizes and anchor stride;
    # a mismatch means an inconsistent configuration.  Note ``/`` is true
    # division, so the sizes must divide exactly for equality to hold.
    desired_size = (self.train_search_size - self.train_exemplar_size) / \
        self.anchor_stride + 1 + self.train_base_size
    if desired_size != self.train_output_size:
        raise Exception('size not match!')
    # create anchor target
    self.anchor_target = AnchorTarget(
        anchor_stride=self.anchor_stride,
        anchor_ratios=self.anchor_ratios,
        train_search_size=self.train_search_size,
        train_output_size=self.train_output_size,
        train_thr_high=self.train_thr_high,
        train_thr_low=self.train_thr_low,
        train_pos_num=self.train_pos_num,
        train_neg_num=self.train_neg_num,
        train_total_num=self.train_total_num)
    # create sub dataset: each source gets its crop root, annotation json,
    # frame range and sample budget; ``start`` offsets global indices so
    # the sub datasets tile one flat index space.
    self.all_dataset = []
    start = 0
    self.num = 0
    for idx in range(len(self.dataset_names)):
        sub_dataset = SubDataset(
            self.dataset_names[idx],
            os.path.join(self.data_path, self.detaset_root[idx]),
            os.path.join(self.data_path, self.detaset_anno[idx]),
            self.dataset_frame_range[idx],
            self.dataset_num_use[idx],
            start)
        start += sub_dataset.num
        self.num += sub_dataset.num_use
        sub_dataset.log()
        self.all_dataset.append(sub_dataset)
    # data augmentation
    self.template_aug = SiamRPNaugmentation(self.template_shift,
                                            self.template_scale,
                                            self.template_blur,
                                            self.template_flip,
                                            self.template_color)
    self.search_aug = SiamRPNaugmentation(self.search_shift,
                                          self.search_scale,
                                          self.search_blur,
                                          self.search_flip,
                                          self.search_color)
    # Epoch length: cap at videos_per_epoch when positive, then scale by
    # the number of epochs so one pass over ``pick`` covers the full run.
    videos_per_epoch = self.videos_per_epoch
    self.num = videos_per_epoch if videos_per_epoch > 0 else self.num
    self.num *= self.train_epoch
    self.pick = self.shuffle()
def get_subwindow(self, img, pos, model_sz, original_sz, avg_chans, ctx):
    """Extract a square sub-window around ``pos``, padding out-of-frame
    regions with the per-channel average colour.

    If the requested window exceeds the image boundary, a padded canvas
    filled with ``avg_chans`` is built first and the window is cut from
    it; otherwise the window is sliced directly from ``img``.

    Parameters
    ----------
    img : np.ndarray
        BGR based image.
    pos : list or float
        Centre position (a scalar is used for both x and y).
    model_sz : array
        Target side length fed to the network.
        NOTE(review): the original comment said "x is 127, z is 287" --
        presumably the exemplar is 127 and the search region larger;
        confirm against the tracker configuration.
    original_sz : array
        Side length of the window cut from the original image.
    avg_chans : array
        Per-channel average used as padding colour.
    ctx : mxnet.Context
        Context such as mx.cpu(), mx.gpu(0).

    Returns
    -------
    mx.nd.NDArray
        The window as a (1, C, model_sz, model_sz) float32 array on
        ``ctx``.
    """
    cv2 = try_import_cv2()
    if isinstance(pos, float):
        pos = [pos, pos]
    im_sz = img.shape
    # Window extent: centred on pos, inclusive pixel coordinates.
    original_c = (original_sz + 1) / 2
    context_xmin = np.floor(pos[0] - original_c + 0.5)
    context_xmax = context_xmin + original_sz - 1
    context_ymin = np.floor(pos[1] - original_c + 0.5)
    context_ymax = context_ymin + original_sz - 1
    # How far the window sticks out past each image edge (0 if inside).
    left_pad = int(max(0., -context_xmin))
    top_pad = int(max(0., -context_ymin))
    right_pad = int(max(0., context_xmax - im_sz[1] + 1))
    bottom_pad = int(max(0., context_ymax - im_sz[0] + 1))
    # Shift coordinates into the padded canvas' frame of reference.
    context_xmin = context_xmin + left_pad
    context_xmax = context_xmax + left_pad
    context_ymin = context_ymin + top_pad
    context_ymax = context_ymax + top_pad
    im_h, im_w, im_c = img.shape
    if any([top_pad, bottom_pad, left_pad, right_pad]):
        # If there is a pad, use the average channels to complete:
        # paste the image into a larger canvas and fill the borders.
        size = (im_h + top_pad + bottom_pad,
                im_w + left_pad + right_pad, im_c)
        te_im = np.zeros(size, np.uint8)
        te_im[top_pad:top_pad + im_h, left_pad:left_pad + im_w, :] = img
        if top_pad:
            te_im[0:top_pad, left_pad:left_pad + im_w, :] = avg_chans
        if bottom_pad:
            te_im[im_h + top_pad:, left_pad:left_pad + im_w, :] = avg_chans
        if left_pad:
            te_im[:, 0:left_pad, :] = avg_chans
        if right_pad:
            te_im[:, im_w + left_pad:, :] = avg_chans
        im_patch = te_im[int(context_ymin):int(context_ymax + 1),
                         int(context_xmin):int(context_xmax + 1), :]
    else:
        # If there is no pad, crop directly.
        im_patch = img[int(context_ymin):int(context_ymax + 1),
                       int(context_xmin):int(context_xmax + 1), :]
    # Resize only when the cut size differs from the network input size.
    if not np.array_equal(model_sz, original_sz):
        im_patch = cv2.resize(im_patch, (model_sz, model_sz))
    # HWC -> CHW, add batch dim, cast, and move to the requested device.
    im_patch = im_patch.transpose(2, 0, 1)
    im_patch = im_patch[np.newaxis, :, :, :]
    im_patch = im_patch.astype(np.float32)
    im_patch = mx.nd.array(im_patch, ctx)
    return im_patch
def __getitem__(self, idx):
    """Return the ``(frame, ground-truth box)`` pair at ``idx``."""
    if self.imgs is not None:
        return self.imgs[idx], self.gt_traj[idx]
    # Lazy mode: decode the requested frame from disk on every access.
    opencv = try_import_cv2()
    return opencv.imread(self.img_names[idx]), self.gt_traj[idx]
def load_img(self):
    """Decode all frames into memory; a no-op when already loaded."""
    if self.imgs is not None:
        return
    opencv = try_import_cv2()
    self.imgs = [opencv.imread(path) for path in self.img_names]
    first = self.imgs[0]
    self.width = first.shape[1]
    self.height = first.shape[0]
"""SiamRPN Demo script. Code adapted from https://github.com/STVIR/pysot""" import os import argparse import matplotlib.pyplot as plt import numpy as np import mxnet as mx from gluoncv import model_zoo, utils from gluoncv.model_zoo.siamrpn.siamrpn_tracker import SiamRPNTracker as build_tracker from gluoncv.model_zoo.siamrpn.siamrpn_tracker import get_axis_aligned_bbox from gluoncv.utils.filesystem import try_import_cv2 cv2 = try_import_cv2() def parse_args(): """ benchmark test.""" parser = argparse.ArgumentParser(description='make ovject tracking.') parser.add_argument( '--data-dir', type=str, default='', help='if video-loader set to True, data-dir store videos frames.') parser.add_argument( '--video-loader', action='store_true', default=True, help='if set to True, read videos directly instead of reading frames.') parser.add_argument( '--video-path', default= 'https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/tracking/Coke.mp4',
def __init__(
        self,
        root_bgs=os.path.expanduser(
            '/media/hp/data/BGSDecom/FrameDifference/bgs'),
        root_fgs=os.path.expanduser(
            '/media/hp/data/BGSDecom/FrameDifference/fgs'),
        setting=os.path.expanduser(
            '/home/hp/.mxnet/datasets/ucf101/ucfTrainTestlist/ucf101_train_split_2_rawframes.txt'
        ),
        train=True,
        test_mode=False,
        name_pattern='img_%05d.jpg',
        video_ext='mp4',
        is_color=True,
        modality='rgb',
        num_segments_bgs=1,
        num_segments_fgs=1,
        new_length_bgs=1,
        new_length_fgs=5,
        new_step_bgs=1,
        new_step_fgs=1,
        new_width=340,
        new_height=256,
        target_width=224,
        target_height=224,
        temporal_jitter=False,
        video_loader=False,
        use_decord=False,
        transform=None):
    """Two-stream UCF101 dataset split into background (bgs) and
    foreground (fgs) frame directories.

    Stores every option as an attribute, prepares the chosen video
    backend (decord or mmcv) when ``video_loader`` is set, builds the
    clip list from ``setting``, and resolves the frame-name pattern.
    NOTE(review): the hard-coded default paths look machine-specific --
    callers presumably always pass their own roots/setting.
    """
    super(UCF101_2stream, self).__init__()
    from gluoncv.utils.filesystem import try_import_cv2, try_import_decord, try_import_mmcv
    self.cv2 = try_import_cv2()
    self.root_bgs = root_bgs
    self.root_fgs = root_fgs
    self.setting = setting
    self.train = train
    self.test_mode = test_mode
    self.is_color = is_color
    self.modality = modality
    self.num_segments_bgs = num_segments_bgs
    self.num_segments_fgs = num_segments_fgs
    self.new_height = new_height
    self.new_width = new_width
    self.new_length_fgs = new_length_fgs
    self.new_length_bgs = new_length_bgs
    self.new_step_bgs = new_step_bgs
    self.new_step_fgs = new_step_fgs
    # Total frame span covered by one sample of each stream.
    self.skip_length_bgs = self.new_length_bgs * self.new_step_bgs
    self.skip_length_fgs = self.new_length_fgs * self.new_step_fgs
    self.target_height = target_height
    self.target_width = target_width
    self.transform = transform
    self.temporal_jitter = temporal_jitter  # False by default
    self.video_loader = video_loader
    self.video_ext = video_ext
    self.use_decord = use_decord
    if self.video_loader:
        # Reading whole video files: pick the decoding backend.
        if self.use_decord:
            self.decord = try_import_decord()
        else:
            self.mmcv = try_import_mmcv()
    # self.classes, self.class_to_idx = self._find_classes(root)
    self.clips = self._make_dataset(root_bgs, root_fgs, setting)
    if len(self.clips) == 0:
        raise (RuntimeError(
            "Found 0 video clips in subfolders of: " + root_bgs + "\n"
            "Check your data directory (opt.data-dir)."))
    # Frame-name pattern: caller-supplied value wins, otherwise fall back
    # to a modality-specific default.
    if name_pattern:
        self.name_pattern = name_pattern
    else:
        if self.modality == "rgb":
            self.name_pattern = "img_%05d.jpg"
        elif self.modality == "flow":
            self.name_pattern = "flow_%s_%05d.jpg"
def __init__(self,
             root,
             setting,
             train=True,
             test_mode=False,
             name_pattern='img_%05d.jpg',
             video_ext='mp4',
             is_color=True,
             modality='rgb',
             num_segments=1,
             num_crop=1,
             new_length=1,
             new_step=1,
             new_width=340,
             new_height=256,
             target_width=224,
             target_height=224,
             temporal_jitter=False,
             video_loader=False,
             use_decord=False,
             slowfast=False,
             slow_temporal_stride=16,
             fast_temporal_stride=2,
             data_aug='v1',
             lazy_init=False,
             transform=None):
    """Generic video-classification dataset.

    Stores every option as an attribute, validates the SlowFast stride
    constraints, prepares the chosen video backend (decord or mmcv) when
    ``video_loader`` is set, and — unless ``lazy_init`` — builds the clip
    list from ``setting`` immediately.

    Raises
    ------
    AssertionError
        If the SlowFast configuration is inconsistent.
    RuntimeError
        If ``setting`` yields no clips (and ``lazy_init`` is False).
    """
    super(VideoClsCustom, self).__init__()
    from gluoncv.utils.filesystem import try_import_cv2, try_import_decord, try_import_mmcv
    self.cv2 = try_import_cv2()
    self.root = root
    self.setting = setting
    self.train = train
    self.test_mode = test_mode
    self.is_color = is_color
    self.modality = modality
    self.num_segments = num_segments
    self.num_crop = num_crop
    self.new_height = new_height
    self.new_width = new_width
    self.new_length = new_length
    self.new_step = new_step
    # Total frame span covered by one sampled clip.
    self.skip_length = self.new_length * self.new_step
    self.target_height = target_height
    self.target_width = target_width
    self.transform = transform
    self.temporal_jitter = temporal_jitter
    self.name_pattern = name_pattern
    self.video_loader = video_loader
    self.video_ext = video_ext
    self.use_decord = use_decord
    self.slowfast = slowfast
    self.slow_temporal_stride = slow_temporal_stride
    self.fast_temporal_stride = fast_temporal_stride
    self.data_aug = data_aug
    self.lazy_init = lazy_init

    if self.slowfast:
        # SlowFast reads both streams in a single consecutive-frame pass,
        # which constrains the stride/step settings below.
        # Fixed message: it previously read "multiples of
        # slow_temporal_stride", wrongly naming the same variable twice.
        assert slow_temporal_stride % fast_temporal_stride == 0, \
            'slow_temporal_stride needs to be a multiple of fast_temporal_stride, please set it accordingly.'
        assert not temporal_jitter, \
            'Slowfast dataloader does not support temporal jitter. Please set temporal_jitter=False.'
        assert new_step == 1, \
            'Slowfast dataloader only support consecutive frames reading, please set new_step=1.'

    if self.video_loader:
        # Reading whole video files: pick the decoding backend.
        if self.use_decord:
            self.decord = try_import_decord()
        else:
            self.mmcv = try_import_mmcv()

    if not self.lazy_init:
        self.clips = self._make_dataset(root, setting)
        if len(self.clips) == 0:
            raise (RuntimeError(
                "Found 0 video clips in subfolders of: " + root + "\n"
                "Check your data directory (opt.data-dir)."))