class YTVOSDataset(Dataset):
    """Few-shot episode dataset built on the YouTube-VOS training annotations.

    In training mode every item is an episode for one class: query frames
    from one randomly drawn video plus one frame from each of
    ``support_frame`` randomly drawn videos of the same class.

    In test mode the dataset is stateful and expects items to be requested in
    increasing index order: the first item of every class draws the support
    set and stores the remaining videos of that class on the instance; each
    following item of the same class returns the next stored query video
    together with empty support lists.
    """

    def __init__(self,
                 data_path=None,
                 train=True,
                 valid=False,
                 set_index=1,
                 finetune_idx=None,
                 support_frame=5,
                 query_frame=1,
                 sample_per_class=10,
                 transforms=None,
                 another_transform=None):
        """Load the annotations and build the per-class video lists.

        Args:
            data_path: Project root that contains ``data/Youtube-VOS``.
                Defaults to ``~/Lab/DANet`` when ``None``.
            train: Selects episode sampling (``True``) or sequential test
                iteration (``False``).
            valid: When ``True`` the held-out class list is used even if
                ``train`` is ``True``.
            set_index: Fold number; class ``n + 1`` (``n`` in ``0..39``) is
                held out when ``n % 4 == set_index - 1``.
            finetune_idx: If given, restrict the class list to the single
                class at this position.
            support_frame: Number of support videos (one frame each).
            query_frame: Number of query frames sampled in training mode.
            sample_per_class: Number of training items generated per class.
            transforms: Callable ``(frames, masks) -> (frames, masks)``.
            another_transform: Optional callable used for the support set in
                test mode instead of ``transforms``.
        """
        self.train = train
        self.valid = valid
        self.set_index = set_index
        self.support_frame = support_frame
        self.query_frame = query_frame
        self.sample_per_class = sample_per_class
        self.transforms = transforms
        self.another_transform = another_transform

        if data_path is None:
            data_path = os.path.join(os.path.expanduser('~'), 'Lab/DANet')
        data_dir = os.path.join(data_path, 'data', 'Youtube-VOS')
        self.img_dir = os.path.join(data_dir, 'train', 'JPEGImages')
        self.ann_file = os.path.join(data_dir, 'train', 'train.json')

        self.load_annotations()

        print('data set index: ', set_index)
        held_out = set_index - 1
        self.train_list = [n + 1 for n in range(40) if n % 4 != held_out]
        self.valid_list = [n + 1 for n in range(40) if n % 4 == held_out]

        if train and not valid:
            self.class_list = self.train_list
        else:
            self.class_list = self.valid_list
        if finetune_idx is not None:
            self.class_list = [self.class_list[finetune_idx]]

        # One sorted list of video ids per class, aligned with class_list.
        self.video_ids = []
        for class_id in self.class_list:
            class_videos = self.ytvos.getVidIds(catIds=class_id)
            class_videos.sort()
            self.video_ids.append(class_videos)

        if not self.train:
            # One test item per video of a class, minus the videos that are
            # consumed as the support set.
            self.test_video_classes = [
                list_id
                for list_id, class_videos in enumerate(self.video_ids)
                for _ in range(len(class_videos) - support_frame)
            ]

        if self.train:
            self.length = len(self.class_list) * sample_per_class
        else:
            self.length = len(self.test_video_classes)

    @staticmethod
    def _index_video_frames(vid_info, annos):
        """Add per-frame object/class indexes to ``vid_info`` in place.

        Writes the keys ``'dir'`` (first path component of the first file
        name), ``'objects'`` (annotation ids present in each frame),
        ``'classes'`` (category ids present in each frame) and
        ``'class_frames'`` (category id -> frame indexes containing it).

        Raises:
            AssertionError: If an annotation does not carry exactly one
                segmentation entry per frame.
        """
        vid_name = vid_info['file_names'][0].split('/')[0]
        vid_info['dir'] = vid_name
        frame_len = vid_info['length']

        frame_object = [[] for _ in range(frame_len)]
        frame_class = [[] for _ in range(frame_len)]
        category_set = set()

        for anno in annos:  # instance-level annotations
            assert len(anno['segmentations']) == frame_len, (
                vid_name, len(anno['segmentations']), vid_info['length'])
            for frame_idx, segmentation in enumerate(anno['segmentations']):
                if segmentation is None:
                    continue
                frame_object[frame_idx].append(anno['id'])
                frame_class[frame_idx].append(anno['category_id'])
                category_set.add(anno['category_id'])

        vid_info['objects'] = frame_object
        vid_info['classes'] = frame_class
        vid_info['class_frames'] = {
            class_id: [
                i for i in range(frame_len) if class_id in frame_class[i]
            ]
            for class_id in category_set
        }

    def load_annotations(self):
        """Read the annotation file and index every video by frame."""
        self.ytvos = YTVOS(self.ann_file)
        self.vid_ids = self.ytvos.getVidIds()
        self.vid_infos = self.ytvos.vids
        for vid, vid_info in self.vid_infos.items():
            self._index_video_frames(vid_info, self.ytvos.vidToAnns[vid])

    @staticmethod
    def _select_frame_window(frame_list, anchor_frame, frame_num):
        """Return ``frame_num`` frames of ``frame_list`` around an anchor.

        * ``frame_num <= 1``: just the anchor.
        * ``frame_num <= len(frame_list)``: a run of consecutive entries that
          ends at the anchor, or the first ``frame_num`` entries when the
          anchor lies within them.
        * otherwise: the whole list, padded by repeating its last entry.
        """
        if frame_num <= 1:
            return [anchor_frame]

        frame_len = len(frame_list)
        if frame_num <= frame_len:
            anchor_idx = frame_list.index(anchor_frame)
            if anchor_idx < frame_num:
                begin_idx, end_idx = 0, frame_num
            else:
                begin_idx, end_idx = anchor_idx - frame_num + 1, anchor_idx + 1
            return frame_list[begin_idx:end_idx]

        return [frame_list[min(i, frame_len - 1)] for i in range(frame_num)]

    def get_GT_byclass(self, vid, class_id, frame_num=1, test=False):
        """Load frames and binary masks of ``class_id`` from one video.

        Args:
            vid: Video id (key of ``self.vid_infos``).
            class_id: Category whose pixels become foreground.
            frame_num: Number of frames to return; ignored when ``test``.
            test: When ``True`` every frame that contains ``class_id`` is
                returned.

        Returns:
            ``(frames, masks)``: two lists of arrays, one entry per selected
            frame. Masks are 1 where an instance of ``class_id`` is present
            and 0 elsewhere.
        """
        vid_info = self.vid_infos[vid]
        frame_list = vid_info['class_frames'][class_id]

        # The anchor is always drawn, even in test mode, so the sequence of
        # random numbers consumed does not depend on the mode.
        anchor_frame = random.sample(frame_list, 1)[0]
        if test:
            frame_num = len(frame_list)
        choice_frame = self._select_frame_window(frame_list, anchor_frame,
                                                 frame_num)

        frames = []
        for frame_idx in choice_frame:
            img_path = os.path.join(self.img_dir,
                                    vid_info['file_names'][frame_idx])
            # Context manager releases the file handle deterministically.
            with Image.open(img_path) as img:
                frames.append(np.array(img))

        masks = []
        for frame_id in choice_frame:
            mask = None
            for object_id in vid_info['objects'][frame_id]:
                ann = self.ytvos.loadAnns(object_id)[0]
                if ann['category_id'] not in self.class_list:
                    continue
                # Instances of other classes contribute an all-zero mask.
                track_id = 1 if ann['category_id'] == class_id else 0
                temp_mask = self.ytvos.annToMask(ann, frame_id)
                if mask is None:
                    mask = temp_mask * track_id
                else:
                    mask += temp_mask * track_id
            assert mask is not None
            mask[mask > 0] = 1  # overlapping instances collapse to 1
            masks.append(mask)

        return frames, masks

    def __gettrainitem__(self, idx):
        """Build one training episode for the class that ``idx`` maps to.

        Returns:
            ``(query_frames, query_masks, support_frames, support_masks,
            class_id)``.
        """
        list_id = idx // self.sample_per_class
        class_id = self.class_list[list_id]
        vid_set = self.video_ids[list_id]

        query_vid = random.sample(vid_set, 1)
        support_vid = random.sample(vid_set, self.support_frame)

        query_frames, query_masks = self.get_GT_byclass(
            query_vid[0], class_id, self.query_frame)

        support_frames, support_masks = [], []
        for one_vid in support_vid:
            one_frame, one_mask = self.get_GT_byclass(one_vid, class_id, 1)
            support_frames += one_frame
            support_masks += one_mask

        if self.transforms is not None:
            query_frames, query_masks = self.transforms(
                query_frames, query_masks)
            support_frames, support_masks = self.transforms(
                support_frames, support_masks)

        return (query_frames, query_masks, support_frames, support_masks,
                class_id)

    def __gettestitem__(self, idx):
        """Return the next query video of the class that ``idx`` maps to.

        Relies on ``self.query_ids`` / ``self.query_idx`` set by the first
        item of the class, so items must be requested in index order.

        Returns:
            ``(query_frames, query_masks, support_frames, support_masks,
            class_id, vid_name, begin_new)``. The support lists are empty
            unless ``begin_new`` is ``True``.
        """
        begin_new = (
            idx == 0
            or self.test_video_classes[idx] != self.test_video_classes[idx - 1]
        )
        list_id = self.test_video_classes[idx]
        class_id = self.class_list[list_id]
        vid_set = self.video_ids[list_id]

        support_frames, support_masks = [], []
        if begin_new:
            support_vid = random.sample(vid_set, self.support_frame)
            support_lookup = set(support_vid)
            # Every video not used as support becomes a query, in order.
            self.query_ids = [
                vid for vid in vid_set if vid not in support_lookup
            ]
            self.query_idx = -1
            for one_vid in support_vid:
                one_frame, one_mask = self.get_GT_byclass(
                    one_vid, class_id, 1)
                support_frames += one_frame
                support_masks += one_mask

        self.query_idx += 1
        query_vid = self.query_ids[self.query_idx]
        query_frames, query_masks = self.get_GT_byclass(
            query_vid, class_id, test=True)

        if self.transforms is not None:
            query_frames, query_masks = self.transforms(
                query_frames, query_masks)
            if begin_new:
                if self.another_transform is not None:
                    support_frames, support_masks = self.another_transform(
                        support_frames, support_masks)
                else:
                    support_frames, support_masks = self.transforms(
                        support_frames, support_masks)

        vid_name = self.vid_infos[query_vid]['dir']
        return (query_frames, query_masks, support_frames, support_masks,
                class_id, vid_name, begin_new)

    def __getitem__(self, idx):
        """Dispatch to the training or test item builder."""
        if self.train:
            return self.__gettrainitem__(idx)
        return self.__gettestitem__(idx)

    def __len__(self):
        """Number of items computed in ``__init__``."""
        return self.length

    def get_class_list(self):
        """Return the list of class ids this dataset instance iterates."""
        return self.class_list
class YTVOSDataset(CustomDataset):
    """YouTube-VOS video instance dataset with optional flow-pair sampling.

    Three index lists are built in ``__init__``:

    * ``img_ids``: ``(video_idx, frame_idx)`` for annotated frames, indexing
      ``vid_info['filenames']``.
    * ``img_ids_all``: ``(video_idx, frame_idx)`` for every file found in the
      video folder, indexing ``vid_info['filenames_all']``.
    * ``img_ids_pairs``: ``((video_idx, key_idx), (video_idx, cur_idx))``
      where ``cur`` is an annotated, non-first frame with at least one valid
      box and ``key`` is a randomly chosen earlier frame; both index
      ``vid_info['filenames_all']``.
    """

    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer',
               'owl', 'surfboard', 'airplane', 'truck', 'zebra', 'tiger',
               'elephant', 'snowboard', 'boat', 'shark', 'mouse', 'frog',
               'eagle', 'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False,
                 every_frame=False,
                 is_flow=False,
                 flow_test=False):
        """Load annotations, scan the frame folders and set up transforms.

        Args:
            ann_file: Path of the annotation file passed to ``YTVOS``.
            img_prefix: Directory that contains one folder per video.
            img_scale: A ``(long_edge, short_edge)`` tuple or a list of them.
            img_norm_cfg: Keyword arguments for ``ImageTransform``.
            size_divisor: Padding divisor forwarded to ``ImageTransform``.
            proposal_file: Optional proposal file for ``load_proposals``.
            num_max_proposals: Maximum number of proposals kept per image.
            flip_ratio: Probability of horizontal flip, in ``[0, 1]``.
            with_mask: Whether ``gt_masks`` is added to training samples.
            with_crowd: Whether ``gt_bboxes_ignore`` is added.
            with_label: Whether ``gt_labels`` is added.
            with_track: Whether ``gt_pids`` is added (``prepare_train_img``).
            extra_aug: Keyword arguments for ``ExtraAugmentation`` or None.
            aug_ref_bbox_param: ``(center_offset, size_perturb)`` used by
                ``bbox_aug`` on reference boxes, or None.
            resize_keep_ratio: Forwarded as ``keep_ratio`` when resizing.
            test_mode: Selects ``prepare_test_img`` in ``__getitem__``.
            every_frame: Iterate ``img_ids_all`` instead of ``img_ids``.
            is_flow: Iterate ``img_ids_pairs`` in training mode.
            flow_test: With ``is_flow``, use
                ``prepare_train_flow_test_img``.
        """
        # prefix of images path
        self.img_prefix = img_prefix

        # load annotations (and proposals)
        self.vid_infos = self.load_annotations(ann_file)
        self.every_frame = every_frame
        self.is_flow = is_flow
        self.flow_test = flow_test

        if self.flow_test or self.is_flow:
            self.cuda = True
        # NOTE(review): the flag is reset unconditionally right after being
        # set, so the detector below is never loaded and
        # `prepare_train_flow_test_img` (which reads `self.det_model`) cannot
        # work in this state. Kept as in the original because it looks like a
        # deliberate switch-off -- confirm before removing either line.
        self.cuda = False
        if self.cuda:
            from mmcv import Config
            from mmdet.models import build_detector
            from mmcv.runner import load_checkpoint
            cfg = Config.fromfile(
                "../configs/masktrack_rcnn_r50_fpn_1x_flow_youtubevos.py")
            self.det_model = build_detector(cfg.model,
                                            train_cfg=cfg.train_cfg,
                                            test_cfg=cfg.test_cfg)
            load_checkpoint(self.det_model,
                            "../results/20200312-180434/epoch_9.pth")
            self.det_model = self.det_model.cuda()
            self.det_model.eval()
            for param in self.det_model.parameters():
                param.requires_grad = False

        # Set indexes for data loading
        img_ids = []  # training frames which have annotations
        img_ids_all = []  # all training frames
        img_ids_pairs = []  # flow data pairs
        for idx, vid_info in enumerate(self.vid_infos):
            vid_name = vid_info['filenames'][0].split('/')[0]
            folder_path = osp.join(self.img_prefix, vid_name)
            files = os.listdir(folder_path)
            files.sort()
            vid_info['filenames_all'] = [
                osp.join(vid_name, file) for file in files
            ]
            for _id in range(len(files)):
                img_ids_all.append((idx, _id))
                frame_name = vid_info['filenames_all'][_id]
                is_anno = frame_name in vid_info['filenames']
                if not (is_anno and _id > 0):
                    continue
                # having annotation and is not the first frame.
                ann_idx = vid_info['filenames'].index(frame_name)
                ann = self.get_ann_info(idx, ann_idx)
                # skip the image if there is no valid gt bbox
                if len(ann['bboxes']) == 0:
                    continue
                # Randomly select a key frame 1..9 frames back, never before
                # frame 0. `randint` needs low < high, so the exclusive upper
                # bound is clamped to 2: for `_id == 1` the only possible gap
                # is 1 (the original bound of 1 raised ValueError).
                gap_high = max(2, min(10, _id))
                key_id = _id - np.random.randint(1, gap_high)
                img_ids_pairs.append(((idx, key_id), (idx, _id)))
            for frame_id in range(len(vid_info['filenames'])):
                img_ids.append((idx, frame_id))
        self.img_ids = img_ids
        self.img_ids_all = img_ids_all
        self.img_ids_pairs = img_ids_pairs

        if proposal_file is not None:
            self.proposals = self.load_proposals(proposal_file)
        else:
            self.proposals = None

        # filter images with no annotation during training
        if not test_mode:
            self.img_ids = [
                (v, f) for (v, f) in self.img_ids
                if len(self.get_ann_info(v, f)['bboxes'])
            ]

        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg

        # max proposals per image
        self.num_max_proposals = num_max_proposals
        # flip ratio
        self.flip_ratio = flip_ratio
        assert flip_ratio >= 0 and flip_ratio <= 1
        # padding border to ensure the image size can be divided by
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor

        # with mask or not (reserved field, takes no effect)
        self.with_mask = with_mask
        # some datasets provide bbox annotations as ignore/crowd/difficult,
        # if `with_crowd` is True, then these info is returned.
        self.with_crowd = with_crowd
        # with label is False for RPN
        self.with_label = with_label
        self.with_track = with_track
        # params for augmenting bbox in the reference frame
        self.aug_ref_bbox_param = aug_ref_bbox_param
        # in test mode or not
        self.test_mode = test_mode

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()
        # transforms
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # if use extra augmentation
        if extra_aug is not None:
            self.extra_aug = ExtraAugmentation(**extra_aug)
        else:
            self.extra_aug = None

        # image rescale if keep ratio
        self.resize_keep_ratio = resize_keep_ratio

    def __len__(self):
        """Length of the index list selected by the mode flags."""
        if self.every_frame:
            return len(self.img_ids_all)
        elif self.is_flow:
            return len(self.img_ids_pairs)
        else:
            return len(self.img_ids)

    def __getitem__(self, idx):
        """Build the sample at ``idx`` for the configured mode."""
        if self.test_mode:
            if self.every_frame:
                return self.prepare_test_img(self.img_ids_all[idx])
            return self.prepare_test_img(self.img_ids[idx])
        if self.is_flow:
            if self.flow_test:
                return self.prepare_train_flow_test_img(
                    self.img_ids_pairs[idx])
            return self.prepare_train_flow_img(self.img_ids_pairs[idx])
        return self.prepare_train_img(self.img_ids[idx])

    def load_annotations(self, ann_file):
        """Load the annotation file and return the list of video infos.

        Also sets ``self.ytvos``, ``self.cat_ids``, ``self.cat2label``
        (category id -> 1-based label) and ``self.vid_ids``. Each video info
        gets a ``'filenames'`` alias for its ``'file_names'`` entry.
        """
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        self.cat2label = {
            cat_id: i + 1
            for i, cat_id in enumerate(self.cat_ids)
        }
        self.vid_ids = self.ytvos.getVidIds()
        vid_infos = []
        for vid_id in self.vid_ids:
            info = self.ytvos.loadVids([vid_id])[0]
            info['filenames'] = info['file_names']
            vid_infos.append(info)
        return vid_infos

    def get_ann_info(self, idx, frame_id):
        """Parse the annotations of video ``idx`` at annotated ``frame_id``."""
        vid_id = self.vid_infos[idx]['id']
        ann_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        ann_info = self.ytvos.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0. The flag array has ``len(self)`` entries, so the
        video of entry ``i`` is looked up in the same index list that
        ``__len__`` measures (the original always read ``self.img_ids``,
        which mislabels flow pairs and overruns in every-frame mode).
        """
        if self.every_frame:
            vid_indices = [vid for vid, _ in self.img_ids_all]
        elif self.is_flow:
            vid_indices = [key[0] for key, _ in self.img_ids_pairs]
        else:
            vid_indices = [vid for vid, _ in self.img_ids]

        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i, vid_idx in enumerate(vid_indices):
            vid_info = self.vid_infos[vid_idx]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        """Randomly shift and rescale ``(x1, y1, x2, y2)`` boxes.

        Centers move by up to ``aug_ref_bbox_param[0]`` box sizes and sizes
        are scaled by ``1 +/- aug_ref_bbox_param[1]``; results are clipped to
        ``img_size`` given as ``(height, width, ...)`` and returned as
        ``float32``.
        """
        assert self.aug_ref_bbox_param is not None
        center_off = self.aug_ref_bbox_param[0]
        size_perturb = self.aug_ref_bbox_param[1]

        n_bb = bbox.shape[0]
        # bbox center offset
        center_offs = (2 * np.random.rand(n_bb, 2) - 1) * center_off
        # bbox resize ratios
        resize_ratios = (2 * np.random.rand(n_bb, 2) - 1) * size_perturb + 1
        # bbox: x1, y1, x2, y2
        centers = (bbox[:, :2] + bbox[:, 2:]) / 2.
        sizes = bbox[:, 2:] - bbox[:, :2]
        new_centers = centers + center_offs * sizes
        new_sizes = sizes * resize_ratios
        new_x1y1 = new_centers - new_sizes / 2.
        new_x2y2 = new_centers + new_sizes / 2.
        c_min = [0, 0]
        c_max = [img_size[1], img_size[0]]
        new_x1y1 = np.clip(new_x1y1, c_min, c_max)
        new_x2y2 = np.clip(new_x2y2, c_min, c_max)
        return np.hstack((new_x1y1, new_x2y2)).astype(np.float32)

    def sample_ref(self, idx):
        """Sample another frame of the same video that is in ``img_ids``."""
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        valid_samples = [
            (vid, i) for i in range(len(vid_info['filenames']))
            if i != frame_id and (vid, i) in self.img_ids
        ]
        assert len(valid_samples) > 0
        return random.choice(valid_samples)

    @staticmethod
    def _resize_bilinear(feat_map, size=(48, 64)):
        """Resize a 4-D tensor to ``size`` with bilinear interpolation."""
        return torch.nn.functional.interpolate(feat_map,
                                               size,
                                               mode='bilinear',
                                               align_corners=True)

    def _extract_pair_features(self, key_img, cur_img):
        """Run ``self.det_model.extract_feat`` on a key/current frame pair.

        Both arrays are moved to the GPU; if the key frame's last two
        dimensions are not ``(384, 640)`` both frames are resized to that
        size. Returns ``(key_tensor, cur_tensor, key_feats, cur_feats)`` with
        the batch dimension squeezed out of every feature map.
        """
        key_img = torch.from_numpy(key_img).cuda()
        cur_img = torch.from_numpy(cur_img).cuda()

        img_size = (384, 640)
        if key_img.shape[-2:] != img_size:
            key_img = self._resize_bilinear(key_img.unsqueeze(0),
                                            img_size).squeeze(0)
            cur_img = self._resize_bilinear(cur_img.unsqueeze(0),
                                            img_size).squeeze(0)

        key_feats, _ = self.det_model.extract_feat(key_img.unsqueeze(0))
        cur_feats, _ = self.det_model.extract_feat(cur_img.unsqueeze(0))
        key_feats = [feat_map.squeeze(0) for feat_map in key_feats]
        cur_feats = [feat_map.squeeze(0) for feat_map in cur_feats]
        return key_img, cur_img, key_feats, cur_feats

    @staticmethod
    def _split_proposal_scores(proposals):
        """Split an ``(n, 4)`` or ``(n, 5)`` proposal array into boxes/scores.

        Raises:
            AssertionError: If the second dimension is neither 4 nor 5.
        """
        if proposals.shape[1] not in (4, 5):
            raise AssertionError(
                'proposals should have shapes (n, 4) or (n, 5), '
                'but found {}'.format(proposals.shape))
        if proposals.shape[1] == 5:
            return proposals[:, :4], proposals[:, 4, None]
        return proposals, None

    @staticmethod
    def _mask_scale_factor(w_orig, h_orig, img_shape, scale_factor):
        """Pick the scale argument handed to ``mask_transform``.

        For landscape originals the resized ``(w, h, w, h)`` is used instead
        of ``scale_factor``.
        NOTE(review): presumably works around how ``MaskTransform`` treats
        tuple scales -- confirm against its implementation.
        """
        if w_orig > h_orig:
            h, w = img_shape[0], img_shape[1]
            return (w, h, w, h)
        return scale_factor

    def prepare_train_flow_test_img(self, idx):
        """Prepare a key/current frame pair with detector features.

        Requires ``self.det_model``. Returns a dict with the two image
        tensors (``key_img``, ``cur_img``) and their feature-map lists
        (``key_img_feats``, ``cur_img_feats``).
        """
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]
        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))

        # apply transforms (same flip and scale for both frames)
        flip = bool(np.random.rand() < self.flip_ratio)
        img_scale = random_scale(self.img_scales)  # sample a scale
        cur_img, _, _, _ = self.img_transform(
            cur_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        cur_img = cur_img.copy()
        key_img, _, _, _ = self.img_transform(
            key_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()

        key_img, cur_img, key_feats, cur_feats = self._extract_pair_features(
            key_img, cur_img)
        return dict(key_img=key_img,
                    cur_img=cur_img,
                    key_img_feats=key_feats,
                    cur_img_feats=cur_feats)

    def prepare_train_flow_img(self, idx):
        """Prepare a flow training sample from a key/current frame pair.

        The key frame is resized with scale ``(1280, 720)`` and stored as
        ``img``; the current frame is resized with ``(640, 360)`` and stored
        as ``ref_img``. Ground truth comes from the current frame. Returns
        ``None`` when there are no proposals or no valid boxes.
        """
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]
        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))
        h_orig, w_orig, _ = cur_img.shape

        # load proposals if necessary
        scores = None
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            proposals, scores = self._split_proposal_scores(proposals)

        # Annotations are indexed by position in the annotated-frame list.
        ann_idx = vid_info['filenames'].index(
            vid_info['filenames_all'][cur_frame_id])
        ann = self.get_ann_info(vid, ann_idx)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']
        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            cur_img, gt_bboxes, gt_labels = self.extra_aug(
                cur_img, gt_bboxes, gt_labels)

        # apply transforms
        flip = bool(np.random.rand() < self.flip_ratio)
        img_scales = [(1280, 720), (640, 360)]
        cur_img, img_shape, pad_shape, scale_factor = self.img_transform(
            cur_img, img_scales[1], flip, keep_ratio=self.resize_keep_ratio)
        if not isinstance(scale_factor, float):
            scale_factor = tuple(scale_factor)
        cur_img = cur_img.copy()
        key_img, _, _, _ = self.img_transform(
            key_img, img_scales[0], flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()

        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape,
                                            scale_factor, flip)
            if scores is not None:
                proposals = np.hstack([proposals, scores])
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore,
                                                   img_shape, scale_factor,
                                                   flip)
        if self.with_mask:
            gt_masks = self.mask_transform(
                ann['masks'], pad_shape,
                self._mask_scale_factor(w_orig, h_orig, img_shape,
                                        scale_factor), flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(cur_frame_id == 0),
                        flip=flip)

        data = dict(
            img=DC(to_tensor(key_img), stack=True),
            ref_img=DC(to_tensor(cur_img), stack=True),
            img_meta=DC(img_meta, cpu_only=True),
            gt_bboxes=DC(to_tensor(gt_bboxes)),
        )
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        data['train_flow'] = True

        if self.cuda:
            _, _, key_feats, cur_feats = self._extract_pair_features(
                key_img, cur_img)
            data['key_feature_maps'] = key_feats
            data['cur_feature_maps'] = cur_feats
        return data

    def prepare_train_img(self, idx):
        """Prepare a training sample: a frame plus a sampled reference frame.

        Returns ``None`` when there are no proposals or no valid boxes.
        ``gt_pids`` maps each ground-truth object to its 1-based position in
        the reference frame's object list, or 0 when it is absent there.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        # load image
        if self.is_flow or self.every_frame:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames_all'][frame_id]))
        else:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        h_orig, w_orig, _ = img.shape
        _, ref_frame_id = self.sample_ref(idx)
        ref_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][ref_frame_id]))

        # load proposals if necessary
        scores = None
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            proposals, scores = self._split_proposal_scores(proposals)

        ann = self.get_ann_info(vid, frame_id)
        ref_ann = self.get_ann_info(vid, ref_frame_id)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']
        ref_bboxes = ref_ann['bboxes']
        # obj ids attribute does not exist in current annotation
        # need to add it
        ref_ids = ref_ann['obj_ids']
        gt_ids = ann['obj_ids']
        # compute matching of reference frame with current frame
        # 0 denote there is no matching
        gt_pids = [ref_ids.index(i) + 1 if i in ref_ids else 0 for i in gt_ids]
        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms (same flip and scale for both frames)
        flip = bool(np.random.rand() < self.flip_ratio)
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        if not isinstance(scale_factor, float):
            scale_factor = tuple(scale_factor)
        img = img.copy()
        ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
            ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        ref_img = ref_img.copy()

        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape,
                                            scale_factor, flip)
            if scores is not None:
                proposals = np.hstack([proposals, scores])
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        ref_bboxes = self.bbox_transform(ref_bboxes, ref_img_shape,
                                         ref_scale_factor, flip)
        if self.aug_ref_bbox_param is not None:
            ref_bboxes = self.bbox_aug(ref_bboxes, ref_img_shape)
        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore,
                                                   img_shape, scale_factor,
                                                   flip)
        if self.with_mask:
            gt_masks = self.mask_transform(
                ann['masks'], pad_shape,
                self._mask_scale_factor(w_orig, h_orig, img_shape,
                                        scale_factor), flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(frame_id == 0),
                        flip=flip)

        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=DC(to_tensor(ref_img), stack=True),
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=DC(to_tensor(ref_bboxes)))
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            data['gt_pids'] = DC(to_tensor(gt_pids))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        return data

    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping).

        Returns a dict with ``img`` (list of tensors) and ``img_meta`` (list
        of ``DC`` metas): one entry per scale, plus one flipped entry per
        scale when ``flip_ratio > 0``.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        is_anno = True
        if self.every_frame:
            file_name = vid_info['filenames_all'][frame_id]
            is_anno = file_name in vid_info['filenames']
        else:
            file_name = vid_info['filenames'][frame_id]
        img = mmcv.imread(osp.join(self.img_prefix, file_name))
        proposal = None

        def prepare_single(img,
                           frame_id,
                           scale,
                           flip,
                           file_name,
                           proposal=None,
                           is_anno=True):
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'],
                                        vid_info['width'], 3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             file_name=file_name,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip,
                             is_anno=is_anno)
            if proposal is not None:
                if proposal.shape[1] == 5:
                    score = proposal[:, 4, None]
                    proposal = proposal[:, :4]
                else:
                    score = None
                _proposal = self.bbox_transform(proposal, img_shape,
                                                scale_factor, flip)
                if score is not None:
                    _proposal = np.hstack([_proposal, score])
                _proposal = to_tensor(_proposal)
            else:
                _proposal = None
            return _img, _img_meta, _proposal

        imgs = []
        img_metas = []
        proposals = []
        for scale in self.img_scales:
            _img, _img_meta, _proposal = prepare_single(
                img, frame_id, scale, False, file_name, proposal, is_anno)
            imgs.append(_img)
            img_metas.append(DC(_img_meta, cpu_only=True))
            proposals.append(_proposal)
            if self.flip_ratio > 0:
                # `frame_id` was missing from this call in the original,
                # which shifted every positional argument by one.
                _img, _img_meta, _proposal = prepare_single(
                    img, frame_id, scale, True, file_name, proposal, is_anno)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
                proposals.append(_proposal)
        return dict(img=imgs, img_meta=img_metas)

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotation of one frame.

        Args:
            ann_info (list[dict]): Instance annotations of a video; each
                holds per-frame lists under ``bboxes``, ``areas`` and
                ``segmentations``.
            frame_id (int): Index into those per-frame lists.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes,
                bboxes_ignore, labels, obj_ids, and with ``with_mask`` also
                masks, mask_polys, poly_lens.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []

        for ann in ann_info:
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            if bbox is None:
                continue  # instance not present in this frame
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_ids.append(ann['id'])
                gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                # valid polygons have >= 3 points (6 coordinates)
                mask_polys = [p for p in segm if len(p) >= 6]
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(len(p) for p in mask_polys)

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann
class YTVOSDatasetTASUF(CustomDataset):
    """YouTube-VOS style video dataset that pairs a frame with a reference sequence.

    Every sample is addressed by a ``(video_index, frame_id)`` tuple stored in
    ``self.img_ids``. In training mode a sample consists of the target frame
    plus a randomly drawn sequence of 1 to 8 reference frames from the same
    video (see ``sample_ref_seq``); in test mode only the target frame is
    prepared, optionally at several scales and flipped.
    """

    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer',
               'owl', 'surfboard', 'airplane', 'truck', 'zebra', 'tiger',
               'elephant', 'snowboard', 'boat', 'shark', 'mouse', 'frog',
               'eagle', 'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False):
        """Load the annotations and configure the transforms.

        Args:
            ann_file: Path of the annotation file handed to ``YTVOS``.
            img_prefix: Directory prefix joined with each frame file name.
            img_scale: A scale tuple or a list of scale tuples.
            img_norm_cfg: Keyword arguments forwarded to ``ImageTransform``.
            size_divisor: Padding divisor forwarded to ``ImageTransform``.
            proposal_file: Optional proposal file; loaded when not ``None``.
            num_max_proposals: Maximum number of proposals kept per image.
            flip_ratio: Probability of flipping a training sample, in [0, 1].
            with_mask: Whether ``gt_masks`` is added to training samples.
            with_crowd: Whether ``gt_bboxes_ignore`` is added to samples.
            with_label: Whether ``gt_labels`` is added to samples.
            with_track: Whether ``gt_pids`` is added to samples.
            extra_aug: Optional keyword arguments for ``ExtraAugmentation``.
            aug_ref_bbox_param: Optional ``(center_off, size_perturb)`` pair
                used by ``bbox_aug`` to jitter the reference boxes.
            resize_keep_ratio: Forwarded to the image transform as
                ``keep_ratio``.
            test_mode: When true, frames are not filtered by annotation and
                ``__getitem__`` returns test samples.
        """
        # largest allowed distance between consecutive sampled reference frames
        self.max_gap = 3
        # prefix of images path
        self.img_prefix = img_prefix

        # load annotations (and proposals)
        self.vid_infos = self.load_annotations(ann_file)
        self.img_ids = [(idx, frame_id)
                        for idx, vid_info in enumerate(self.vid_infos)
                        for frame_id in range(len(vid_info['filenames']))]
        if proposal_file is not None:
            self.proposals = self.load_proposals(proposal_file)
        else:
            self.proposals = None

        # filter images with no annotation during training
        if not test_mode:
            self.img_ids = [
                (v, f) for v, f in self.img_ids
                if len(self.get_ann_info(v, f)['bboxes'])
            ]

        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg

        # max proposals per image
        self.num_max_proposals = num_max_proposals
        # flip ratio
        self.flip_ratio = flip_ratio
        assert 0 <= flip_ratio <= 1
        # padding border to ensure the image size can be divided by
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor

        # whether gt_masks is emitted by prepare_train_img
        self.with_mask = with_mask
        # some datasets provide bbox annotations as ignore/crowd/difficult,
        # if `with_crowd` is True, then these info is returned.
        self.with_crowd = with_crowd
        # with label is False for RPN
        self.with_label = with_label
        self.with_track = with_track
        # params for augmenting bbox in the reference frame
        self.aug_ref_bbox_param = aug_ref_bbox_param
        # in test mode or not
        self.test_mode = test_mode

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()

        # transforms
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # if use extra augmentation
        if extra_aug is not None:
            self.extra_aug = ExtraAugmentation(**extra_aug)
        else:
            self.extra_aug = None

        # image rescale if keep ratio
        self.resize_keep_ratio = resize_keep_ratio

        # [JH]
        self.max_bboxes_per_frame = 0

    def __len__(self):
        """Return the number of (video, frame) samples."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Return the prepared sample at ``idx``.

        In training mode ``prepare_train_img`` may return ``None`` for an
        unusable sample; the neighbouring sample is tried instead until one
        succeeds (sampling is random, so a retry can succeed).
        """
        if self.test_mode:
            return self.prepare_test_img(self.img_ids[idx])
        data = self.prepare_train_img(self.img_ids[idx])
        while data is None:
            try:
                data = self.prepare_train_img(self.img_ids[idx + 1])
            except Exception:
                # Best effort: idx + 1 may be out of range or fail to load,
                # so fall back to the previous sample. Unlike a bare
                # ``except`` this lets KeyboardInterrupt/SystemExit through.
                data = self.prepare_train_img(self.img_ids[idx - 1])
        return data

    def load_annotations(self, ann_file):
        """Load ``ann_file`` with ``YTVOS`` and return the list of video infos.

        Also sets ``self.ytvos``, ``self.cat_ids``, ``self.cat2label``
        (category id -> 1-based label) and ``self.vid_ids``. Each video info
        gets a ``'filenames'`` alias of its ``'file_names'`` entry.
        """
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        self.cat2label = {
            cat_id: i + 1
            for i, cat_id in enumerate(self.cat_ids)
        }
        self.vid_ids = self.ytvos.getVidIds()
        vid_infos = []
        for vid_id in self.vid_ids:
            info = self.ytvos.loadVids([vid_id])[0]
            info['filenames'] = info['file_names']
            vid_infos.append(info)
        return vid_infos

    def get_ann_info(self, idx, frame_id):
        """Return the parsed annotations of frame ``frame_id`` of video ``idx``."""
        vid_id = self.vid_infos[idx]['id']
        ann_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        ann_info = self.ytvos.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i in range(len(self)):
            vid_id, _ = self.img_ids[i]
            vid_info = self.vid_infos[vid_id]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        """Randomly shift and resize boxes, clipped to the image.

        Args:
            bbox: Array indexed as ``[:, :2]`` (x1, y1) and ``[:, 2:]``
                (x2, y2).
            img_size: Sequence whose first two entries are used as
                (height, width) clipping bounds.

        Returns:
            ``float32`` array of perturbed ``x1, y1, x2, y2`` boxes.
        """
        assert self.aug_ref_bbox_param is not None
        center_off = self.aug_ref_bbox_param[0]
        size_perturb = self.aug_ref_bbox_param[1]

        n_bb = bbox.shape[0]
        # bbox center offset, uniform in [-center_off, center_off]
        center_offs = (2 * np.random.rand(n_bb, 2) - 1) * center_off
        # bbox resize ratios, uniform in [1 - size_perturb, 1 + size_perturb]
        resize_ratios = (2 * np.random.rand(n_bb, 2) - 1) * size_perturb + 1
        # bbox: x1, y1, x2, y2
        centers = (bbox[:, :2] + bbox[:, 2:]) / 2.
        sizes = bbox[:, 2:] - bbox[:, :2]
        new_centers = centers + center_offs * sizes
        new_sizes = sizes * resize_ratios
        new_x1y1 = new_centers - new_sizes / 2.
        new_x2y2 = new_centers + new_sizes / 2.
        c_min = [0, 0]
        c_max = [img_size[1], img_size[0]]
        new_x1y1 = np.clip(new_x1y1, c_min, c_max)
        new_x2y2 = np.clip(new_x2y2, c_min, c_max)
        return np.hstack((new_x1y1, new_x2y2)).astype(np.float32)

    def sample_ref(self, idx):
        """Sample another annotated frame of the same video as reference.

        Returns:
            A ``(video_index, frame_id)`` tuple different from ``idx`` that is
            present in ``self.img_ids``.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        valid_samples = []
        for i in range(len(vid_info['filenames'])):
            # check if the frame id is valid
            ref_idx = (vid, i)
            if i != frame_id and ref_idx in self.img_ids:
                valid_samples.append(ref_idx)
        assert len(valid_samples) > 0
        return random.choice(valid_samples)

    # sampling req sequence for TASUF
    # sequence length from 1 to 8
    # sequence direction => backward or forward
    def sample_ref_seq(self, idx):
        """Sample a list of reference frame ids for the sample ``idx``.

        The sequence length is drawn uniformly from 1 to 8. Frames are taken
        either from ``[frame_id, vid_len)`` (returned in descending order) or
        from ``[0, frame_id)`` (ascending order); when both sides offer at
        least ``seq_len`` frames the side is chosen with probability 0.5.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        vid_len = len(vid_info['filenames'])
        seq_len = random.randint(1, 8)

        if frame_id < seq_len:
            use_later_frames = True
        elif frame_id + seq_len > vid_len:
            use_later_frames = False
        else:
            use_later_frames = random.random() < 0.5

        if use_later_frames:
            return self.sample_ref_range(frame_id,
                                         vid_len,
                                         seq_len,
                                         backward=True)
        return self.sample_ref_range(0, frame_id, seq_len, backward=False)

    def sample_ref_range(self, start, end, seq_len, backward=False):
        """Sample ``seq_len`` frame ids from ``range(start, end)``.

        The ids are sorted and then compacted so that two consecutive ids are
        never more than ``self.max_gap`` apart. When the range holds fewer
        than ``seq_len`` ids it is repeated, so duplicates can occur.

        Args:
            start: First candidate frame id (inclusive).
            end: Last candidate frame id (exclusive).
            seq_len: Number of ids to draw.
            backward: Return the ids in descending order when true.

        Raises:
            ValueError: If the range is empty while ``seq_len`` is positive
                (the doubling loop below could never terminate).
        """
        sample_range = list(range(start, end))
        if not sample_range and seq_len > 0:
            raise ValueError(
                'cannot sample {} frames from empty range [{}, {})'.format(
                    seq_len, start, end))
        while len(sample_range) < seq_len:
            sample_range *= 2
        valid_samples = random.sample(sample_range, seq_len)
        valid_samples.sort()

        # [JW] pull later frames closer whenever a gap exceeds max_gap.
        # The gap is measured against the *current* (already shifted) value
        # of the previous element; comparing against a pre-shift copy would
        # leave gaps larger than max_gap after the first correction.
        for i in range(len(valid_samples) - 1):
            gap = valid_samples[i + 1] - valid_samples[i]
            if gap > self.max_gap:
                gap_modulation = gap - self.max_gap
                for j in range(i + 1, len(valid_samples)):
                    valid_samples[j] -= gap_modulation

        if backward:
            valid_samples.reverse()
        return valid_samples

    def prepare_train_img(self, idx):
        """Prepare a training sample: a frame plus a reference frame sequence.

        Args:
            idx: ``(video_index, frame_id)`` tuple.

        Returns:
            A dict with keys ``img``, ``ref_img`` (list), ``img_meta``,
            ``gt_bboxes`` and ``ref_bboxes`` (list), plus ``proposals``,
            ``gt_labels``, ``gt_pids`` (list), ``gt_bboxes_ignore`` and
            ``gt_masks`` depending on the configuration. ``None`` is returned
            when the frame has no proposals (if proposals are used), when any
            reference frame has no boxes, or when the frame has no gt boxes.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        # load image
        img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        ref_frame_id_seq = self.sample_ref_seq(idx)
        ref_img_seq = [
            mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames'][ref_frame_id]))
            for ref_frame_id in ref_frame_id_seq
        ]

        # load proposals if necessary
        if self.proposals is not None:
            # NOTE(review): idx is a (video, frame) tuple here; confirm the
            # loaded proposals are indexed that way.
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None

        ann = self.get_ann_info(vid, frame_id)
        ref_ann_seq = [
            self.get_ann_info(vid, ref_frame_id)
            for ref_frame_id in ref_frame_id_seq
        ]
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']

        ref_bboxes_seq = []
        for ref_ann in ref_ann_seq:
            ref_bboxes = ref_ann['bboxes']
            if len(ref_bboxes) == 0:
                return None
            ref_bboxes_seq.append(ref_bboxes)

        # obj ids attribute does not exist in current annotation
        # need to add it
        ref_ids_seq = [ref_ann['obj_ids'] for ref_ann in ref_ann_seq]
        gt_ids = ann['obj_ids']

        # compute matching of reference frame with current frame
        # 0 denote there is no matching
        id_set = set()
        for ref_ids in ref_ids_seq:
            id_set = id_set.union(set(ref_ids))
        id_set = sorted(id_set)

        # one pid list per reference frame, followed by the current frame
        gt_pids_seq = [[id_set.index(i) + 1 for i in ref_ids]
                       for ref_ids in ref_ids_seq]
        gt_pids_seq.append(
            [id_set.index(i) + 1 if i in id_set else 0 for i in gt_ids])

        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms; the same flip and scale are used for every frame
        flip = np.random.rand() < self.flip_ratio
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        img = img.copy()

        for i, ref_img in enumerate(ref_img_seq):
            ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
                ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
            ref_img_seq[i] = ref_img.copy()

        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape,
                                            scale_factor, flip)
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals

        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        for i, ref_bboxes in enumerate(ref_bboxes_seq):
            ref_bboxes_seq[i] = self.bbox_transform(ref_bboxes, ref_img_shape,
                                                    ref_scale_factor, flip)

        if self.aug_ref_bbox_param is not None:
            for i, ref_bboxes in enumerate(ref_bboxes_seq):
                # store the augmented boxes (the original assigned a
                # misspelled, undefined name here and raised NameError)
                ref_bboxes_seq[i] = self.bbox_aug(ref_bboxes, ref_img_shape)

        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore,
                                                   img_shape, scale_factor,
                                                   flip)
        if self.with_mask:
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        flip=flip)

        ref_img_DC_seq = [
            DC(to_tensor(ref_img), stack=True) for ref_img in ref_img_seq
        ]
        ref_bboxes_DC_seq = [
            DC(to_tensor(ref_bboxes)) for ref_bboxes in ref_bboxes_seq
        ]

        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=ref_img_DC_seq,
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=ref_bboxes_DC_seq)

        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            data['gt_pids'] = [
                DC(to_tensor(gt_pids)) for gt_pids in gt_pids_seq
            ]
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        return data

    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping).

        Args:
            idx: ``(video_index, frame_id)`` tuple.

        Returns:
            A dict with ``img`` (list of tensors) and ``img_meta`` (list of
            data containers); one entry per scale, plus one flipped entry per
            scale when ``flip_ratio`` is positive.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        proposal = None

        def prepare_single(img, frame_id, scale, flip, proposal=None):
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'],
                                        vid_info['width'], 3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip)
            if proposal is not None:
                if proposal.shape[1] == 5:
                    score = proposal[:, 4, None]
                    proposal = proposal[:, :4]
                else:
                    score = None
                _proposal = self.bbox_transform(proposal, img_shape,
                                                scale_factor, flip)
                _proposal = np.hstack([_proposal, score
                                       ]) if score is not None else _proposal
                _proposal = to_tensor(_proposal)
            else:
                _proposal = None
            return _img, _img_meta, _proposal

        imgs = []
        img_metas = []
        proposals = []
        for scale in self.img_scales:
            _img, _img_meta, _proposal = prepare_single(
                img, frame_id, scale, False, proposal)
            imgs.append(_img)
            img_metas.append(DC(_img_meta, cpu_only=True))
            proposals.append(_proposal)
            if self.flip_ratio > 0:
                # frame_id must be passed here as well; without it every
                # following positional argument is shifted by one.
                _img, _img_meta, _proposal = prepare_single(
                    img, frame_id, scale, True, proposal)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
                proposals.append(_proposal)
        data = dict(img=imgs, img_meta=img_metas)
        return data

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotation.

        Args:
            ann_info (list[dict]): Per-object annotations of a video; each
                holds per-frame lists under 'bboxes', 'areas' and
                'segmentations'.
            frame_id (int): Index of the frame to extract.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, labels,
                obj_ids, bboxes_ignore, and (when ``with_mask``) masks,
                mask_polys, poly_lens.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for ann in ann_info:
            # each ann describes one object over all frames of the video
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            if bbox is None:
                # object is not present in this frame
                continue
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            # convert x, y, w, h to inclusive x1, y1, x2, y2
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_ids.append(ann['id'])
                gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                mask_polys = [
                    p for p in segm if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                poly_lens = [len(p) for p in mask_polys]
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(poly_lens)

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann
class YoutubeVIS(data.Dataset):
    """`YoutubeVIS <https://youtube-vos.org/dataset/vis/>`_ Dataset.

    Args:
        image_path (string): Root directory joined with each frame file name.
        info_file (string): Annotation file handed to ``YTVOS``.
        configs: Object providing the sampling settings read by
            ``pull_video``: ``images_per_video``, ``use_all_frames``,
            ``all_frame_direction``, ``frame_offset_lb``,
            ``frame_offset_ub`` and ``frame_offset_multiplier``.
        transform (callable, optional): A function/transform that augments
            the raw images. Transforms whose class name contains ``"Video"``
            are called with the extra ``require_seeds``/``seeds`` keywords.
        target_transform (callable, optional): A function/transform that
            takes in the target annotations of a frame and transforms them.
        dataset_name (string): Stored as ``self.name``.
        has_gt (bool): Whether ground-truth annotations are available.
    """

    def __init__(self,
                 image_path,
                 info_file,
                 configs,
                 transform=None,
                 target_transform=YoutubeVISAnnotationTransform(),
                 dataset_name='YouTube VIS',
                 has_gt=True):
        # Do this here because we have too many things named COCO
        from pycocotools.ytvos import YTVOS

        self.root = image_path
        self.configs = configs

        logger = logging.getLogger("yolact.dataset")
        logger.info('Loading annotations into memory...')
        tic = time.time()
        # silence the loader's own prints
        with contextlib.redirect_stdout(io.StringIO()):
            self.coco = YTVOS(info_file)

        self.ids = list(self.coco.vidToAnns.keys())
        if len(self.ids) == 0 or not has_gt:
            self.ids = list(self.coco.vids.keys())

        logger.info('{} videos loaded in {:0.2f}s.'.format(
            len(self.ids),
            time.time() - tic))

        self.transform = transform
        self.target_transform = target_transform

        self.name = dataset_name
        self.has_gt = has_gt

    @staticmethod
    def _frame_number(file_name):
        """Return the 5-digit frame number that precedes the file extension."""
        return int(file_name[-9:-4])

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: ``(video_frames, extra_data)`` where ``video_frames`` is a
                list of ``(image, (target, masks, num_crowds))`` tuples, one
                per sampled frame, and ``extra_data`` is the matching list of
                dicts produced by ``pull_video``.
        """
        video_frames, extra_data = self.pull_video(index)
        video_frames = [(
            im,
            (gt, masks, num_crowds),
        ) for im, gt, masks, h, w, num_crowds in video_frames]
        return video_frames, extra_data

    def pull_video(self,
                   index,
                   return_on_failure=False,
                   full_video=False,
                   max_images=-1):
        """Sample frames from a video and load them.

        Args:
            index (int): Index into ``self.ids`` of the video to try first.
            return_on_failure (bool): Return ``None`` instead of re-sampling
                another video when the requested one cannot be used.
            full_video (bool): Use every frame instead of a random clip.
            max_images (int): With ``full_video``, keep at most this many
                leading frames; ``-1`` keeps all of them.

        Returns:
            tuple: ``(frame_results, extra_data)``. ``frame_results`` is a
                list of ``(image, target, masks, height, width, num_crowds)``
                tuples; ``extra_data`` is a list of dicts whose ``'idx'`` is
                ``(frame_id, annot_id)``. ``None`` if ``return_on_failure``
                is set and the video is unusable.
        """
        vid_id = self.ids[index]
        seq_len = self.configs.images_per_video

        # sample vid_id with enough length
        while True:
            vid = self.coco.loadVids(vid_id)[0]
            annot_length = len(vid['file_names'])
            if not full_video and annot_length < seq_len:
                # The video is too short for a clip. A new video has to be
                # drawn before retrying, otherwise this loop never ends.
                if return_on_failure:
                    return None
                index = np.random.randint(len(self))
                vid_id = self.ids[index]
                continue
            vid_name = vid['file_names'][0].split('/')[0]

            # Generate target starts.
            if self.has_gt:
                ann_ids = self.coco.getAnnIds(vidIds=vid_id)
                # Target has {'segmentation', 'area', iscrowd', 'image_id', 'bboxes', 'category_id'}
                target = self.coco.loadAnns(ann_ids)
            else:
                target = []

            # Separate out crowd annotations. These are annotations that signify a large crowd of
            # objects of said class, where there is no annotation for each individual object. Both
            # during testing and training, consider these crowds as neutral.
            crowd = [x for x in target if ('iscrowd' in x and x['iscrowd'])]
            target = [
                x for x in target if not ('iscrowd' in x and x['iscrowd'])
            ]
            num_crowds = len(crowd)

            for x in crowd:
                x['category_id'] = -1

            # This is so we ensure that all crowd annotations are at the end of the array
            target += crowd
            # Generate target ends.

            # shuffling and sample a small range of video here
            if full_video:
                annot_idx = np.arange(0, annot_length, 1)
                frame_idx = np.asarray([
                    self._frame_number(vid['file_names'][idx])
                    for idx in range(annot_length)
                ])
                if self.configs.use_all_frames:
                    # expand to every frame number between the first and last
                    # annotated frame; unannotated frames get annot index -1
                    key_frame_idx = frame_idx
                    frame_idx = np.arange(frame_idx[0], frame_idx[-1] + 1, 1)
                    have_annot = np.asarray(
                        [int(idx in key_frame_idx) for idx in frame_idx])
                    annot_idx = np.add.accumulate(have_annot) * have_annot - 1

                if max_images != -1:
                    eval_frames = min(max_images, len(frame_idx))
                    start_idx = 0
                    frame_idx = frame_idx[start_idx:start_idx + eval_frames]
                    annot_idx = annot_idx[start_idx:start_idx + eval_frames]
            elif self.configs.use_all_frames:
                rand_idx = np.arange(0, annot_length - seq_len)
                np.random.shuffle(rand_idx)

                direction = 1
                if self.configs.all_frame_direction == 'allway':
                    if np.random.rand() > 0.5:
                        direction *= -1
                elif self.configs.all_frame_direction == 'forward':
                    # Note: forward warping needs to sample a 'previous frame'
                    direction *= -1
                elif self.configs.all_frame_direction == 'backward':
                    pass
                else:
                    raise ValueError("Unexpected frame direction: %s" %
                                     self.configs.all_frame_direction)

                start_idx = rand_idx[0]
                if direction < 0:
                    start_idx += self.configs.images_per_video
                start_frame_idx = self._frame_number(
                    vid['file_names'][start_idx])
                annot_idx = [start_idx]
                frame_idx = [start_frame_idx]

                extra_frame_idx = []
                extra_annot_idx = []
                if self.configs.images_per_video > 0:
                    offset_lb, offset_ub = self.configs.frame_offset_lb, self.configs.frame_offset_ub
                    first_frame = self._frame_number(vid['file_names'][0])
                    last_frame = self._frame_number(vid['file_names'][-1])
                    fidx = frame_idx[-1]
                    # offsets (relative to fidx) that stay inside the video
                    lb, ub = first_frame - fidx, last_frame - fidx
                    if direction == -1:
                        ub = -offset_lb
                        lb = max(lb, -offset_ub)
                    else:
                        lb = offset_lb
                        ub = min(ub, offset_ub)
                    assert lb <= ub + 1, "{}, {}".format(lb, ub)
                    assert self.configs.frame_offset_multiplier == 1, "frame_offset_multiplier deprecated."
                    for _ in range(self.configs.images_per_video):
                        frame_diff = np.random.randint(lb, ub + 1)
                        ref_idx = fidx + frame_diff
                        assert first_frame <= ref_idx <= last_frame, \
                            "{} <= {} <= {}".format(first_frame, ref_idx,
                                                    last_frame)
                        extra_frame_idx += [ref_idx]
                        # reference frames carry no annotation index
                        extra_annot_idx += [-1]

                extra_frame_idx = list(sorted(extra_frame_idx, reverse=True))

                annot_idx += extra_annot_idx
                frame_idx += extra_frame_idx
                annot_idx = np.asarray(annot_idx)
                frame_idx = np.asarray(frame_idx)
            else:
                rand_idx = np.arange(0, annot_length - seq_len + 1)
                np.random.shuffle(rand_idx)
                start_idx = rand_idx[0]
                annot_idx = np.arange(start_idx, start_idx + seq_len, 1)
                frame_idx = np.asarray([
                    self._frame_number(vid['file_names'][idx])
                    for idx in annot_idx
                ])

            has_targets = all(
                self.target_in_frame(target, annot_id, true_on_reference=True)
                for annot_id in annot_idx)
            if has_targets:
                break
            if return_on_failure:
                return None
            # not every sampled frame has targets: re-select a video
            index = np.random.randint(len(self))
            vid_id = self.ids[index]

        frame_results = []
        extra_data = []

        while True:
            try:
                for idx, (frame_id, annot_id) in enumerate(
                        zip(frame_idx.tolist(), annot_idx.tolist())):
                    extra = {}
                    # FIXME: little bit hacky for full frames, maybe fix this using annotation files
                    frame_id_str = "%05d" % frame_id
                    file_name = vid['file_names'][0]
                    file_name = file_name[:-9] + frame_id_str + file_name[-4:]
                    prev_frame_id = frame_idx[idx - 1] if idx > 0 else -1
                    prev_annot_id = annot_idx[idx - 1] if idx > 0 else -1
                    if idx == 0:
                        # the first frame produces the augmentation seeds
                        # that are reused for the remaining frames
                        seeds, (im, gt, masks, h, w,
                                num_crowds) = self.pull_frame(
                                    vid_name, (frame_id, annot_id),
                                    (prev_frame_id, prev_annot_id),
                                    file_name,
                                    target,
                                    num_crowds,
                                    require_seeds=True)
                    else:
                        im, gt, masks, h, w, num_crowds = self.pull_frame(
                            vid_name, (frame_id, annot_id),
                            (prev_frame_id, prev_annot_id),
                            file_name,
                            target,
                            num_crowds,
                            seeds=seeds)

                    extra['idx'] = (
                        frame_id,
                        annot_id,
                    )
                    frame_results.append((
                        im,
                        gt,
                        masks,
                        h,
                        w,
                        num_crowds,
                    ))
                    extra_data.append(extra)
            except ValueError:
                # pull_frame raises ValueError("reseed") when augmentation
                # removed every ground truth; start the clip over
                logger = logging.getLogger("yolact.dataset")
                logger.warning('Resampling with reseed signal...')
                frame_results.clear()
                extra_data.clear()
                continue
            break

        return frame_results, extra_data

    def __len__(self):
        """Return the number of videos."""
        return len(self.ids)

    @staticmethod
    def target_in_frame(target, frame_id, true_on_reference=False):
        """Return whether any object in ``target`` is segmented in ``frame_id``.

        A negative ``frame_id`` marks a frame without annotation index; for
        those ``true_on_reference`` is returned.
        """
        if frame_id < 0:
            return true_on_reference
        for obj in target:
            if obj['segmentations'][frame_id] is not None:
                return True
        return False

    def pull_frame(self,
                   vid_name,
                   frame_annot_id,
                   prev_frame_annot_id,
                   file_name,
                   target,
                   num_crowds,
                   require_seeds=False,
                   seeds=None):
        """Load one frame with its masks and targets and apply the transforms.

        Args:
            vid_name: Name of the video (not used by this method).
            frame_annot_id: ``(frame_id, annot_id)`` of the frame.
            prev_frame_annot_id: ``(frame_id, annot_id)`` of the previous
                frame (not used by this method).
            file_name: Frame path relative to ``self.root``.
            target: Annotations of the video.
            num_crowds: Number of crowd annotations in ``target``.
            require_seeds: Also return the seeds produced by a video
                transform.
            seeds: Seeds forwarded to a video transform.

        Returns:
            ``(image, target, masks, height, width, num_crowds)``, preceded
            by ``seeds`` as ``(seeds, tuple)`` when ``require_seeds`` is set.

        Raises:
            ValueError: When a video transform leaves no ground truth.
            AssertionError: When the image path does not exist, or a video
                transform raises ``ValueError`` on a frame without targets.
        """
        frame_id, annot_id = frame_annot_id
        path = osp.join(self.root, file_name)
        assert osp.exists(path), 'Image path does not exist: {}'.format(path)

        img = cv2.imread(path)
        height, width, _ = img.shape

        target_is_in_frame = self.target_in_frame(target, annot_id)

        # stays None when the frame has no targets, so the return below is
        # well defined even without a transform
        masks = None
        if target_is_in_frame:
            # Pool all the masks for this image into one [num_objects,height,width] matrix
            masks = [
                self.coco.annToMask(obj, annot_id).reshape(-1)
                for obj in target
                if obj['segmentations'][annot_id] is not None
            ]
            masks = np.vstack(masks)
            masks = masks.reshape(-1, height, width)

        if self.target_transform is not None and target_is_in_frame:
            target = self.target_transform(target, annot_id, width, height)

        if self.transform is not None:
            if "Video" in type(self.transform).__name__:
                if target_is_in_frame:
                    target = np.array(target)
                    return_transform = self.transform(
                        img,
                        masks,
                        target[:, :4], {
                            'num_crowds': num_crowds,
                            'labels': target[:, 4]
                        },
                        require_seeds=require_seeds,
                        seeds=seeds)

                    if require_seeds:
                        seeds, (img, masks, boxes, labels) = return_transform
                    else:
                        img, masks, boxes, labels = return_transform

                    # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                    num_crowds = labels['num_crowds']
                    labels = labels['labels']

                    target = np.hstack((boxes, np.expand_dims(labels,
                                                              axis=1)))

                    if target.shape[0] == 0:
                        logger = logging.getLogger("yolact.dataset")
                        logger.warning(
                            'Augmentation output an example with no ground truth. Resampling...'
                        )
                        raise ValueError("reseed")
                else:
                    # run the transform on a dummy mask/box so the image is
                    # still augmented; np.float64 replaces the np.float alias
                    # that newer NumPy releases no longer provide
                    try:
                        return_transform = self.transform(
                            img,
                            np.zeros((1, height, width), dtype=np.float64),
                            np.array([[0., 0., 1., 1.]]), {
                                'num_crowds': 0,
                                'labels': np.array([0])
                            },
                            require_seeds=require_seeds,
                            seeds=seeds)
                    except ValueError as err:
                        # explicit raise: an ``assert False`` would be
                        # stripped under ``python -O``
                        raise AssertionError(
                            "Unexpected reseed captured with no-target instances."
                        ) from err

                    if require_seeds:
                        seeds, (img, _, _, _) = return_transform
                    else:
                        img, _, _, _ = return_transform

                    masks = None
                    target = None
            else:
                if target_is_in_frame:
                    target = np.array(target)
                    img, masks, boxes, labels = self.transform(
                        img, masks, target[:, :4], {
                            'num_crowds': num_crowds,
                            'labels': target[:, 4]
                        })

                    # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                    num_crowds = labels['num_crowds']
                    labels = labels['labels']

                    target = np.hstack((boxes, np.expand_dims(labels,
                                                              axis=1)))
                else:
                    img, _, _, _ = self.transform(
                        img, np.zeros((1, height, width), dtype=np.float64),
                        np.array([[0, 0, 1, 1]]), {
                            'num_crowds': 0,
                            'labels': np.array([0])
                        })
                    masks = None
                    target = None

        return_tuple = torch.from_numpy(img).permute(
            2, 0, 1), target, masks, height, width, num_crowds
        if require_seeds:
            return seeds, return_tuple
        return return_tuple

    def pull_image(self, index):
        '''Returns the original image object at index as a cv2 image

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to show
        Return:
            cv2 img
        '''
        # NOTE(review): this uses a COCO-style per-image call (loadImgs)
        # while the rest of the class uses the video API (loadVids) --
        # confirm the YTVOS object actually provides it.
        img_id = self.ids[index]
        path = self.coco.loadImgs(img_id)[0]['file_name']
        return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR)

    def pull_anno(self, index):
        '''Returns the original annotation of image at index

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to get annotation of
        Return:
            list: the annotations returned by ``loadAnns`` for that id
        '''
        # NOTE(review): getAnnIds is called with ``imgIds=`` here but with
        # ``vidIds=`` in pull_video -- confirm which keyword YTVOS accepts.
        img_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        return self.coco.loadAnns(ann_ids)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(
            tmp,
            self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(
            tmp,
            self.target_transform.__repr__().replace('\n',
                                                     '\n' + ' ' * len(tmp)))
        return fmt_str