Example #1
class YTVOSDataset(Dataset):
    def __init__(self,
                 data_path=None,
                 train=True,
                 valid=False,
                 set_index=1,
                 finetune_idx=None,
                 support_frame=5,
                 query_frame=1,
                 sample_per_class=10,
                 transforms=None,
                 another_transform=None):
        self.train = train
        self.valid = valid
        self.set_index = set_index
        self.support_frame = support_frame
        self.query_frame = query_frame
        self.sample_per_class = sample_per_class
        self.transforms = transforms
        self.another_transform = another_transform

        if data_path is None:
            data_path = os.path.join(os.path.expanduser('~'), 'Lab/DANet')
        data_dir = os.path.join(data_path, 'data', 'Youtube-VOS')
        self.img_dir = os.path.join(data_dir, 'train', 'JPEGImages')
        self.ann_file = os.path.join(data_dir, 'train', 'train.json')

        self.load_annotations()

        print('data set index: ', set_index)
        self.train_list = [
            n + 1 for n in range(40) if n % 4 != (set_index - 1)
        ]
        self.valid_list = [
            n + 1 for n in range(40) if n % 4 == (set_index - 1)
        ]

        if train and not valid:
            self.class_list = self.train_list
        else:
            self.class_list = self.valid_list
        if finetune_idx is not None:
            self.class_list = [self.class_list[finetune_idx]]

        self.video_ids = []
        for class_id in self.class_list:
            tmp_list = self.ytvos.getVidIds(catIds=class_id)
            tmp_list.sort()
            self.video_ids.append(tmp_list)  # list[list[video_id]]
        if not self.train:
            self.test_video_classes = []
            for i in range(len(self.class_list)):
                for j in range(len(self.video_ids[i]) -
                               support_frame):  # remove the support set
                    self.test_video_classes.append(i)

        if self.train:
            self.length = len(self.class_list) * sample_per_class
        else:
            self.length = len(self.test_video_classes)  # test

    def load_annotations(self):
        self.ytvos = YTVOS(self.ann_file)
        self.vid_ids = self.ytvos.getVidIds()  # list[2238] begin : 1
        self.vid_infos = self.ytvos.vids  # vids
        for vid, vid_info in self.vid_infos.items():  # for each vid
            vid_name = vid_info['file_names'][0].split('/')[0]  # '0043f083b5'
            vid_info['dir'] = vid_name
            frame_len = vid_info['length']  # int
            frame_object = [[] for _ in range(frame_len)]
            frame_class = [[] for _ in range(frame_len)]
            category_set = set()
            annos = self.ytvos.vidToAnns[vid]  # list[]
            for anno in annos:  # instance_level anns
                assert len(anno['segmentations']) == frame_len, (
                    vid_name, len(anno['segmentations']), vid_info['length'])
                for frame_idx in range(frame_len):
                    anno_segmentation = anno['segmentations'][frame_idx]
                    if anno_segmentation is not None:
                        frame_object[frame_idx].append(
                            anno['id'])  # add instance to vid_frame
                        frame_class[frame_idx].append(
                            anno['category_id']
                        )  # add instance class to vid_frame
                        category_set = category_set.union(
                            {anno['category_id']})
            vid_info['objects'] = frame_object
            vid_info['classes'] = frame_class
            class_frame_id = dict()
            for class_id in category_set:  # frames index for each class
                class_frame_id[class_id] = [
                    i for i in range(frame_len) if class_id in frame_class[i]
                ]
            vid_info['class_frames'] = class_frame_id

    def get_GT_byclass(self, vid, class_id, frame_num=1, test=False):
        vid_info = self.vid_infos[vid]
        frame_list = vid_info['class_frames'][class_id]
        frame_len = len(frame_list)
        choice_frame = random.sample(frame_list, 1)
        if test:
            frame_num = frame_len
        if frame_num > 1:
            if frame_num <= frame_len:
                choice_idx = frame_list.index(choice_frame[0])
                if choice_idx < frame_num:
                    begin_idx = 0
                    end_idx = frame_num
                else:
                    begin_idx = choice_idx - frame_num + 1
                    end_idx = choice_idx + 1
                choice_frame = [
                    frame_list[n] for n in range(begin_idx, end_idx)
                ]
            else:
                choice_frame = []
                for i in range(frame_num):
                    if i < frame_len:
                        choice_frame.append(frame_list[i])
                    else:
                        choice_frame.append(frame_list[frame_len - 1])
        frames = [
            np.array(
                Image.open(
                    os.path.join(self.img_dir,
                                 vid_info['file_names'][frame_idx])))
            for frame_idx in choice_frame
        ]
        masks = []
        for frame_id in choice_frame:
            object_ids = vid_info['objects'][frame_id]
            mask = None
            for object_id in object_ids:
                ann = self.ytvos.loadAnns(object_id)[0]
                if ann['category_id'] not in self.class_list:
                    continue
                track_id = 1
                if ann['category_id'] != class_id:
                    track_id = 0
                temp_mask = self.ytvos.annToMask(ann, frame_id)
                if mask is None:
                    mask = temp_mask * track_id
                else:
                    mask += temp_mask * track_id

            assert mask is not None
            mask[mask > 0] = 1
            masks.append(mask)

        return frames, masks

    def __gettrainitem__(self, idx):
        list_id = idx // self.sample_per_class
        vid_set = self.video_ids[list_id]

        query_vid = random.sample(vid_set, 1)
        support_vid = random.sample(vid_set, self.support_frame)

        query_frames, query_masks = self.get_GT_byclass(
            query_vid[0], self.class_list[list_id], self.query_frame)

        support_frames, support_masks = [], []
        for i in range(self.support_frame):
            one_frame, one_mask = self.get_GT_byclass(support_vid[i],
                                                      self.class_list[list_id],
                                                      1)
            support_frames += one_frame
            support_masks += one_mask

        if self.transforms is not None:
            query_frames, query_masks = self.transforms(
                query_frames, query_masks)
            support_frames, support_masks = self.transforms(
                support_frames, support_masks)
        return query_frames, query_masks, support_frames, support_masks, self.class_list[
            list_id]

    def __gettestitem__(self, idx):
        # random.seed()
        begin_new = False
        if idx == 0:
            begin_new = True
        else:
            if self.test_video_classes[idx] != self.test_video_classes[idx -
                                                                       1]:
                begin_new = True
        list_id = self.test_video_classes[idx]
        vid_set = self.video_ids[list_id]

        support_frames, support_masks = [], []
        if begin_new:
            support_vid = random.sample(vid_set, self.support_frame)
            query_vids = []
            for vid_id in vid_set:
                if vid_id not in support_vid:
                    query_vids.append(vid_id)
            self.query_ids = query_vids
            self.query_idx = -1
            for i in range(self.support_frame):
                one_frame, one_mask = self.get_GT_byclass(
                    support_vid[i], self.class_list[list_id], 1)
                support_frames += one_frame
                support_masks += one_mask

        self.query_idx += 1
        query_vid = self.query_ids[self.query_idx]
        query_frames, query_masks = self.get_GT_byclass(
            query_vid, self.class_list[list_id], test=True)

        if self.transforms is not None:
            query_frames, query_masks = self.transforms(
                query_frames, query_masks)
            if begin_new:
                if self.another_transform is not None:
                    support_frames, support_masks = self.another_transform(
                        support_frames, support_masks)
                else:
                    support_frames, support_masks = self.transforms(
                        support_frames, support_masks)
        vid_info = self.vid_infos[query_vid]
        vid_name = vid_info['dir']
        return query_frames, query_masks, support_frames, support_masks, self.class_list[
            list_id], vid_name, begin_new

    def __getitem__(self, idx):
        if self.train:
            return self.__gettrainitem__(idx)
        else:
            return self.__gettestitem__(idx)

    def __len__(self):
        return self.length

    def get_class_list(self):
        return self.class_list
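
A minimal usage sketch for the few-shot dataset above. The data_path below is a placeholder, and with transforms=None the raw NumPy frames/masks are returned, so the dataset is indexed directly rather than wrapped in a DataLoader:

dataset = YTVOSDataset(data_path='/path/to/DANet',  # hypothetical root holding data/Youtube-VOS
                       train=True,
                       set_index=1,
                       support_frame=5,
                       query_frame=1,
                       sample_per_class=100)
# One training episode: a query clip plus `support_frame` frames, each taken
# from a different video of the same (fold-split) class.
q_frames, q_masks, s_frames, s_masks, class_id = dataset[0]
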
Example #2
class YTVOSDataset(CustomDataset):
    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl',
               'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant',
               'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle',
               'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False,
                 every_frame=False,
                 is_flow=False,
                 flow_test=False):
        # prefix of images path
        self.img_prefix = img_prefix

        # load annotations (and proposals)
        self.vid_infos = self.load_annotations(ann_file)

        self.every_frame = every_frame
        self.is_flow = is_flow
        self.flow_test = flow_test
        # the detector is only needed to extract features for the flow branches
        if self.flow_test or self.is_flow:
            self.cuda = True
        else:
            self.cuda = False
        if self.cuda:
            from mmcv import Config
            from mmdet.models import build_detector
            from mmcv.runner import load_checkpoint
            cfg = Config.fromfile(
                "../configs/masktrack_rcnn_r50_fpn_1x_flow_youtubevos.py")
            self.det_model = build_detector(cfg.model,
                                            train_cfg=cfg.train_cfg,
                                            test_cfg=cfg.test_cfg)
            load_checkpoint(self.det_model,
                            "../results/20200312-180434/epoch_9.pth")
            self.det_model = self.det_model.cuda()
            self.det_model.eval()
            for param in self.det_model.parameters():
                param.requires_grad = False

        # Set indexes for data loading
        img_ids = []  # training frames which have annotations
        img_ids_all = []  # all training frames
        img_ids_pairs = []  # flow data pairs
        for idx, vid_info in enumerate(self.vid_infos):
            vid_name = vid_info['filenames'][0].split('/')[0]
            folder_path = osp.join(self.img_prefix, vid_name)
            files = os.listdir(folder_path)
            files.sort()
            vid_info['filenames_all'] = [
                osp.join(vid_name, file) for file in files
            ]
            for _id in range(len(files)):
                img_ids_all.append((idx, _id))
                is_anno = vid_info['filenames_all'][_id] in vid_info[
                    'filenames']
                if is_anno and _id > 0:  # has an annotation and is not the first frame
                    ann_idx = vid_info['filenames'].index(
                        vid_info['filenames_all'][_id])
                    ann = self.get_ann_info(idx, ann_idx)
                    gt_bboxes = ann['bboxes']
                    # skip the image if there is no valid gt bbox
                    if len(gt_bboxes) == 0:
                        continue
                    # random select key frame
                    key_id = _id - np.random.randint(1, min(10, _id))
                    img_ids_pairs.append(((idx, key_id), (idx, _id)))
            for frame_id in range(len(vid_info['filenames'])):
                img_ids.append((idx, frame_id))

        self.img_ids = img_ids
        self.img_ids_all = img_ids_all
        self.img_ids_pairs = img_ids_pairs

        if proposal_file is not None:
            self.proposals = self.load_proposals(proposal_file)
        else:
            self.proposals = None
        # filter images with no annotation during training
        if not test_mode:
            valid_inds = [
                i for i, (v, f) in enumerate(self.img_ids)
                if len(self.get_ann_info(v, f)['bboxes'])
            ]
            self.img_ids = [self.img_ids[i] for i in valid_inds]

        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg

        # max proposals per image
        self.num_max_proposals = num_max_proposals
        # flip ratio
        self.flip_ratio = flip_ratio
        assert flip_ratio >= 0 and flip_ratio <= 1
        # padding border to ensure the image size can be divided by
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor

        # with mask or not (reserved field, takes no effect)
        self.with_mask = with_mask
        # some datasets provide bbox annotations as ignore/crowd/difficult,
        # if `with_crowd` is True, then these info is returned.
        self.with_crowd = with_crowd
        # with label is False for RPN
        self.with_label = with_label
        self.with_track = with_track
        # params for augmenting bbox in the reference frame
        self.aug_ref_bbox_param = aug_ref_bbox_param
        # in test mode or not
        self.test_mode = test_mode

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()
        # transforms
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # if use extra augmentation
        if extra_aug is not None:
            self.extra_aug = ExtraAugmentation(**extra_aug)
        else:
            self.extra_aug = None

        # image rescale if keep ratio
        self.resize_keep_ratio = resize_keep_ratio

    def __len__(self):
        if self.every_frame:
            return len(self.img_ids_all)
        elif self.is_flow:
            return len(self.img_ids_pairs)
        else:
            return len(self.img_ids)

    def __getitem__(self, idx):
        if self.test_mode:
            if self.every_frame:
                return self.prepare_test_img(self.img_ids_all[idx])
            else:
                return self.prepare_test_img(self.img_ids[idx])
        if self.is_flow:
            if self.flow_test:
                data = self.prepare_train_flow_test_img(
                    self.img_ids_pairs[idx])
            else:
                data = self.prepare_train_flow_img(self.img_ids_pairs[idx])
        else:
            data = self.prepare_train_img(self.img_ids[idx])
        return data

    def load_annotations(self, ann_file):
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        self.cat2label = {
            cat_id: i + 1
            for i, cat_id in enumerate(self.cat_ids)
        }
        self.vid_ids = self.ytvos.getVidIds()
        vid_infos = []
        for i in self.vid_ids:
            info = self.ytvos.loadVids([i])[0]
            info['filenames'] = info['file_names']
            vid_infos.append(info)
        return vid_infos

    def get_ann_info(self, idx, frame_id):
        vid_id = self.vid_infos[idx]['id']
        ann_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        ann_info = self.ytvos.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i in range(len(self)):
            vid_id, _ = self.img_ids[i]
            vid_info = self.vid_infos[vid_id]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        assert self.aug_ref_bbox_param is not None
        center_off = self.aug_ref_bbox_param[0]
        size_perturb = self.aug_ref_bbox_param[1]

        n_bb = bbox.shape[0]
        # bbox center offset
        center_offs = (2 * np.random.rand(n_bb, 2) - 1) * center_off
        # bbox resize ratios
        resize_ratios = (2 * np.random.rand(n_bb, 2) - 1) * size_perturb + 1
        # bbox: x1, y1, x2, y2
        centers = (bbox[:, :2] + bbox[:, 2:]) / 2.
        sizes = bbox[:, 2:] - bbox[:, :2]
        new_centers = centers + center_offs * sizes
        new_sizes = sizes * resize_ratios
        new_x1y1 = new_centers - new_sizes / 2.
        new_x2y2 = new_centers + new_sizes / 2.
        c_min = [0, 0]
        c_max = [img_size[1], img_size[0]]
        new_x1y1 = np.clip(new_x1y1, c_min, c_max)
        new_x2y2 = np.clip(new_x2y2, c_min, c_max)
        bbox = np.hstack((new_x1y1, new_x2y2)).astype(np.float32)
        return bbox

    def sample_ref(self, idx):
        # sample another frame in the same sequence as reference
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        sample_range = range(len(vid_info['filenames']))
        valid_samples = []
        for i in sample_range:
            # check if the frame id is valid
            ref_idx = (vid, i)
            if i != frame_id and ref_idx in self.img_ids:
                valid_samples.append(ref_idx)
        assert len(valid_samples) > 0
        return random.choice(valid_samples)

    def prepare_train_flow_test_img(self, idx):

        # prepare a pair of images from the same sequence
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]

        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))
        h_orig, w_orig, _ = key_img.shape
        basename = osp.basename(vid_info['filenames_all'][key_frame_id])

        # apply transforms
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        cur_img, img_shape, pad_shape, scale_factor = self.img_transform(
            cur_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        if not isinstance(scale_factor, float):
            scale_factor = tuple(scale_factor)
        cur_img = cur_img.copy()
        key_img, key_img_shape, _, ref_scale_factor = self.img_transform(
            key_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()

        # trans = torchvision.transforms.ToTensor()
        key_img = torch.from_numpy(key_img).cuda()
        cur_img = torch.from_numpy(cur_img).cuda()

        def resize(feat_map, size=(48, 64)):
            """Resize feature map to certain size."""
            key_feature = torch.nn.functional.interpolate(feat_map,
                                                          size,
                                                          mode='bilinear',
                                                          align_corners=True)
            return key_feature

        img_size = (384, 640)
        if key_img.shape[-2:] != img_size:
            key_img = resize(key_img.unsqueeze(0), img_size).squeeze(0)
            cur_img = resize(cur_img.unsqueeze(0), img_size).squeeze(0)

        key_feature_maps, _ = self.det_model.extract_feat(key_img.unsqueeze(0))
        cur_feature_maps, _ = self.det_model.extract_feat(cur_img.unsqueeze(0))

        key_feature_maps = [
            feat_map.squeeze(0) for feat_map in key_feature_maps
        ]
        cur_feature_maps = [
            feat_map.squeeze(0) for feat_map in cur_feature_maps
        ]

        data = dict(key_img=key_img,
                    cur_img=cur_img,
                    key_img_feats=key_feature_maps,
                    cur_img_feats=cur_feature_maps)
        return data

    def prepare_train_flow_img(self, idx):

        # prepare a pair of images from the same sequence
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]

        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))
        h_orig, w_orig, _ = cur_img.shape
        basename = osp.basename(vid_info['filenames_all'][key_frame_id])

        # load proposals if necessary
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None
        ann_idx = vid_info['filenames'].index(
            vid_info['filenames_all'][cur_frame_id])
        ann = self.get_ann_info(vid, ann_idx)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']

        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            cur_img, gt_bboxes, gt_labels = self.extra_aug(
                cur_img, gt_bboxes, gt_labels)

        # apply transforms
        flip = True if np.random.rand() < self.flip_ratio else False

        img_scales = [(1280, 720), (640, 360)]
        # img_scale = random_scale(self.img_scales)  # sample a scale
        cur_img, img_shape, pad_shape, scale_factor = self.img_transform(
            cur_img, img_scales[1], flip, keep_ratio=self.resize_keep_ratio)
        if not isinstance(scale_factor, float):
            scale_factor = tuple(scale_factor)
        cur_img = cur_img.copy()
        key_img, key_img_shape, _, key_scale_factor = self.img_transform(
            key_img, img_scales[0], flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()
        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)

        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            if w_orig > h_orig:
                h, w = img_shape[0], img_shape[1]
                _scale_factor = tuple([w, h, w, h])
            else:
                _scale_factor = scale_factor
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           _scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(cur_frame_id == 0),
                        flip=flip)

        data = dict(
            img=DC(to_tensor(key_img), stack=True),
            ref_img=DC(to_tensor(cur_img), stack=True),
            img_meta=DC(img_meta, cpu_only=True),
            gt_bboxes=DC(to_tensor(gt_bboxes)),
            # ref_bboxes=DC(to_tensor(ref_bboxes))
        )
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        # if self.with_track:
        #     data['gt_pids'] = DC(to_tensor(gt_pids))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        data['train_flow'] = True

        if self.cuda:
            key_img_cuda = torch.from_numpy(key_img).cuda()
            cur_img_cuda = torch.from_numpy(cur_img).cuda()

            def resize(feat_map, size=(48, 64)):
                """Resize feature map to certain size."""
                key_feature = torch.nn.functional.interpolate(
                    feat_map, size, mode='bilinear', align_corners=True)
                return key_feature

            img_size = (384, 640)
            if key_img_cuda.shape[-2:] != img_size:
                key_img_cuda = resize(key_img_cuda.unsqueeze(0),
                                      img_size).squeeze(0)
                cur_img_cuda = resize(cur_img_cuda.unsqueeze(0),
                                      img_size).squeeze(0)

            key_feature_maps, _ = self.det_model.extract_feat(
                key_img_cuda.unsqueeze(0))
            cur_feature_maps, _ = self.det_model.extract_feat(
                cur_img_cuda.unsqueeze(0))

            key_feature_maps = [
                feat_map.squeeze(0) for feat_map in key_feature_maps
            ]
            cur_feature_maps = [
                feat_map.squeeze(0) for feat_map in cur_feature_maps
            ]

            data['key_feature_maps'] = key_feature_maps
            data['cur_feature_maps'] = cur_feature_maps

        return data

    def prepare_train_img(self, idx):
        # prepare a pair of images from the same sequence
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        # load image
        if self.is_flow or self.every_frame:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames_all'][frame_id]))
        else:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        h_orig, w_orig, _ = img.shape
        basename = osp.basename(vid_info['filenames'][frame_id])
        _, ref_frame_id = self.sample_ref(idx)
        ref_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][ref_frame_id]))
        # load proposals if necessary
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None

        ann = self.get_ann_info(vid, frame_id)
        ref_ann = self.get_ann_info(vid, ref_frame_id)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']
        ref_bboxes = ref_ann['bboxes']
        # obj ids attribute does not exist in current annotation
        # need to add it
        ref_ids = ref_ann['obj_ids']
        gt_ids = ann['obj_ids']
        # compute matching of reference frame with current frame
        # 0 denote there is no matching
        gt_pids = [ref_ids.index(i) + 1 if i in ref_ids else 0 for i in gt_ids]
        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        if not isinstance(scale_factor, float):
            scale_factor = tuple(scale_factor)
        img = img.copy()
        ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
            ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        ref_img = ref_img.copy()
        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        ref_bboxes = self.bbox_transform(ref_bboxes, ref_img_shape,
                                         ref_scale_factor, flip)
        if self.aug_ref_bbox_param is not None:
            ref_bboxes = self.bbox_aug(ref_bboxes, ref_img_shape)
        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            if w_orig > h_orig:
                h, w = img_shape[0], img_shape[1]
                _scale_factor = tuple([w, h, w, h])
            else:
                _scale_factor = scale_factor
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           _scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(frame_id == 0),
                        flip=flip)

        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=DC(to_tensor(ref_img), stack=True),
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=DC(to_tensor(ref_bboxes)))
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            data['gt_pids'] = DC(to_tensor(gt_pids))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        return data

    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping)"""
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        is_anno = True
        if self.every_frame:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames_all'][frame_id]))
            is_anno = vid_info['filenames_all'][frame_id] in vid_info[
                'filenames']
        else:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        proposal = None

        if self.every_frame:
            file_name = vid_info['filenames_all'][frame_id]
        else:
            file_name = vid_info['filenames'][frame_id]

        def prepare_single(img,
                           frame_id,
                           scale,
                           flip,
                           file_name,
                           proposal=None,
                           is_anno=True):
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'], vid_info['width'],
                                        3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             file_name=file_name,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip,
                             is_anno=is_anno)
            if proposal is not None:
                if proposal.shape[1] == 5:
                    score = proposal[:, 4, None]
                    proposal = proposal[:, :4]
                else:
                    score = None
                _proposal = self.bbox_transform(proposal, img_shape,
                                                scale_factor, flip)
                _proposal = np.hstack([_proposal, score
                                       ]) if score is not None else _proposal
                _proposal = to_tensor(_proposal)
            else:
                _proposal = None
            return _img, _img_meta, _proposal

        imgs = []
        img_metas = []
        proposals = []
        for scale in self.img_scales:
            _img, _img_meta, _proposal = prepare_single(
                img, frame_id, scale, False, file_name, proposal, is_anno)
            imgs.append(_img)
            img_metas.append(DC(_img_meta, cpu_only=True))
            proposals.append(_proposal)
            if self.flip_ratio > 0:
                _img, _img_meta, _proposal = prepare_single(
                    img, frame_id, scale, True, file_name, proposal, is_anno)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
                proposals.append(_proposal)
        data = dict(img=imgs, img_meta=img_metas)
        return data

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotation.

        Args:
            ann_info (list[dict]): Annotation info of an image.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, bboxes_ignore,
                labels, masks, mask_polys, poly_lens.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for i, ann in enumerate(ann_info):
            # each ann is a list of masks
            # ann:
            # bbox: list of bboxes
            # segmentation: list of segmentation
            # category_id
            # area: list of area
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            if bbox is None: continue
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_ids.append(ann['id'])
                gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                mask_polys = [
                    p for p in segm if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                poly_lens = [len(p) for p in mask_polys]
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(poly_lens)
        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann
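
A hedged construction sketch for the MaskTrack R-CNN style dataset above, written the way an mmdetection v1.x config would fill in the arguments. The annotation path, image prefix, scale and normalization values are placeholders, not taken from the original snippet:

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                    std=[58.395, 57.12, 57.375],
                    to_rgb=True)
dataset = YTVOSDataset(
    ann_file='data/youtubevos/annotations/train.json',  # hypothetical path
    img_prefix='data/youtubevos/train/JPEGImages/',
    img_scale=(640, 360),
    img_norm_cfg=img_norm_cfg,
    size_divisor=32,
    flip_ratio=0.5,
    with_mask=True,
    with_track=True)
# dataset[0] yields a dict of DataContainers: 'img', 'ref_img', 'img_meta',
# 'gt_bboxes', 'ref_bboxes', plus 'gt_labels', 'gt_pids', 'gt_masks' and
# 'gt_bboxes_ignore' for the flags enabled above.
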
Example #3
class YTVOSDatasetTASUF(CustomDataset):
    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl',
               'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant',
               'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle',
               'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False):

        self.max_gap = 3

        # prefix of images path
        self.img_prefix = img_prefix

        # load annotations (and proposals)
        self.vid_infos = self.load_annotations(ann_file)
        img_ids = []
        for idx, vid_info in enumerate(self.vid_infos):
            for frame_id in range(len(vid_info['filenames'])):
                img_ids.append((idx, frame_id))
        self.img_ids = img_ids
        if proposal_file is not None:
            self.proposals = self.load_proposals(proposal_file)
        else:
            self.proposals = None
        # filter images with no annotation during training
        if not test_mode:
            valid_inds = [
                i for i, (v, f) in enumerate(self.img_ids)
                if len(self.get_ann_info(v, f)['bboxes'])
            ]
            self.img_ids = [self.img_ids[i] for i in valid_inds]

        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg

        # max proposals per image
        self.num_max_proposals = num_max_proposals
        # flip ratio
        self.flip_ratio = flip_ratio
        assert flip_ratio >= 0 and flip_ratio <= 1
        # padding border to ensure the image size can be divided by
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor

        # with mask or not (reserved field, takes no effect)
        self.with_mask = with_mask
        # some datasets provide bbox annotations as ignore/crowd/difficult,
        # if `with_crowd` is True, then these info is returned.
        self.with_crowd = with_crowd
        # with label is False for RPN
        self.with_label = with_label
        self.with_track = with_track
        # params for augmenting bbox in the reference frame
        self.aug_ref_bbox_param = aug_ref_bbox_param
        # in test mode or not
        self.test_mode = test_mode

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()
        # transforms
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # if use extra augmentation
        if extra_aug is not None:
            self.extra_aug = ExtraAugmentation(**extra_aug)
        else:
            self.extra_aug = None

        # image rescale if keep ratio
        self.resize_keep_ratio = resize_keep_ratio

        # [JH]
        self.max_bboxes_per_frame = 0

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, idx):
        if self.test_mode:
            return self.prepare_test_img(self.img_ids[idx])
        data = self.prepare_train_img(self.img_ids[idx])
        while data is None:
            try:
                # fall back to a neighbouring sample when this one has no valid GT
                data = self.prepare_train_img(self.img_ids[idx + 1])
            except IndexError:
                data = self.prepare_train_img(self.img_ids[idx - 1])
        return data

    def load_annotations(self, ann_file):
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        self.cat2label = {
            cat_id: i + 1
            for i, cat_id in enumerate(self.cat_ids)
        }
        self.vid_ids = self.ytvos.getVidIds()
        vid_infos = []
        for i in self.vid_ids:
            info = self.ytvos.loadVids([i])[0]
            info['filenames'] = info['file_names']
            vid_infos.append(info)
        return vid_infos

    def get_ann_info(self, idx, frame_id):
        vid_id = self.vid_infos[idx]['id']
        ann_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        ann_info = self.ytvos.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i in range(len(self)):
            vid_id, _ = self.img_ids[i]
            vid_info = self.vid_infos[vid_id]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        assert self.aug_ref_bbox_param is not None
        center_off = self.aug_ref_bbox_param[0]
        size_perturb = self.aug_ref_bbox_param[1]

        n_bb = bbox.shape[0]
        # bbox center offset
        center_offs = (2 * np.random.rand(n_bb, 2) - 1) * center_off
        # bbox resize ratios
        resize_ratios = (2 * np.random.rand(n_bb, 2) - 1) * size_perturb + 1
        # bbox: x1, y1, x2, y2
        centers = (bbox[:, :2] + bbox[:, 2:]) / 2.
        sizes = bbox[:, 2:] - bbox[:, :2]
        new_centers = centers + center_offs * sizes
        new_sizes = sizes * resize_ratios
        new_x1y1 = new_centers - new_sizes / 2.
        new_x2y2 = new_centers + new_sizes / 2.
        c_min = [0, 0]
        c_max = [img_size[1], img_size[0]]
        new_x1y1 = np.clip(new_x1y1, c_min, c_max)
        new_x2y2 = np.clip(new_x2y2, c_min, c_max)
        bbox = np.hstack((new_x1y1, new_x2y2)).astype(np.float32)
        return bbox

    def sample_ref(self, idx):
        # sample another frame in the same sequence as reference
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        sample_range = range(len(vid_info['filenames']))
        valid_samples = []
        for i in sample_range:
            # check if the frame id is valid
            ref_idx = (vid, i)
            if i != frame_id and ref_idx in self.img_ids:
                valid_samples.append(ref_idx)
        assert len(valid_samples) > 0
        return random.choice(valid_samples)

    # sample a reference frame sequence for TASUF
    # sequence length: 1 to 8
    # sequence direction: backward or forward
    def sample_ref_seq(self, idx):
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        vid_len = len(vid_info['filenames'])
        seq_len = random.randint(1, 8)
        if frame_id < seq_len:
            valid_samples = self.sample_ref_range(frame_id,
                                                  vid_len,
                                                  seq_len,
                                                  backward=True)
        elif frame_id + seq_len > vid_len:
            valid_samples = self.sample_ref_range(0,
                                                  frame_id,
                                                  seq_len,
                                                  backward=False)
        else:
            if random.random() < 0.5:
                valid_samples = self.sample_ref_range(frame_id,
                                                      vid_len,
                                                      seq_len,
                                                      backward=True)
            else:
                valid_samples = self.sample_ref_range(0,
                                                      frame_id,
                                                      seq_len,
                                                      backward=False)
        return valid_samples

    def sample_ref_range(self, start, end, seq_len, backward=False):
        sample_range = list(range(start, end))
        while len(sample_range) < seq_len:
            sample_range *= 2
        valid_samples = random.sample(sample_range, seq_len)
        valid_samples.sort()
        # [JW]
        for i, v in enumerate(valid_samples[:-1]):
            if valid_samples[i + 1] - v > self.max_gap:
                gap_modulation = valid_samples[i + 1] - v - self.max_gap
                for j in range(i + 1, len(valid_samples)):
                    valid_samples[j] -= gap_modulation
        if backward:
            valid_samples.reverse()
        return valid_samples

    def prepare_train_img(self, idx):
        # prepare an image and a reference sequence from the same video
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        # load image
        img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        basename = osp.basename(vid_info['filenames'][frame_id])

        ref_frame_id_seq = self.sample_ref_seq(idx)
        ref_img_seq = [
            mmcv.imread(osp.join(self.img_prefix,
                                 vid_info['filenames'][ref_frame_id]))
            for ref_frame_id in ref_frame_id_seq
        ]

        # load proposals if necessary
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None

        ann = self.get_ann_info(vid, frame_id)

        ref_ann_seq = [
            self.get_ann_info(vid, ref_frame_id)
            for ref_frame_id in ref_frame_id_seq
        ]

        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']

        ref_bboxes_seq = []
        for i, ref_ann in enumerate(ref_ann_seq):
            ref_bboxes = ref_ann['bboxes']
            if len(ref_bboxes) == 0:
                return None
            ref_bboxes_seq.append(ref_bboxes)

        # obj ids attribute does not exist in current annotation
        # need to add it
        ref_ids_seq = [ref_ann['obj_ids'] for ref_ann in ref_ann_seq]

        gt_ids = ann['obj_ids']
        # compute matching of reference frame with current frame
        # 0 denote there is no matching
        id_set = set()
        for ref_ids in ref_ids_seq:
            id_set = id_set.union(set(ref_ids))
        id_set = sorted(list(id_set))
        gt_pids_seq = []
        for ref_ids in ref_ids_seq:
            gt_pids_seq.append([id_set.index(i) + 1 for i in ref_ids])
        gt_pids_seq.append(
            [id_set.index(i) + 1 if i in id_set else 0 for i in gt_ids])

        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        img = img.copy()

        for i, ref_img in enumerate(ref_img_seq):
            ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
                ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
            ref_img = ref_img.copy()
            ref_img_seq[i] = ref_img

        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        for i, ref_bboxes in enumerate(ref_bboxes_seq):
            ref_bboxes = self.bbox_transform(ref_bboxes, ref_img_shape,
                                             ref_scale_factor, flip)
            ref_bboxes_seq[i] = ref_bboxes
        if self.aug_ref_bbox_param is not None:
            for i, ref_bboxes in enumerate(ref_bboxes_seq):
                ref_bboxes = self.bbox_aug(ref_bboxes, ref_img_shape)
                ref_bboxes_seq[i] = ref_bboxes

        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        flip=flip)

        ref_img_DC_seq = []
        for ref_img in ref_img_seq:
            ref_img_DC_seq.append(DC(to_tensor(ref_img), stack=True))
        ref_bboxes_DC_seq = []
        for ref_bboxes in ref_bboxes_seq:
            ref_bboxes_DC_seq.append(DC(to_tensor(ref_bboxes)))
        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=ref_img_DC_seq,
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=ref_bboxes_DC_seq)
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            gt_pids_DC_seq = []
            for gt_pids in gt_pids_seq:
                gt_pids_DC_seq.append(DC(to_tensor(gt_pids)))
            data['gt_pids'] = gt_pids_DC_seq
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)

        return data

    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping)"""
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        proposal = None

        def prepare_single(img, frame_id, scale, flip, proposal=None):
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'], vid_info['width'],
                                        3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip)
            if proposal is not None:
                if proposal.shape[1] == 5:
                    score = proposal[:, 4, None]
                    proposal = proposal[:, :4]
                else:
                    score = None
                _proposal = self.bbox_transform(proposal, img_shape,
                                                scale_factor, flip)
                _proposal = np.hstack([_proposal, score
                                       ]) if score is not None else _proposal
                _proposal = to_tensor(_proposal)
            else:
                _proposal = None
            return _img, _img_meta, _proposal

        imgs = []
        img_metas = []
        proposals = []
        for scale in self.img_scales:
            _img, _img_meta, _proposal = prepare_single(
                img, frame_id, scale, False, proposal)
            imgs.append(_img)
            img_metas.append(DC(_img_meta, cpu_only=True))
            proposals.append(_proposal)
            if self.flip_ratio > 0:
                _img, _img_meta, _proposal = prepare_single(
                    img, frame_id, scale, True, proposal)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
                proposals.append(_proposal)
        data = dict(img=imgs, img_meta=img_metas)
        return data

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotation.

        Args:
            ann_info (list[dict]): Annotation info of an image.
            frame_id (int): Index of the frame within the video.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, bboxes_ignore,
                labels, masks, mask_polys, poly_lens.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size as the image.
        # 2. polys: each mask consists of one or several polygons, each
        #    polygon a list of floats.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for i, ann in enumerate(ann_info):
            # Each ann describes one object track across the video:
            #   bboxes: list of per-frame bboxes
            #   segmentations: list of per-frame segmentations
            #   areas: list of per-frame areas
            #   category_id: class of the object
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            if bbox is None: continue
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_ids.append(ann['id'])
                gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                mask_polys = [
                    p for p in segm if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                poly_lens = [len(p) for p in mask_polys]
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(poly_lens)
        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann
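
The _parse_ann_info method above is essentially a per-frame filter: objects absent in the frame (bbox is None) or degenerate (non-positive area, or a side shorter than one pixel) are skipped, surviving [x, y, w, h] boxes are converted to [x1, y1, x2, y2], and iscrowd objects go into a separate ignore array. Below is a minimal standalone sketch of that filter using only NumPy; the annotation dicts and their values are illustrative, not taken from the dataset.

import numpy as np

def parse_frame_bboxes(ann_info, frame_id):
    """Toy version of the bbox half of _parse_ann_info (masks omitted)."""
    gt_bboxes, gt_bboxes_ignore = [], []
    for ann in ann_info:
        bbox = ann['bboxes'][frame_id]   # [x, y, w, h] or None
        area = ann['areas'][frame_id]
        if bbox is None:
            continue                     # object not visible in this frame
        x1, y1, w, h = bbox
        if area is None or area <= 0 or w < 1 or h < 1:
            continue                     # degenerate box
        xyxy = [x1, y1, x1 + w - 1, y1 + h - 1]
        (gt_bboxes_ignore if ann.get('iscrowd') else gt_bboxes).append(xyxy)

    def to_array(boxes):
        return (np.array(boxes, dtype=np.float32)
                if boxes else np.zeros((0, 4), dtype=np.float32))

    return to_array(gt_bboxes), to_array(gt_bboxes_ignore)

# Illustrative annotations: one normal object, one crowd, one absent in frame 0.
anns = [
    dict(bboxes=[[10, 20, 30, 40]], areas=[1200], iscrowd=0),
    dict(bboxes=[[0, 0, 50, 50]],   areas=[2500], iscrowd=1),
    dict(bboxes=[None],             areas=[None], iscrowd=0),
]
boxes, ignore = parse_frame_bboxes(anns, frame_id=0)
print(boxes)   # [[10. 20. 39. 59.]]
print(ignore)  # [[ 0.  0. 49. 49.]]
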
Example #4
class YoutubeVIS(data.Dataset):
    """`YoutubeVIS <https://youtube-vos.org/dataset/vis/>`_ Dataset.
    Args:
        image_path (string): Root directory where the frame images are stored.
        info_file (string): Path to the annotation json file.
        configs: Configuration object with sampling options (e.g.
                 images_per_video, use_all_frames).
        transform (callable, optional): A function/transform that augments the
                                        raw images.
        target_transform (callable, optional): A function/transform that takes
                                        in the target (bbox) and transforms it.
        dataset_name (string): Human-readable name of the dataset.
        has_gt (bool): Whether ground-truth annotations are available.
    """
    def __init__(self,
                 image_path,
                 info_file,
                 configs,
                 transform=None,
                 target_transform=YoutubeVISAnnotationTransform(),
                 dataset_name='YouTube VIS',
                 has_gt=True):
        # Do this here because we have too many things named COCO
        from pycocotools.ytvos import YTVOS

        self.root = image_path
        self.configs = configs

        logger = logging.getLogger("yolact.dataset")
        logger.info('Loading annotations into memory...')
        tic = time.time()
        with contextlib.redirect_stdout(io.StringIO()):
            self.coco = YTVOS(info_file)

        self.ids = list(self.coco.vidToAnns.keys())
        if len(self.ids) == 0 or not has_gt:
            self.ids = list(self.coco.vids.keys())

        logger.info('{} videos loaded in {:0.2f}s.'.format(
            len(self.ids),
            time.time() - tic))

        self.transform = transform
        self.target_transform = target_transform

        self.name = dataset_name
        self.has_gt = has_gt

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (video_frames, extra_data), where video_frames is a
                   list of (image, (target, masks, num_crowds)) tuples and
                   target is the object returned by ``coco.loadAnns``.
        """
        video_frames, extra_data = self.pull_video(index)
        video_frames = [(
            im,
            (gt, masks, num_crowds),
        ) for im, gt, masks, h, w, num_crowds in video_frames]
        return video_frames, extra_data

    def pull_video(self,
                   index,
                   return_on_failure=False,
                   full_video=False,
                   max_images=-1):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (frame_results, extra_data), where frame_results is a
                   list of (image, target, masks, height, width, num_crowds)
                   tuples and target is the object returned by ``coco.loadAnns``.
                   num_crowds is 0 when a frame has no crowd annotations.
        """
        vid_id = self.ids[index]

        seq_len = self.configs.images_per_video

        # sample vid_id with enough length
        while True:
            vid = self.coco.loadVids(vid_id)[0]
            annot_length = len(vid['file_names'])
            if not full_video and annot_length < seq_len:
                # Video too short for the requested clip length; resample
                # another video id instead of spinning on the same one.
                index = np.random.randint(len(self))
                vid_id = self.ids[index]
                continue
            vid_name = vid['file_names'][0].split('/')[0]

            # Generate target starts.
            if self.has_gt:
                target = self.coco.vidToAnns[vid_id]
                ann_ids = self.coco.getAnnIds(vidIds=vid_id)

                # Target has {'segmentation', 'area', 'iscrowd', 'image_id', 'bboxes', 'category_id'}
                target = self.coco.loadAnns(ann_ids)
            else:
                target = []

            # Separate out crowd annotations. These are annotations that signify a large crowd of
            # objects of said class, where there is no annotation for each individual object. Both
            # during testing and training, consider these crowds as neutral.
            crowd = [x for x in target if ('iscrowd' in x and x['iscrowd'])]
            target = [
                x for x in target if not ('iscrowd' in x and x['iscrowd'])
            ]
            num_crowds = len(crowd)

            for x in crowd:
                x['category_id'] = -1

            # This is so we ensure that all crowd annotations are at the end of the array
            target += crowd
            # Generate target ends.

            # Shuffle and sample a small window of the video here.
            if full_video:
                annot_idx = np.arange(0, annot_length, 1)
                frame_idx = np.asarray([
                    int(vid['file_names'][idx][-9:-4])
                    for idx in range(annot_length)
                ])
                if self.configs.use_all_frames:
                    key_frame_idx = frame_idx
                    frame_idx = np.arange(frame_idx[0], frame_idx[-1] + 1, 1)
                    have_annot = np.asarray(
                        [int(idx in key_frame_idx) for idx in frame_idx])
                    annot_idx = np.add.accumulate(have_annot) * have_annot - 1

                if max_images != -1:
                    eval_frames = min(max_images, len(frame_idx))
                    # start_idx = np.random.randint(0, len(frame_idx) - eval_frames + 1)
                    start_idx = 0
                    frame_idx = frame_idx[start_idx:start_idx + eval_frames]
                    annot_idx = annot_idx[start_idx:start_idx + eval_frames]
            elif self.configs.use_all_frames:
                rand_idx = np.arange(0, annot_length - seq_len)
                np.random.shuffle(rand_idx)

                direction = 1
                if self.configs.all_frame_direction == 'allway':
                    if np.random.rand() > 0.5: direction *= -1
                elif self.configs.all_frame_direction == 'forward':
                    # Note: forward warping needs to sample a 'previous frame'
                    direction *= -1
                elif self.configs.all_frame_direction == 'backward':
                    pass
                else:
                    raise ValueError("Unexpected frame direction: %s" %
                                     self.configs.all_frame_direction)

                start_idx = rand_idx[0]
                if direction < 0:
                    start_idx += self.configs.images_per_video
                start_frame_idx = int(vid['file_names'][start_idx][-9:-4])
                annot_idx = [start_idx]
                frame_idx = [start_frame_idx]

                # if self.configs.images_per_video > 1:
                #     num_extra_frames = self.configs.images_per_video - 1
                #     extra_annot_idx = [start_idx + direction * offset_idx
                #                        for offset_idx in range(1, num_extra_frames + 1)]
                #     extra_frame_idx = [int(vid['file_names'][extra_idx][-9:-4])
                #                        for extra_idx in extra_annot_idx]
                #
                #     annot_idx += extra_annot_idx
                #     frame_idx += extra_frame_idx

                extra_frame_idx = []
                extra_annot_idx = []
                if self.configs.images_per_video > 0:
                    offset_lb, offset_ub = self.configs.frame_offset_lb, self.configs.frame_offset_ub
                    lb, ub = int(vid['file_names'][0][-9:-4]), int(
                        vid['file_names'][-1][-9:-4])
                    fidx = frame_idx[-1]
                    lb, ub = lb - fidx, ub - fidx
                    if direction == -1:
                        ub = -offset_lb
                        lb = max(lb, -offset_ub)
                    else:
                        lb = offset_lb
                        ub = min(ub, offset_ub)
                    assert lb <= ub + 1, "{}, {}".format(lb, ub)
                    assert self.configs.frame_offset_multiplier == 1, "frame_offset_multiplier deprecated."
                    for _ in range(self.configs.images_per_video):
                        frame_diff = np.random.randint(lb, ub + 1)
                        ref_idx = fidx + frame_diff
                        assert int(
                            vid['file_names'][0][-9:-4]) <= ref_idx <= int(
                                vid['file_names'][-1]
                                [-9:-4]), "{} <= {} <= {}".format(
                                    int(vid['file_names'][0][-9:-4]), ref_idx,
                                    int(vid['file_names'][-1][-9:-4]))
                        # frame_diff = self.configs.frame_offset_multiplier * np.random.randint(self.configs.frame_offset_lb, self.configs.frame_offset_ub + 1)
                        # ref_idx = np.clip(frame_idx[-1] + frame_diff * direction,
                        #                   int(vid['file_names'][0][-9:-4]), int(vid['file_names'][-1][-9:-4]))
                        extra_frame_idx += [ref_idx]
                        extra_annot_idx += [-1]

                extra_frame_idx = list(sorted(extra_frame_idx, reverse=True))

                annot_idx += extra_annot_idx
                frame_idx += extra_frame_idx
                annot_idx = np.asarray(annot_idx)
                frame_idx = np.asarray(frame_idx)
            else:
                rand_idx = np.arange(0, annot_length - seq_len + 1)
                np.random.shuffle(rand_idx)
                start_idx = rand_idx[0]

                annot_idx = np.arange(start_idx, start_idx + seq_len, 1)
                frame_idx = np.asarray(
                    [int(vid['file_names'][idx][-9:-4]) for idx in annot_idx])

            has_targets = all([
                self.target_in_frame(target, annot_id, true_on_reference=True)
                for annot_id in annot_idx
            ])
            if has_targets: break
            if return_on_failure: return None
            # print("Not all frame of video %s[%d-%d] has targets, re-selecting video." %
            #       (vid['file_names'][0].split('/')[0], start_idx, start_idx + frm_len))
            index = np.random.randint(len(self))
            vid_id = self.ids[index]

        frame_results = []
        extra_data = []

        while True:
            try:
                for idx, (frame_id, annot_id) in enumerate(
                        zip(frame_idx.tolist(), annot_idx.tolist())):
                    extra = {}
                    # FIXME: little bit hacky for full frames, maybe fix this using annotation files
                    frame_id_str = "%05d" % frame_id
                    file_name = vid['file_names'][0]
                    file_name = file_name[:-9] + frame_id_str + file_name[-4:]
                    prev_frame_id = frame_idx[idx - 1] if idx > 0 else -1
                    prev_annot_id = annot_idx[idx - 1] if idx > 0 else -1
                    if idx == 0:
                        seeds, (im, gt, masks, h, w,
                                num_crowds) = self.pull_frame(
                                    vid_name, (frame_id, annot_id),
                                    (prev_frame_id, prev_annot_id),
                                    file_name,
                                    target,
                                    num_crowds,
                                    require_seeds=True)
                    else:
                        im, gt, masks, h, w, num_crowds = self.pull_frame(
                            vid_name, (frame_id, annot_id),
                            (prev_frame_id, prev_annot_id),
                            file_name,
                            target,
                            num_crowds,
                            seeds=seeds)

                    extra['idx'] = (
                        frame_id,
                        annot_id,
                    )
                    frame_results.append((
                        im,
                        gt,
                        masks,
                        h,
                        w,
                        num_crowds,
                    ))
                    extra_data.append(extra)
            except ValueError:
                logger = logging.getLogger("yolact.dataset")
                logger.warning('Resampling with reseed signal...')
                frame_results.clear()
                extra_data.clear()
                continue
            break

        return frame_results, extra_data

    def __len__(self):
        return len(self.ids)

    @staticmethod
    def target_in_frame(target, frame_id, true_on_reference=False):
        if frame_id < 0:
            return true_on_reference
        if len(target) > 0:
            for obj in target:
                if obj['segmentations'][frame_id] is not None:
                    return True
        return False

    def pull_frame(self,
                   vid_name,
                   frame_annot_id,
                   prev_frame_annot_id,
                   file_name,
                   target,
                   num_crowds,
                   require_seeds=False,
                   seeds=None):
        frame_id, annot_id = frame_annot_id
        prev_frame_id, prev_annot_id = prev_frame_annot_id
        path = osp.join(self.root, file_name)
        assert osp.exists(path), 'Image path does not exist: {}'.format(path)

        img = cv2.imread(path)
        height, width, _ = img.shape

        target_is_in_frame = self.target_in_frame(target, annot_id)

        if target_is_in_frame:
            # Pool all the masks for this image into one [num_objects,height,width] matrix

            # masks = [np.zeros(height * width, dtype=np.uint8).reshape(-1) if obj['segmentations'][frame_id] is None  # all-zero mask on None
            #          else self.coco.annToMask(obj, frame_id).reshape(-1) for obj in target]
            masks = [
                self.coco.annToMask(obj, annot_id).reshape(-1)
                for obj in target if obj['segmentations'][annot_id] is not None
            ]
            masks = np.vstack(masks)
            masks = masks.reshape(-1, height, width)

        if self.target_transform is not None and target_is_in_frame:
            target = self.target_transform(target, annot_id, width, height)

        if self.transform is not None:
            if "Video" in type(self.transform).__name__:
                if target_is_in_frame:
                    target = np.array(target)
                    return_transform = self.transform(
                        img,
                        masks,
                        target[:, :4], {
                            'num_crowds': num_crowds,
                            'labels': target[:, 4]
                        },
                        require_seeds=require_seeds,
                        seeds=seeds)

                    if require_seeds:
                        seeds, (img, masks, boxes, labels) = return_transform
                    else:
                        img, masks, boxes, labels = return_transform

                    # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                    num_crowds = labels['num_crowds']
                    labels = labels['labels']

                    target = np.hstack((boxes, np.expand_dims(labels, axis=1)))

                    if target.shape[0] == 0:
                        logger = logging.getLogger("yolact.dataset")
                        logger.warning(
                            'Augmentation output an example with no ground truth. Resampling...'
                        )
                        raise ValueError("reseed")
                else:
                    try:
                        return_transform = self.transform(
                            img,
                            np.zeros((1, height, width), dtype=np.float32),
                            np.array([[0., 0., 1., 1.]]), {
                                'num_crowds': 0,
                                'labels': np.array([0])
                            },
                            require_seeds=require_seeds,
                            seeds=seeds)
                    except ValueError:
                        assert False, "Unexpected reseed captured with no-target instances."

                    if require_seeds:
                        seeds, (img, _, _, _) = return_transform
                    else:
                        img, _, _, _ = return_transform

                    masks = None
                    target = None
            else:
                if target_is_in_frame:
                    target = np.array(target)
                    img, masks, boxes, labels = self.transform(
                        img, masks, target[:, :4], {
                            'num_crowds': num_crowds,
                            'labels': target[:, 4]
                        })

                    # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                    num_crowds = labels['num_crowds']
                    labels = labels['labels']

                    target = np.hstack((boxes, np.expand_dims(labels, axis=1)))
                else:
                    img, _, _, _ = self.transform(
                        img, np.zeros((1, height, width), dtype=np.float32),
                        np.array([[0, 0, 1, 1]]), {
                            'num_crowds': 0,
                            'labels': np.array([0])
                        })
                    masks = None
                    target = None

        return_tuple = torch.from_numpy(img).permute(
            2, 0, 1), target, masks, height, width, num_crowds
        if require_seeds:
            return seeds, return_tuple
        else:
            return return_tuple

    def pull_image(self, index):
        '''Returns the original image at index as a cv2 (BGR) array

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of the video whose first frame to load
        Return:
            cv2 img
        '''
        vid_id = self.ids[index]
        # YTVOS indexes videos, not single images: load the first frame.
        path = self.coco.loadVids(vid_id)[0]['file_names'][0]
        return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR)

    def pull_anno(self, index):
        '''Returns the original annotations of the video at index

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of the video to get annotations for
        Return:
            list: annotation dicts as returned by ``coco.loadAnns``
        '''
        vid_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(vidIds=vid_id)
        return self.coco.loadAnns(ann_ids)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(
            tmp,
            self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(
            tmp,
            self.target_transform.__repr__().replace('\n',
                                                     '\n' + ' ' * len(tmp)))
        return fmt_str
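
Two small mechanisms in pull_video / pull_frame above are easy to miss: the numeric frame index is recovered from the zero-padded file name via file_name[-9:-4], a path for any other frame of the same video is rebuilt via file_name[:-9] + '%05d' % frame_id + file_name[-4:], and crowd annotations are moved to the end of the target list with category_id set to -1 so they stay neutral during training. Below is a minimal standalone sketch of those three steps; the video hash in the sample file name and the toy annotation dicts are made up for illustration.

def frame_number(file_name):
    # 'f1a2b3c4d5/00005.jpg' -> 5 (frame files use zero-padded 5-digit names)
    return int(file_name[-9:-4])

def file_for_frame(file_name, frame_id):
    # Rebuild a path in the same video for an arbitrary frame index,
    # as done for sampled frames that have no annotation entry.
    return file_name[:-9] + '%05d' % frame_id + file_name[-4:]

def split_crowds(target):
    # Move iscrowd annotations to the end and tag them with category_id = -1,
    # mirroring the "Generate target" block in pull_video.
    crowd = [x for x in target if x.get('iscrowd')]
    normal = [x for x in target if not x.get('iscrowd')]
    for x in crowd:
        x['category_id'] = -1
    return normal + crowd, len(crowd)

name = 'f1a2b3c4d5/00005.jpg'               # illustrative file name
assert frame_number(name) == 5
assert file_for_frame(name, 12) == 'f1a2b3c4d5/00012.jpg'

target, num_crowds = split_crowds([
    dict(category_id=3, iscrowd=0),
    dict(category_id=7, iscrowd=1),
])
print(num_crowds)                            # 1
print([t['category_id'] for t in target])    # [3, -1]
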