Python YTVOS.annToMask примеры использования

Язык программирования: Python

Пространство имен/Пакет: pycocotools.ytvos

Класс/Тип: YTVOS

Метод/Функция: annToMask

Примеров на hotexamples.com: 4

Python YTVOS.annToMask - 4 примера найдено. Это лучшие примеры Python кода для pycocotools.ytvos.YTVOS.annToMask, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

loadAnns(6)

YTVOS(6)

getAnnIds(5)

loadVids(5)

annToMask(4)

getCatIds(4)

getVidIds(3)

loadRes(2)

anns(1)

loadImgs(1)

Пример #1

Показать файл

Файл: YoutubeVOS.py Проект: scutpaul/DANet

class YTVOSDataset(Dataset):
    def __init__(self,
                 data_path=None,
                 train=True,
                 valid=False,
                 set_index=1,
                 finetune_idx=None,
                 support_frame=5,
                 query_frame=1,
                 sample_per_class=10,
                 transforms=None,
                 another_transform=None):
        self.train = train
        self.valid = valid
        self.set_index = set_index
        self.support_frame = support_frame
        self.query_frame = query_frame
        self.sample_per_class = sample_per_class
        self.transforms = transforms
        self.another_transform = another_transform

        if data_path is None:
            data_path = os.path.join(os.path.expanduser('~'), 'Lab/DANet')
        data_dir = os.path.join(data_path, 'data', 'Youtube-VOS')
        self.img_dir = os.path.join(data_dir, 'train', 'JPEGImages')
        self.ann_file = os.path.join(data_dir, 'train', 'train.json')

        self.load_annotations()

        print('data set index: ', set_index)
        self.train_list = [
            n + 1 for n in range(40) if n % 4 != (set_index - 1)
        ]
        self.valid_list = [
            n + 1 for n in range(40) if n % 4 == (set_index - 1)
        ]

        if train and not valid:
            self.class_list = self.train_list
        else:
            self.class_list = self.valid_list
        if finetune_idx is not None:
            self.class_list = [self.class_list[finetune_idx]]

        self.video_ids = []
        for class_id in self.class_list:
            tmp_list = self.ytvos.getVidIds(catIds=class_id)
            tmp_list.sort()
            self.video_ids.append(tmp_list)  # list[list[video_id]]
        if not self.train:
            self.test_video_classes = []
            for i in range(len(self.class_list)):
                for j in range(len(self.video_ids[i]) -
                               support_frame):  # remove the support set
                    self.test_video_classes.append(i)

        if self.train:
            self.length = len(self.class_list) * sample_per_class
        else:
            self.length = len(self.test_video_classes)  # test

    def load_annotations(self):
        """Load the annotation file and add per-frame lookups to each video.

        For every video record the following keys are added in place:
        ``dir`` (first path component of the first file name), ``objects``
        (per frame: ids of annotations that have a segmentation there),
        ``classes`` (per frame: the matching category ids) and
        ``class_frames`` (category id -> frame indices containing it).
        """
        self.ytvos = YTVOS(self.ann_file)
        self.vid_ids = self.ytvos.getVidIds()
        self.vid_infos = self.ytvos.vids
        for vid, info in self.vid_infos.items():
            folder = info['file_names'][0].split('/')[0]
            info['dir'] = folder
            n_frames = info['length']
            objects_per_frame = [[] for _ in range(n_frames)]
            classes_per_frame = [[] for _ in range(n_frames)]
            seen_categories = set()
            # Each annotation is one instance tracked over the whole video.
            for anno in self.ytvos.vidToAnns[vid]:
                assert len(anno['segmentations']) == n_frames, (
                    folder, len(anno['segmentations']), info['length'])
                for frame_idx in range(n_frames):
                    if anno['segmentations'][frame_idx] is None:
                        # instance is not visible in this frame
                        continue
                    objects_per_frame[frame_idx].append(anno['id'])
                    classes_per_frame[frame_idx].append(anno['category_id'])
                    seen_categories = seen_categories.union(
                        {anno['category_id']})
            info['objects'] = objects_per_frame
            info['classes'] = classes_per_frame
            # Frame indices in which each category occurs.
            info['class_frames'] = {
                category: [
                    i for i in range(n_frames)
                    if category in classes_per_frame[i]
                ]
                for category in seen_categories
            }

    def get_GT_byclass(self, vid, class_id, frame_num=1, test=False):
        """Load frames of one video together with binary masks of a class.

        Args:
            vid: Key of the video in ``self.vid_infos``.
            class_id: Category whose instances form the foreground.
            frame_num: Number of frames to return (ignored when ``test``).
            test: When true, every frame containing ``class_id`` is returned.

        Returns:
            ``(frames, masks)``: a list of images as numpy arrays and a list
            of masks in which pixels of ``class_id`` instances are 1.
        """
        info = self.vid_infos[vid]
        candidates = info['class_frames'][class_id]
        n_candidates = len(candidates)
        # The anchor frame is always drawn first, whatever branch follows.
        selected = random.sample(candidates, 1)
        if test:
            frame_num = n_candidates
        if frame_num > 1:
            if frame_num > n_candidates:
                # Not enough frames: take them all and repeat the last one.
                padding = [candidates[n_candidates - 1]
                           ] * (frame_num - n_candidates)
                selected = list(candidates) + padding
            else:
                # Window of frame_num consecutive candidates that ends at the
                # anchor, or starts at 0 when the anchor is too early.
                anchor = candidates.index(selected[0])
                start = 0 if anchor < frame_num else anchor - frame_num + 1
                selected = candidates[start:start + frame_num]

        frames = []
        for frame_idx in selected:
            image_path = os.path.join(self.img_dir,
                                      info['file_names'][frame_idx])
            frames.append(np.array(Image.open(image_path)))

        masks = []
        for frame_id in selected:
            mask = None
            for object_id in info['objects'][frame_id]:
                ann = self.ytvos.loadAnns(object_id)[0]
                category = ann['category_id']
                if category not in self.class_list:
                    continue
                # Instances of other served classes contribute an all-zero
                # layer, so the mask exists but stays background for them.
                weight = 1 if category == class_id else 0
                layer = self.ytvos.annToMask(ann, frame_id) * weight
                if mask is None:
                    mask = layer
                else:
                    mask += layer

            assert mask is not None
            mask[mask > 0] = 1
            masks.append(mask)

        return frames, masks

    def __gettrainitem__(self, idx):
        list_id = idx // self.sample_per_class
        vid_set = self.video_ids[list_id]

        query_vid = random.sample(vid_set, 1)
        support_vid = random.sample(vid_set, self.support_frame)

        query_frames, query_masks = self.get_GT_byclass(
            query_vid[0], self.class_list[list_id], self.query_frame)

        support_frames, support_masks = [], []
        for i in range(self.support_frame):
            one_frame, one_mask = self.get_GT_byclass(support_vid[i],
                                                      self.class_list[list_id],
                                                      1)
            support_frames += one_frame
            support_masks += one_mask

        if self.transforms is not None:
            query_frames, query_masks = self.transforms(
                query_frames, query_masks)
            support_frames, support_masks = self.transforms(
                support_frames, support_masks)
        return query_frames, query_masks, support_frames, support_masks, self.class_list[
            list_id]

    def __gettestitem__(self, idx):
        # random.seed()
        begin_new = False
        if idx == 0:
            begin_new = True
        else:
            if self.test_video_classes[idx] != self.test_video_classes[idx -
                                                                       1]:
                begin_new = True
        list_id = self.test_video_classes[idx]
        vid_set = self.video_ids[list_id]

        support_frames, support_masks = [], []
        if begin_new:
            support_vid = random.sample(vid_set, self.support_frame)
            query_vids = []
            for id in vid_set:
                if not id in support_vid:
                    query_vids.append(id)
            self.query_ids = query_vids
            self.query_idx = -1
            for i in range(self.support_frame):
                one_frame, one_mask = self.get_GT_byclass(
                    support_vid[i], self.class_list[list_id], 1)
                support_frames += one_frame
                support_masks += one_mask

        self.query_idx += 1
        query_vid = self.query_ids[self.query_idx]
        query_frames, query_masks = self.get_GT_byclass(
            query_vid, self.class_list[list_id], test=True)

        if self.transforms is not None:
            query_frames, query_masks = self.transforms(
                query_frames, query_masks)
            if begin_new:
                if self.another_transform is not None:
                    support_frames, support_masks = self.another_transform(
                        support_frames, support_masks)
                else:
                    support_frames, support_masks = self.transforms(
                        support_frames, support_masks)
        vid_info = self.vid_infos[query_vid]
        vid_name = vid_info['dir']
        return query_frames, query_masks, support_frames, support_masks, self.class_list[
            list_id], vid_name, begin_new

    def __getitem__(self, idx):
        if self.train:
            return self.__gettrainitem__(idx)
        else:
            return self.__gettestitem__(idx)

    def __len__(self):
        """Return ``self.length``, the item count stored by ``__init__``."""
        return self.length

    def get_class_list(self):
        """Return ``self.class_list`` (the stored list itself, not a copy)."""
        return self.class_list

Пример #2

Показать файл

class YTVOSDataset(CustomDataset):
    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl',
               'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant',
               'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle',
               'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False,
                 every_frame=False,
                 is_flow=False,
                 flow_test=False):
        """Load the annotations and build the frame indexes of the dataset.

        Args:
            ann_file: Annotation file handed to ``load_annotations``.
            img_prefix: Directory that the video file names are relative to.
            img_scale: A scale tuple or a list of scale tuples.
            img_norm_cfg: Keyword arguments forwarded to ``ImageTransform``.
            size_divisor: Forwarded to ``ImageTransform`` as ``size_divisor``.
            proposal_file: Optional file loaded via ``load_proposals``.
            num_max_proposals: Maximum number of proposals kept per image.
            flip_ratio: Flip probability, must lie in ``[0, 1]``.
            with_mask: Whether masks are added to training samples.
            with_crowd: Whether ignored boxes are added to training samples.
            with_label: Whether labels are added to training samples.
            with_track: Whether matching ids are added to training samples.
            extra_aug: Optional keyword arguments for ``ExtraAugmentation``.
            aug_ref_bbox_param: Optional ``(center_offset, size_perturb)``
                pair consumed by ``bbox_aug``.
            resize_keep_ratio: Forwarded to the image transform as
                ``keep_ratio``.
            test_mode: When false, frames without boxes are filtered out and
                the group flag is computed.
            every_frame: Index every frame found on disk instead of only the
                annotated ones (affects ``__len__`` and test-time loading).
            is_flow: Serve (key frame, current frame) pairs.
            flow_test: With ``is_flow``, serve pairs through
                ``prepare_train_flow_test_img``.
        """
        # prefix of images path
        self.img_prefix = img_prefix

        # load annotations (and proposals)
        self.vid_infos = self.load_annotations(ann_file)

        self.every_frame = every_frame
        self.is_flow = is_flow
        self.flow_test = flow_test
        if self.flow_test or self.is_flow:
            self.cuda = True
        # NOTE(review): this unconditional reset makes the branch above dead,
        # so the detector below is never built. It is kept as-is because
        # enabling it would load a hard-coded checkpoint; confirm the intent.
        self.cuda = False
        if self.cuda:
            from mmcv import Config
            from mmdet.models import build_detector
            from mmcv.runner import load_checkpoint
            cfg = Config.fromfile(
                "../configs/masktrack_rcnn_r50_fpn_1x_flow_youtubevos.py")
            self.det_model = build_detector(cfg.model,
                                            train_cfg=cfg.train_cfg,
                                            test_cfg=cfg.test_cfg)
            load_checkpoint(self.det_model,
                            "../results/20200312-180434/epoch_9.pth")
            self.det_model = self.det_model.cuda()
            self.det_model.eval()
            for param in self.det_model.parameters():
                param.requires_grad = False

        # Set indexes for data loading
        img_ids = []  # training frames which have annotations
        img_ids_all = []  # all training frames
        img_ids_pairs = []  # flow data pairs
        for idx, vid_info in enumerate(self.vid_infos):
            vid_name = vid_info['filenames'][0].split('/')[0]
            folder_path = osp.join(self.img_prefix, vid_name)
            files = os.listdir(folder_path)
            files.sort()
            vid_info['filenames_all'] = [
                osp.join(vid_name, file) for file in files
            ]
            for _id in range(len(files)):
                img_ids_all.append((idx, _id))
                is_anno = vid_info['filenames_all'][_id] in vid_info[
                    'filenames']
                if is_anno and _id > 0:  # having annotation and is not the first frame.
                    ann_idx = vid_info['filenames'].index(
                        vid_info['filenames_all'][_id])
                    ann = self.get_ann_info(idx, ann_idx)
                    gt_bboxes = ann['bboxes']
                    # skip the image if there is no valid gt bbox
                    if len(gt_bboxes) == 0:
                        continue
                    # random select key frame
                    # np.random.randint(low, high) requires low < high, which
                    # fails for _id == 1 (high == 1). Frame 0 is then the only
                    # earlier frame, so the offset is fixed to 1.
                    max_offset = min(10, _id)
                    if max_offset > 1:
                        offset = np.random.randint(1, max_offset)
                    else:
                        offset = 1
                    key_id = _id - offset
                    img_ids_pairs.append(((idx, key_id), (idx, _id)))
            for frame_id in range(len(vid_info['filenames'])):
                img_ids.append((idx, frame_id))

        self.img_ids = img_ids
        self.img_ids_all = img_ids_all
        self.img_ids_pairs = img_ids_pairs

        if proposal_file is not None:
            self.proposals = self.load_proposals(proposal_file)
        else:
            self.proposals = None
        # filter images with no annotation during training
        if not test_mode:
            valid_inds = [
                i for i, (v, f) in enumerate(self.img_ids)
                if len(self.get_ann_info(v, f)['bboxes'])
            ]
            self.img_ids = [self.img_ids[i] for i in valid_inds]

        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg

        # max proposals per image
        self.num_max_proposals = num_max_proposals
        # flip ratio
        self.flip_ratio = flip_ratio
        assert flip_ratio >= 0 and flip_ratio <= 1
        # padding border to ensure the image size can be divided by
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor

        # with mask or not (reserved field, takes no effect)
        self.with_mask = with_mask
        # some datasets provide bbox annotations as ignore/crowd/difficult,
        # if `with_crowd` is True, then these info is returned.
        self.with_crowd = with_crowd
        # with label is False for RPN
        self.with_label = with_label
        self.with_track = with_track
        # params for augmenting bbox in the reference frame
        self.aug_ref_bbox_param = aug_ref_bbox_param
        # in test mode or not
        self.test_mode = test_mode

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()
        # transforms
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # if use extra augmentation
        if extra_aug is not None:
            self.extra_aug = ExtraAugmentation(**extra_aug)
        else:
            self.extra_aug = None

        # image rescale if keep ratio
        self.resize_keep_ratio = resize_keep_ratio

    def __len__(self):
        if self.every_frame:
            return len(self.img_ids_all)
        elif self.is_flow:
            return len(self.img_ids_pairs)
        else:
            return len(self.img_ids)

    def __getitem__(self, idx):
        if self.test_mode:
            if self.every_frame:
                return self.prepare_test_img(self.img_ids_all[idx])
            else:
                return self.prepare_test_img(self.img_ids[idx])
        if self.is_flow:
            if self.flow_test:
                data = self.prepare_train_flow_test_img(
                    self.img_ids_pairs[idx])
            else:
                data = self.prepare_train_flow_img(self.img_ids_pairs[idx])
        else:
            data = self.prepare_train_img(self.img_ids[idx])
        return data

    def load_annotations(self, ann_file):
        """Open ``ann_file`` and return one info record per video.

        Also stores the API object (``self.ytvos``), the category ids, a
        category-id -> 1-based label map (``self.cat2label``) and the video
        ids on the instance. Each returned record gets a ``filenames`` key
        that aliases its ``file_names`` entry.
        """
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        # Labels start at 1 and follow the order of the category ids.
        self.cat2label = {
            cat_id: label
            for label, cat_id in enumerate(self.cat_ids, start=1)
        }
        self.vid_ids = self.ytvos.getVidIds()
        records = []
        for vid_id in self.vid_ids:
            record = self.ytvos.loadVids([vid_id])[0]
            record['filenames'] = record['file_names']
            records.append(record)
        return records

    def get_ann_info(self, idx, frame_id):
        vid_id = self.vid_infos[idx]['id']
        ann_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        ann_info = self.ytvos.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i in range(len(self)):
            vid_id, _ = self.img_ids[i]
            vid_info = self.vid_infos[vid_id]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        assert self.aug_ref_bbox_param is not None
        center_off = self.aug_ref_bbox_param[0]
        size_perturb = self.aug_ref_bbox_param[1]

        n_bb = bbox.shape[0]
        # bbox center offset
        center_offs = (2 * np.random.rand(n_bb, 2) - 1) * center_off
        # bbox resize ratios
        resize_ratios = (2 * np.random.rand(n_bb, 2) - 1) * size_perturb + 1
        # bbox: x1, y1, x2, y2
        centers = (bbox[:, :2] + bbox[:, 2:]) / 2.
        sizes = bbox[:, 2:] - bbox[:, :2]
        new_centers = centers + center_offs * sizes
        new_sizes = sizes * resize_ratios
        new_x1y1 = new_centers - new_sizes / 2.
        new_x2y2 = new_centers + new_sizes / 2.
        c_min = [0, 0]
        c_max = [img_size[1], img_size[0]]
        new_x1y1 = np.clip(new_x1y1, c_min, c_max)
        new_x2y2 = np.clip(new_x2y2, c_min, c_max)
        bbox = np.hstack((new_x1y1, new_x2y2)).astype(np.float32)
        return bbox

    def sample_ref(self, idx):
        # sample another frame in the same sequence as reference
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        sample_range = range(len(vid_info['filenames']))
        valid_samples = []
        for i in sample_range:
            # check if the frame id is valid
            ref_idx = (vid, i)
            if i != frame_id and ref_idx in self.img_ids:
                valid_samples.append(ref_idx)
        assert len(valid_samples) > 0
        return random.choice(valid_samples)

    def prepare_train_flow_test_img(self, idx):
        """Load a (key frame, current frame) pair and extract its features.

        Args:
            idx: Pair ``((vid, key_frame_id), (vid, cur_frame_id))`` whose
                frame ids index ``vid_info['filenames_all']``.

        Returns:
            dict with the two CUDA image tensors (``key_img``, ``cur_img``)
            and the feature maps returned for each by
            ``self.det_model.extract_feat`` (``key_img_feats``,
            ``cur_img_feats``).

        NOTE(review): ``self.det_model`` is only assigned in ``__init__``
        inside the ``if self.cuda:`` branch — confirm it exists before this
        method is used.
        """

        # prepare a pair of image in a sequence
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]

        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))
        # h_orig, w_orig and basename are not used further in this method
        h_orig, w_orig, _ = key_img.shape
        basename = osp.basename(vid_info['filenames_all'][key_frame_id])

        # apply transforms
        # one flip decision and one sampled scale are shared by both frames
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        cur_img, img_shape, pad_shape, scale_factor = self.img_transform(
            cur_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        if (type(scale_factor)) != float:
            scale_factor = tuple(scale_factor)
        cur_img = cur_img.copy()
        key_img, key_img_shape, _, ref_scale_factor = self.img_transform(
            key_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()

        # trans = torchvision.transforms.ToTensor()
        # move both frames to the GPU as tensors
        key_img = torch.from_numpy(key_img).cuda()
        cur_img = torch.from_numpy(cur_img).cuda()

        def resize(feat_map, size=(48, 64)):
            """Resize feature map to certain size."""
            key_feature = torch.nn.functional.interpolate(feat_map,
                                                          size,
                                                          mode='bilinear',
                                                          align_corners=True)
            return key_feature

        # bring both frames to a fixed 384x640 size when the key frame's
        # last two dimensions differ from it
        img_size = (384, 640)
        if key_img.shape[-2:] != img_size:
            key_img = resize(key_img.unsqueeze(0), img_size).squeeze(0)
            cur_img = resize(cur_img.unsqueeze(0), img_size).squeeze(0)

        # extract_feat is called on a batch of one; the second return value
        # is discarded
        key_feature_maps, _ = self.det_model.extract_feat(key_img.unsqueeze(0))
        cur_feature_maps, _ = self.det_model.extract_feat(cur_img.unsqueeze(0))

        # drop the batch dimension again from every feature map
        key_feature_maps = [
            feat_map.squeeze(0) for feat_map in key_feature_maps
        ]
        cur_feature_maps = [
            feat_map.squeeze(0) for feat_map in cur_feature_maps
        ]

        data = dict(key_img=key_img,
                    cur_img=cur_img,
                    key_img_feats=key_feature_maps,
                    cur_img_feats=cur_feature_maps)
        return data

    def prepare_train_flow_img(self, idx):
        """Build a training sample from a (key frame, current frame) pair.

        Args:
            idx: Pair ``((vid, key_frame_id), (vid, cur_frame_id))`` whose
                frame ids index ``vid_info['filenames_all']``.

        Returns:
            ``None`` when the current frame has no ground-truth box (or no
            proposals while proposals are in use); otherwise a dict in which
            ``img`` holds the key frame, ``ref_img`` the current frame and
            the ground truth comes from the current frame's annotation.
            ``train_flow`` is always set to ``True``; feature maps are added
            only when ``self.cuda`` is truthy.
        """

        # prepare a pair of image in a sequence
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]

        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))
        h_orig, w_orig, _ = cur_img.shape
        # basename is not used further in this method
        basename = osp.basename(vid_info['filenames_all'][key_frame_id])

        # load proposals if necessary
        if self.proposals is not None:
            # NOTE(review): idx is the pair of tuples here, not a flat
            # index — confirm this matches the layout of self.proposals.
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                # split the fifth column off as scores
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None
        # translate the on-disk frame position into the position of the same
        # file among the annotated frames
        ann_idx = vid_info['filenames'].index(
            vid_info['filenames_all'][cur_frame_id])
        ann = self.get_ann_info(vid, ann_idx)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']

        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        # applied to the current frame only; the key frame is left untouched
        if self.extra_aug is not None:
            cur_img, gt_bboxes, gt_labels = self.extra_aug(
                cur_img, gt_bboxes, gt_labels)

        # apply transforms
        flip = True if np.random.rand() < self.flip_ratio else False

        # fixed scales replace the sampled one: the key frame uses the first
        # entry, the current frame the second
        img_scales = [(1280, 720), (640, 360)]
        # img_scale = random_scale(self.img_scales)  # sample a scale
        cur_img, img_shape, pad_shape, scale_factor = self.img_transform(
            cur_img, img_scales[1], flip, keep_ratio=self.resize_keep_ratio)
        if (type(scale_factor)) != float:
            scale_factor = tuple(scale_factor)
        cur_img = cur_img.copy()
        key_img, key_img_shape, _, key_scale_factor = self.img_transform(
            key_img, img_scales[0], flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()
        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            # re-attach the score column after the boxes were transformed
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)

        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            # for landscape frames the mask transform receives the
            # transformed image size (w, h, w, h) in place of the scale factor
            if w_orig > h_orig:
                h, w = img_shape[0], img_shape[1]
                _scale_factor = tuple([w, h, w, h])
            else:
                _scale_factor = scale_factor
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           _scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(cur_frame_id == 0),
                        flip=flip)

        # "img" carries the key frame, "ref_img" the current frame
        data = dict(
            img=DC(to_tensor(key_img), stack=True),
            ref_img=DC(to_tensor(cur_img), stack=True),
            img_meta=DC(img_meta, cpu_only=True),
            gt_bboxes=DC(to_tensor(gt_bboxes)),
            # ref_bboxes=DC(to_tensor(ref_bboxes))
        )
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        # if self.with_track:
        #     data['gt_pids'] = DC(to_tensor(gt_pids))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        data['train_flow'] = True

        # optional feature extraction on the GPU
        if self.cuda:
            key_img_cuda = torch.from_numpy(key_img).cuda()
            cur_img_cuda = torch.from_numpy(cur_img).cuda()

            def resize(feat_map, size=(48, 64)):
                """Resize feature map to certain size."""
                key_feature = torch.nn.functional.interpolate(
                    feat_map, size, mode='bilinear', align_corners=True)
                return key_feature

            # bring both frames to a fixed 384x640 size when the key frame's
            # last two dimensions differ from it
            img_size = (384, 640)
            if key_img_cuda.shape[-2:] != img_size:
                key_img_cuda = resize(key_img_cuda.unsqueeze(0),
                                      img_size).squeeze(0)
                cur_img_cuda = resize(cur_img_cuda.unsqueeze(0),
                                      img_size).squeeze(0)

            key_feature_maps, _ = self.det_model.extract_feat(
                key_img_cuda.unsqueeze(0))
            cur_feature_maps, _ = self.det_model.extract_feat(
                cur_img_cuda.unsqueeze(0))

            # drop the batch dimension again from every feature map
            key_feature_maps = [
                feat_map.squeeze(0) for feat_map in key_feature_maps
            ]
            cur_feature_maps = [
                feat_map.squeeze(0) for feat_map in cur_feature_maps
            ]

            data['key_feature_maps'] = key_feature_maps
            data['cur_feature_maps'] = cur_feature_maps

        return data

    def prepare_train_img(self, idx):
        """Build a training sample from a frame and a sampled reference frame.

        Args:
            idx: ``(vid, frame_id)`` of the current frame.

        Returns:
            ``None`` when the frame has no ground-truth box (or no proposals
            while proposals are in use); otherwise a dict with the current
            image, the reference image, their boxes and, depending on the
            ``with_*`` flags, labels, matching ids, ignored boxes and masks.
        """
        # prepare a pair of image in a sequence
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        # load image
        # frame_id indexes the on-disk frame list in flow / every-frame mode
        # and the annotated frame list otherwise
        if self.is_flow or self.every_frame:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames_all'][frame_id]))
        else:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        h_orig, w_orig, _ = img.shape
        # basename is not used further in this method
        basename = osp.basename(vid_info['filenames'][frame_id])
        # the reference is another indexed frame of the same video
        _, ref_frame_id = self.sample_ref(idx)
        ref_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][ref_frame_id]))
        # load proposals if necessary
        if self.proposals is not None:
            # NOTE(review): idx is a (vid, frame_id) tuple here — confirm
            # this matches the layout of self.proposals.
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                # split the fifth column off as scores
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None

        ann = self.get_ann_info(vid, frame_id)
        ref_ann = self.get_ann_info(vid, ref_frame_id)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']
        ref_bboxes = ref_ann['bboxes']
        # obj ids attribute does not exist in current annotation
        # need to add it
        ref_ids = ref_ann['obj_ids']
        gt_ids = ann['obj_ids']
        # compute matching of reference frame with current frame
        # 0 denote there is no matching
        # otherwise the value is the 1-based position of the object in the
        # reference frame's id list
        gt_pids = [ref_ids.index(i) + 1 if i in ref_ids else 0 for i in gt_ids]
        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        # applied to the current frame only, not to the reference frame
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms
        # one flip decision and one sampled scale are shared by both frames
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        if (type(scale_factor)) != float:
            scale_factor = tuple(scale_factor)
        img = img.copy()
        ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
            ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        ref_img = ref_img.copy()
        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            # re-attach the score column after the boxes were transformed
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        ref_bboxes = self.bbox_transform(ref_bboxes, ref_img_shape,
                                         ref_scale_factor, flip)
        # optionally jitter the reference boxes
        if self.aug_ref_bbox_param is not None:
            ref_bboxes = self.bbox_aug(ref_bboxes, ref_img_shape)
        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            # for landscape frames the mask transform receives the
            # transformed image size (w, h, w, h) in place of the scale factor
            if w_orig > h_orig:
                h, w = img_shape[0], img_shape[1]
                _scale_factor = tuple([w, h, w, h])
            else:
                _scale_factor = scale_factor
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           _scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(frame_id == 0),
                        flip=flip)

        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=DC(to_tensor(ref_img), stack=True),
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=DC(to_tensor(ref_bboxes)))
        # optional fields, controlled by the constructor flags
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            data['gt_pids'] = DC(to_tensor(gt_pids))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        return data

    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping).

        Args:
            idx (tuple): ``(vid, frame_id)`` pair. ``vid`` indexes
                ``self.vid_infos``; ``frame_id`` indexes the video's
                ``filenames_all`` list when ``self.every_frame`` is set,
                otherwise its ``filenames`` list.

        Returns:
            dict: ``img`` (list of image tensors) and ``img_meta`` (list of
                ``DC``-wrapped meta dicts). There is one entry per scale in
                ``self.img_scales``, followed by a flipped entry for that
                scale when ``self.flip_ratio > 0``. Proposals are not part
                of the returned dict.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        if self.every_frame:
            file_name = vid_info['filenames_all'][frame_id]
            # a frame is "annotated" only if it also appears in `filenames`
            is_anno = file_name in vid_info['filenames']
        else:
            file_name = vid_info['filenames'][frame_id]
            is_anno = True
        img = mmcv.imread(osp.join(self.img_prefix, file_name))
        # no proposals are loaded at test time in this implementation
        proposal = None

        def prepare_single(img,
                           frame_id,
                           scale,
                           flip,
                           file_name,
                           proposal=None,
                           is_anno=True):
            """Transform one image at one scale/flip setting."""
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'], vid_info['width'],
                                        3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             file_name=file_name,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip,
                             is_anno=is_anno)
            if proposal is None:
                return _img, _img_meta, None
            # split off the optional score column before transforming boxes
            if proposal.shape[1] == 5:
                score = proposal[:, 4, None]
                proposal = proposal[:, :4]
            else:
                score = None
            _proposal = self.bbox_transform(proposal, img_shape, scale_factor,
                                            flip)
            if score is not None:
                _proposal = np.hstack([_proposal, score])
            return _img, _img_meta, to_tensor(_proposal)

        imgs = []
        img_metas = []
        # one un-flipped variant per scale, plus a flipped one if enabled
        flips = (False, True) if self.flip_ratio > 0 else (False, )
        for scale in self.img_scales:
            for flip in flips:
                # frame_id must be passed explicitly: omitting it shifts
                # every following positional argument by one
                _img, _img_meta, _ = prepare_single(img, frame_id, scale,
                                                    flip, file_name, proposal,
                                                    is_anno)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
        data = dict(img=imgs, img_meta=img_metas)
        return data

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotations of a single frame.

        Args:
            ann_info (list[dict]): Per-object annotations. Each dict holds
                per-frame lists under ``bboxes``, ``areas`` and
                ``segmentations`` plus ``id``, ``category_id`` and
                ``iscrowd``.
            frame_id (int): Index into the per-frame lists.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, labels,
                obj_ids, bboxes_ignore and, when ``with_mask`` is set,
                masks, mask_polys, poly_lens. ``masks`` and ``mask_polys``
                have one entry per row of ``bboxes``.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for ann in ann_info:
            # each ann describes one object across all frames:
            #   bboxes / segmentations / areas are per-frame lists
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            # object is not present in this frame
            if bbox is None:
                continue
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            # convert (x, y, w, h) to inclusive (x1, y1, x2, y2)
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                # crowd regions only contribute an ignore box; adding their
                # masks would misalign masks with bboxes/labels/obj_ids
                gt_bboxes_ignore.append(bbox)
                continue
            gt_bboxes.append(bbox)
            gt_ids.append(ann['id'])
            gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                mask_polys = [
                    p for p in segm if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(len(p) for p in mask_polys)
        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann

Пример #3

Показать файл

Файл: ytvos_tasuf.py Проект: janghyuk-choi/TASUF

class YTVOSDatasetTASUF(CustomDataset):
    """YouTube-VOS style video dataset yielding a frame plus reference frames.

    Samples are addressed by ``(video index, frame index)`` pairs stored in
    ``self.img_ids``. In training mode a sample bundles the current frame
    with a randomly drawn sequence of 1 to 8 reference frames of the same
    video; in test mode it bundles multi-scale (optionally flipped) versions
    of a single frame.
    """

    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl',
               'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant',
               'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle',
               'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False):
        """Load annotations and set up the transforms.

        Args:
            ann_file: Annotation file handed to ``YTVOS``.
            img_prefix: Directory prefix joined with each frame file name.
            img_scale (tuple | list[tuple]): One scale or a list of scales.
            img_norm_cfg (dict): Keyword arguments for ``ImageTransform``.
            size_divisor: Forwarded to ``ImageTransform``.
            proposal_file: Optional file passed to ``load_proposals``.
            num_max_proposals (int): Max proposals kept per image.
            flip_ratio (float): Flip probability in training, in [0, 1]; any
                value > 0 enables flipped variants at test time.
            with_mask (bool): Whether training samples carry ``gt_masks``.
            with_crowd (bool): Whether to return ``gt_bboxes_ignore``.
            with_label (bool): Whether to return ``gt_labels``.
            with_track (bool): Whether to return ``gt_pids``.
            extra_aug (dict | None): Keyword arguments for
                ``ExtraAugmentation``.
            aug_ref_bbox_param (sequence | None): ``(center_off,
                size_perturb)`` used by ``bbox_aug`` on reference boxes.
            resize_keep_ratio (bool): Forwarded as ``keep_ratio`` to the
                image transform.
            test_mode (bool): Skip annotation filtering and group flags.
        """
        # largest allowed gap between consecutive sampled reference frames
        self.max_gap = 3

        # prefix of images path
        self.img_prefix = img_prefix

        # load annotations (and proposals)
        self.vid_infos = self.load_annotations(ann_file)
        self.img_ids = [(vid_idx, frame_id)
                        for vid_idx, vid_info in enumerate(self.vid_infos)
                        for frame_id in range(len(vid_info['filenames']))]
        if proposal_file is not None:
            self.proposals = self.load_proposals(proposal_file)
        else:
            self.proposals = None
        # filter images with no annotation during training
        if not test_mode:
            self.img_ids = [
                (v, f) for v, f in self.img_ids
                if len(self.get_ann_info(v, f)['bboxes'])
            ]

        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg

        # max proposals per image
        self.num_max_proposals = num_max_proposals
        # flip ratio
        self.flip_ratio = flip_ratio
        assert 0 <= flip_ratio <= 1
        # padding border to ensure the image size can be divided by
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor

        # with mask or not
        self.with_mask = with_mask
        # some datasets provide bbox annotations as ignore/crowd/difficult,
        # if `with_crowd` is True, then these info is returned.
        self.with_crowd = with_crowd
        # with label is False for RPN
        self.with_label = with_label
        self.with_track = with_track
        # params for augmenting bbox in the reference frame
        self.aug_ref_bbox_param = aug_ref_bbox_param
        # in test mode or not
        self.test_mode = test_mode

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()
        # transforms
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # if use extra augmentation
        if extra_aug is not None:
            self.extra_aug = ExtraAugmentation(**extra_aug)
        else:
            self.extra_aug = None

        # image rescale if keep ratio
        self.resize_keep_ratio = resize_keep_ratio

        # [JH]
        self.max_bboxes_per_frame = 0

    def __len__(self):
        """Return the number of ``(video, frame)`` samples."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Return the test sample, or a usable training sample near ``idx``.

        When the training sample at ``idx`` is rejected
        (``prepare_train_img`` returns ``None``), the following sample is
        used instead, or the preceding one when ``idx`` is the last index.
        """
        if self.test_mode:
            return self.prepare_test_img(self.img_ids[idx])
        data = self.prepare_train_img(self.img_ids[idx])
        if data is None:
            # pick the neighbour by bounds check instead of catching errors
            alt = idx + 1 if idx + 1 < len(self.img_ids) else idx - 1
            # reference frames are sampled randomly, so a retry on the same
            # neighbour can succeed where an earlier attempt was rejected
            while data is None:
                data = self.prepare_train_img(self.img_ids[alt])
        return data

    def load_annotations(self, ann_file):
        """Load the YTVOS annotation file and return the list of video infos.

        Also sets ``self.ytvos``, ``self.cat_ids``, ``self.cat2label``
        (category id -> 1-based label) and ``self.vid_ids``.
        """
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        self.cat2label = {
            cat_id: label
            for label, cat_id in enumerate(self.cat_ids, start=1)
        }
        self.vid_ids = self.ytvos.getVidIds()
        vid_infos = []
        for vid_id in self.vid_ids:
            info = self.ytvos.loadVids([vid_id])[0]
            # expose the file list under the key used throughout this class
            info['filenames'] = info['file_names']
            vid_infos.append(info)
        return vid_infos

    def get_ann_info(self, idx, frame_id):
        """Return parsed annotations of frame ``frame_id`` in video ``idx``."""
        vid_id = self.vid_infos[idx]['id']
        ann_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        ann_info = self.ytvos.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i, (vid_idx, _) in enumerate(self.img_ids):
            vid_info = self.vid_infos[vid_idx]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        """Randomly shift and rescale boxes, clipped to the image.

        Args:
            bbox (ndarray): Boxes as rows of ``x1, y1, x2, y2``.
            img_size (tuple): Image shape; index 0 is used as the y limit
                and index 1 as the x limit.

        Returns:
            ndarray: Perturbed boxes as float32, same layout as ``bbox``.
        """
        assert self.aug_ref_bbox_param is not None
        center_off = self.aug_ref_bbox_param[0]
        size_perturb = self.aug_ref_bbox_param[1]

        n_bb = bbox.shape[0]
        # bbox center offset, uniform in [-center_off, center_off]
        center_offs = (2 * np.random.rand(n_bb, 2) - 1) * center_off
        # bbox resize ratios, uniform in [1 - size_perturb, 1 + size_perturb]
        resize_ratios = (2 * np.random.rand(n_bb, 2) - 1) * size_perturb + 1
        # bbox: x1, y1, x2, y2
        centers = (bbox[:, :2] + bbox[:, 2:]) / 2.
        sizes = bbox[:, 2:] - bbox[:, :2]
        new_centers = centers + center_offs * sizes
        new_sizes = sizes * resize_ratios
        new_x1y1 = new_centers - new_sizes / 2.
        new_x2y2 = new_centers + new_sizes / 2.
        c_min = [0, 0]
        c_max = [img_size[1], img_size[0]]
        new_x1y1 = np.clip(new_x1y1, c_min, c_max)
        new_x2y2 = np.clip(new_x2y2, c_min, c_max)
        bbox = np.hstack((new_x1y1, new_x2y2)).astype(np.float32)
        return bbox

    def sample_ref(self, idx):
        """Sample another valid frame of the same video as reference.

        Returns:
            tuple: A ``(vid, frame_id)`` pair taken from ``self.img_ids``
                whose frame differs from the one in ``idx``.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        valid_samples = [
            (vid, i) for i in range(len(vid_info['filenames']))
            # check if the frame id is valid
            if i != frame_id and (vid, i) in self.img_ids
        ]
        assert len(valid_samples) > 0
        return random.choice(valid_samples)

    # sampling ref sequence for TASUF
    # sequence length from 1 to 8
    # sequence direction => backward or forward
    def sample_ref_seq(self, idx):
        """Sample a sequence of 1 to 8 reference frame ids for ``idx``.

        The sequence is drawn either from ``[frame_id, vid_len)`` and
        returned in descending order, or from ``[0, frame_id)`` and returned
        in ascending order. The first range is forced when fewer than
        ``seq_len`` frames precede ``frame_id``, the second when
        ``frame_id + seq_len`` exceeds the video length; otherwise the
        choice is random.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        vid_len = len(vid_info['filenames'])
        seq_len = random.randint(1, 8)
        if frame_id < seq_len:
            from_tail = True
        elif frame_id + seq_len > vid_len:
            from_tail = False
        else:
            from_tail = random.random() < 0.5
        if from_tail:
            return self.sample_ref_range(frame_id,
                                         vid_len,
                                         seq_len,
                                         backward=True)
        return self.sample_ref_range(0, frame_id, seq_len, backward=False)

    def sample_ref_range(self, start, end, seq_len, backward=False):
        """Sample ``seq_len`` sorted frame ids from ``range(start, end)``.

        When the range holds fewer than ``seq_len`` ids it is repeated until
        it is long enough, so ids may occur more than once. Gaps larger than
        ``self.max_gap`` between consecutive ids are shrunk to ``max_gap``.

        Args:
            start (int): First candidate frame id.
            end (int): One past the last candidate frame id.
            seq_len (int): Number of ids to draw.
            backward (bool): Return the ids in descending order.

        Raises:
            ValueError: If ``range(start, end)`` is empty.
        """
        sample_range = list(range(start, end))
        if not sample_range:
            # doubling an empty list would never terminate
            raise ValueError(
                'cannot sample reference frames from empty range '
                '[{}, {})'.format(start, end))
        while len(sample_range) < seq_len:
            sample_range *= 2
        valid_samples = random.sample(sample_range, seq_len)
        valid_samples.sort()
        # [JW] pull later ids closer whenever a gap exceeds max_gap
        for i, v in enumerate(valid_samples[:-1]):
            if valid_samples[i + 1] - v > self.max_gap:
                gap_modulation = valid_samples[i + 1] - v - self.max_gap
                for j in range(i + 1, len(valid_samples)):
                    valid_samples[j] -= gap_modulation
        if backward:
            valid_samples.reverse()
        return valid_samples

    def prepare_train_img(self, idx):
        """Prepare a frame together with a sampled reference-frame sequence.

        Args:
            idx (tuple): ``(vid, frame_id)`` pair.

        Returns:
            dict | None: Training sample with ``img``, ``ref_img`` (list),
                ``img_meta``, ``gt_bboxes``, ``ref_bboxes`` (list) and,
                depending on the ``with_*`` flags and proposals, further
                keys. ``None`` when the frame has no proposals, the frame
                has no valid gt box, or any sampled reference frame has no
                gt box.

        Raises:
            AssertionError: If proposals do not have 4 or 5 columns.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        filenames = vid_info['filenames']
        # load image
        img = mmcv.imread(osp.join(self.img_prefix, filenames[frame_id]))

        ref_frame_id_seq = self.sample_ref_seq(idx)
        ref_img_seq = [
            mmcv.imread(osp.join(self.img_prefix, filenames[ref_frame_id]))
            for ref_frame_id in ref_frame_id_seq
        ]

        # load proposals if necessary
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if proposals.shape[1] not in (4, 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None

        ann = self.get_ann_info(vid, frame_id)
        ref_ann_seq = [
            self.get_ann_info(vid, ref_frame_id)
            for ref_frame_id in ref_frame_id_seq
        ]

        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']

        ref_bboxes_seq = []
        for ref_ann in ref_ann_seq:
            ref_bboxes = ref_ann['bboxes']
            # every reference frame must contain at least one box
            if len(ref_bboxes) == 0:
                return None
            ref_bboxes_seq.append(ref_bboxes)

        ref_ids_seq = [ref_ann['obj_ids'] for ref_ann in ref_ann_seq]
        gt_ids = ann['obj_ids']
        # compute matching of reference frames with current frame:
        # every object id seen in any reference frame gets a 1-based index
        # into the sorted id list; 0 denotes there is no matching
        id_set = sorted(set().union(*ref_ids_seq))
        pid_of = {obj_id: pos + 1 for pos, obj_id in enumerate(id_set)}
        gt_pids_seq = [[pid_of[i] for i in ref_ids]
                       for ref_ids in ref_ids_seq]
        # the current frame's ids come last
        gt_pids_seq.append([pid_of.get(i, 0) for i in gt_ids])

        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms
        flip = bool(np.random.rand() < self.flip_ratio)
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        img = img.copy()

        # all reference frames share the scale and flip of the current
        # frame; the shape/scale factor of the last one is reused below
        for i, ref_img in enumerate(ref_img_seq):
            ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
                ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
            ref_img_seq[i] = ref_img.copy()

        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            if scores is not None:
                proposals = np.hstack([proposals, scores])
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        ref_bboxes_seq = [
            self.bbox_transform(ref_bboxes, ref_img_shape, ref_scale_factor,
                                flip) for ref_bboxes in ref_bboxes_seq
        ]
        if self.aug_ref_bbox_param is not None:
            ref_bboxes_seq = [
                self.bbox_aug(ref_bboxes, ref_img_shape)
                for ref_bboxes in ref_bboxes_seq
            ]

        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        flip=flip)

        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=[
                        DC(to_tensor(ref_img), stack=True)
                        for ref_img in ref_img_seq
                    ],
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=[
                        DC(to_tensor(ref_bboxes))
                        for ref_bboxes in ref_bboxes_seq
                    ])
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            data['gt_pids'] = [
                DC(to_tensor(gt_pids)) for gt_pids in gt_pids_seq
            ]
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)

        return data

    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping).

        Args:
            idx (tuple): ``(vid, frame_id)`` pair.

        Returns:
            dict: ``img`` (list of image tensors) and ``img_meta`` (list of
                ``DC``-wrapped meta dicts). There is one entry per scale in
                ``self.img_scales``, followed by a flipped entry for that
                scale when ``self.flip_ratio > 0``. Proposals are not part
                of the returned dict.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        # no proposals are loaded at test time in this implementation
        proposal = None

        def prepare_single(img, frame_id, scale, flip, proposal=None):
            """Transform one image at one scale/flip setting."""
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'], vid_info['width'],
                                        3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip)
            if proposal is None:
                return _img, _img_meta, None
            # split off the optional score column before transforming boxes
            if proposal.shape[1] == 5:
                score = proposal[:, 4, None]
                proposal = proposal[:, :4]
            else:
                score = None
            _proposal = self.bbox_transform(proposal, img_shape, scale_factor,
                                            flip)
            if score is not None:
                _proposal = np.hstack([_proposal, score])
            return _img, _img_meta, to_tensor(_proposal)

        imgs = []
        img_metas = []
        # one un-flipped variant per scale, plus a flipped one if enabled
        flips = (False, True) if self.flip_ratio > 0 else (False, )
        for scale in self.img_scales:
            for flip in flips:
                # frame_id must be passed explicitly: omitting it shifts
                # every following positional argument by one
                _img, _img_meta, _ = prepare_single(img, frame_id, scale,
                                                    flip, proposal)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
        data = dict(img=imgs, img_meta=img_metas)
        return data

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotations of a single frame.

        Args:
            ann_info (list[dict]): Per-object annotations. Each dict holds
                per-frame lists under ``bboxes``, ``areas`` and
                ``segmentations`` plus ``id``, ``category_id`` and
                ``iscrowd``.
            frame_id (int): Index into the per-frame lists.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, labels,
                obj_ids, bboxes_ignore and, when ``with_mask`` is set,
                masks, mask_polys, poly_lens. ``masks`` and ``mask_polys``
                have one entry per row of ``bboxes``.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for ann in ann_info:
            # each ann describes one object across all frames:
            #   bboxes / segmentations / areas are per-frame lists
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            # object is not present in this frame
            if bbox is None:
                continue
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            # convert (x, y, w, h) to inclusive (x1, y1, x2, y2)
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                # crowd regions only contribute an ignore box; adding their
                # masks would misalign masks with bboxes/labels/obj_ids
                gt_bboxes_ignore.append(bbox)
                continue
            gt_bboxes.append(bbox)
            gt_ids.append(ann['id'])
            gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                mask_polys = [
                    p for p in segm if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(len(p) for p in mask_polys)
        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann

Пример #4

Показать файл

class YoutubeVIS(data.Dataset):
    """`YoutubeVIS <https://youtube-vos.org/dataset/vis/>`_ Dataset.

    Each item is a short sequence of frames sampled from one video, not a
    single image.

    Args:
        image_path (string): Root directory that the file names from the
            annotation file are joined onto when a frame is read.
        info_file (string): Annotation file handed to
            ``pycocotools.ytvos.YTVOS``.
        configs: Settings object read while sampling frames (for example
            ``images_per_video`` and ``use_all_frames``).
        transform (callable, optional): A function/transform that augments the
            raw images.
        target_transform (callable, optional): A function/transform that takes
            in the target (bbox) and transforms it.
        dataset_name (string): Name stored on the dataset as ``name``.
        has_gt (bool): Whether annotations are loaded for the sampled videos.
            When false, the dataset lists every video in the annotation file
            and uses an empty target list.
    """
    def __init__(self,
                 image_path,
                 info_file,
                 configs,
                 transform=None,
                 target_transform=YoutubeVISAnnotationTransform(),
                 dataset_name='YouTube VIS',
                 has_gt=True):
        """Build the annotation index and record the dataset settings.

        Args:
            image_path: Stored as ``self.root``.
            info_file: Passed to ``YTVOS`` to build the annotation index.
            configs: Stored as ``self.configs``.
            transform: Stored as ``self.transform``.
            target_transform: Stored as ``self.target_transform``. Note that
                the default instance is created once, when the ``def`` is
                evaluated, and is shared by every dataset that relies on it.
            dataset_name: Stored as ``self.name``.
            has_gt: When false, ``self.ids`` lists every video known to the
                index instead of only the videos that have annotations.
        """
        # Imported here rather than at module level, as in the original code
        # ("we have too many things named COCO").
        from pycocotools.ytvos import YTVOS

        self.root = image_path
        self.configs = configs
        self.transform = transform
        self.target_transform = target_transform
        self.name = dataset_name
        self.has_gt = has_gt

        log = logging.getLogger("yolact.dataset")
        log.info('Loading annotations into memory...')
        started = time.time()
        # Anything YTVOS prints while building its index is captured and
        # discarded.
        with contextlib.redirect_stdout(io.StringIO()):
            self.coco = YTVOS(info_file)

        # Prefer the videos that have annotations; fall back to all videos
        # when there are none or when ground truth is not wanted.
        annotated_ids = list(self.coco.vidToAnns.keys())
        if annotated_ids and has_gt:
            self.ids = annotated_ids
        else:
            self.ids = list(self.coco.vids.keys())

        elapsed = time.time() - started
        log.info('{} videos loaded in {:0.2f}s.'.format(len(self.ids),
                                                        elapsed))

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (image, (target, masks, num_crowds)).
                   target is the object returned by ``coco.loadAnns``.
        """
        video_frames, extra_data = self.pull_video(index)
        video_frames = [(
            im,
            (gt, masks, num_crowds),
        ) for im, gt, masks, h, w, num_crowds in video_frames]
        return video_frames, extra_data

    def pull_video(self,
                   index,
                   return_on_failure=False,
                   full_video=False,
                   max_images=-1):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (image, target, masks, height, width, crowd).
                   target is the object returned by ``coco.loadAnns``.
            Note that if no crowd annotations exist, crowd will be None
        """
        vid_id = self.ids[index]

        seq_len = self.configs.images_per_video

        # sample vid_id with enough length
        while True:
            vid = self.coco.loadVids(vid_id)[0]
            annot_length = len(vid['file_names'])
            if not full_video and annot_length < seq_len:
                continue  # FIXME: need to set new vid_id right?
            vid_name = vid['file_names'][0].split('/')[0]

            # Generate target starts.
            if self.has_gt:
                target = self.coco.vidToAnns[vid_id]
                ann_ids = self.coco.getAnnIds(vidIds=vid_id)

                # Target has {'segmentation', 'area', iscrowd', 'image_id', 'bboxes', 'category_id'}
                target = self.coco.loadAnns(ann_ids)
            else:
                target = []

            # Separate out crowd annotations. These are annotations that signify a large crowd of
            # objects of said class, where there is no annotation for each individual object. Both
            # during testing and training, consider these crowds as neutral.
            crowd = [x for x in target if ('iscrowd' in x and x['iscrowd'])]
            target = [
                x for x in target if not ('iscrowd' in x and x['iscrowd'])
            ]
            num_crowds = len(crowd)

            for x in crowd:
                x['category_id'] = -1

            # This is so we ensure that all crowd annotations are at the end of the array
            target += crowd
            # Generate target ends.

            # shuffling and sample a small range of video here
            if full_video:
                annot_idx = np.arange(0, annot_length, 1)
                frame_idx = np.asarray([
                    int(vid['file_names'][idx][-9:-4])
                    for idx in range(annot_length)
                ])
                if self.configs.use_all_frames:
                    key_frame_idx = frame_idx
                    frame_idx = np.arange(frame_idx[0], frame_idx[-1] + 1, 1)
                    have_annot = np.asarray(
                        [int(idx in key_frame_idx) for idx in frame_idx])
                    annot_idx = np.add.accumulate(have_annot) * have_annot - 1

                if max_images != -1:
                    eval_frames = min(max_images, len(frame_idx))
                    # start_idx = np.random.randint(0, len(frame_idx) - eval_frames + 1)
                    start_idx = 0
                    frame_idx = frame_idx[start_idx:start_idx + eval_frames]
                    annot_idx = annot_idx[start_idx:start_idx + eval_frames]
            elif self.configs.use_all_frames:
                rand_idx = np.arange(0, annot_length - seq_len)
                np.random.shuffle(rand_idx)

                direction = 1
                if self.configs.all_frame_direction == 'allway':
                    if np.random.rand() > 0.5: direction *= -1
                elif self.configs.all_frame_direction == 'forward':
                    # Note: forward warping needs to sample a 'previous frame'
                    direction *= -1
                elif self.configs.all_frame_direction == 'backward':
                    pass
                else:
                    raise ValueError("Unexpected frame direction: %s" %
                                     self.configs.all_frame_direction)

                start_idx = rand_idx[0]
                if direction < 0:
                    start_idx += self.configs.images_per_video
                start_frame_idx = int(vid['file_names'][start_idx][-9:-4])
                annot_idx = [start_idx]
                frame_idx = [start_frame_idx]

                # if self.configs.images_per_video > 1:
                #     num_extra_frames = self.configs.images_per_video - 1
                #     extra_annot_idx = [start_idx + direction * offset_idx
                #                        for offset_idx in range(1, num_extra_frames + 1)]
                #     extra_frame_idx = [int(vid['file_names'][extra_idx][-9:-4])
                #                        for extra_idx in extra_annot_idx]
                #
                #     annot_idx += extra_annot_idx
                #     frame_idx += extra_frame_idx

                extra_frame_idx = []
                extra_annot_idx = []
                if self.configs.images_per_video > 0:
                    offset_lb, offset_ub = self.configs.frame_offset_lb, self.configs.frame_offset_ub
                    lb, ub = int(vid['file_names'][0][-9:-4]), int(
                        vid['file_names'][-1][-9:-4])
                    fidx = frame_idx[-1]
                    lb, ub = lb - fidx, ub - fidx
                    if direction == -1:
                        ub = -offset_lb
                        lb = max(lb, -offset_ub)
                    else:
                        lb = offset_lb
                        ub = min(ub, offset_ub)
                    assert lb <= ub + 1, "{}, {}".format(lb, ub)
                    assert self.configs.frame_offset_multiplier == 1, "frame_offset_multiplier deprecated."
                    for _ in range(self.configs.images_per_video):
                        frame_diff = np.random.randint(lb, ub + 1)
                        ref_idx = fidx + frame_diff
                        assert int(
                            vid['file_names'][0][-9:-4]) <= ref_idx <= int(
                                vid['file_names'][-1]
                                [-9:-4]), "{} <= {} <= {}".format(
                                    int(vid['file_names'][0][-9:-4]), ref_idx,
                                    int(vid['file_names'][-1][-9:-4]))
                        # frame_diff = self.configs.frame_offset_multiplier * np.random.randint(self.configs.frame_offset_lb, self.configs.frame_offset_ub + 1)
                        # ref_idx = np.clip(frame_idx[-1] + frame_diff * direction,
                        #                   int(vid['file_names'][0][-9:-4]), int(vid['file_names'][-1][-9:-4]))
                        extra_frame_idx += [ref_idx]
                        extra_annot_idx += [-1]

                extra_frame_idx = list(sorted(extra_frame_idx, reverse=True))

                annot_idx += extra_annot_idx
                frame_idx += extra_frame_idx
                annot_idx = np.asarray(annot_idx)
                frame_idx = np.asarray(frame_idx)
            else:
                rand_idx = np.arange(0, annot_length - seq_len + 1)
                np.random.shuffle(rand_idx)
                start_idx = rand_idx[0]

                annot_idx = np.arange(start_idx, start_idx + seq_len, 1)
                frame_idx = np.asarray(
                    [int(vid['file_names'][idx][-9:-4]) for idx in annot_idx])

            has_targets = all([
                self.target_in_frame(target, annot_id, true_on_reference=True)
                for annot_id in annot_idx
            ])
            if has_targets: break
            if return_on_failure: return None
            # print("Not all frame of video %s[%d-%d] has targets, re-selecting video." %
            #       (vid['file_names'][0].split('/')[0], start_idx, start_idx + frm_len))
            index = np.random.randint(len(self))
            vid_id = self.ids[index]

        frame_results = []
        extra_data = []

        while True:
            try:
                for idx, (frame_id, annot_id) in enumerate(
                        zip(frame_idx.tolist(), annot_idx.tolist())):
                    extra = {}
                    # FIXME: little bit hacky for full frames, maybe fix this using annotation files
                    frame_id_str = "%05d" % frame_id
                    file_name = vid['file_names'][0]
                    file_name = file_name[:-9] + frame_id_str + file_name[-4:]
                    prev_frame_id = frame_idx[idx - 1] if idx > 0 else -1
                    prev_annot_id = annot_idx[idx - 1] if idx > 0 else -1
                    if idx == 0:
                        seeds, (im, gt, masks, h, w,
                                num_crowds) = self.pull_frame(
                                    vid_name, (frame_id, annot_id),
                                    (prev_frame_id, prev_annot_id),
                                    file_name,
                                    target,
                                    num_crowds,
                                    require_seeds=True)
                    else:
                        im, gt, masks, h, w, num_crowds = self.pull_frame(
                            vid_name, (frame_id, annot_id),
                            (prev_frame_id, prev_annot_id),
                            file_name,
                            target,
                            num_crowds,
                            seeds=seeds)

                    extra['idx'] = (
                        frame_id,
                        annot_id,
                    )
                    frame_results.append((
                        im,
                        gt,
                        masks,
                        h,
                        w,
                        num_crowds,
                    ))
                    extra_data.append(extra)
            except ValueError as e:
                logger = logging.getLogger("yolact.dataset")
                logger.warning('Resampling with reseed signal...')
                frame_results.clear()
                extra_data.clear()
                continue
            break

        return frame_results, extra_data

    def __len__(self):
        return len(self.ids)

    @staticmethod
    def target_in_frame(target, frame_id, true_on_reference=False):
        if frame_id < 0:
            return true_on_reference
        if len(target) > 0:
            for obj in target:
                if obj['segmentations'][frame_id] is not None:
                    return True
        return False

    def pull_frame(self,
                   vid_name,
                   frame_annot_id,
                   prev_frame_annot_id,
                   file_name,
                   target,
                   num_crowds,
                   require_seeds=False,
                   seeds=None):
        """Load one frame from disk and apply the configured transforms.

        Args:
            vid_name: Not used by this method.
            frame_annot_id (tuple): ``(frame_id, annot_id)``. Only ``annot_id``
                is used, as the index into each annotation's
                ``'segmentations'``.
            prev_frame_annot_id (tuple): Not used by this method.
            file_name (str): Image path relative to ``self.root``.
            target (list): Annotations of the video.
            num_crowds (int): Crowd count handed to the transform together
                with the labels.
            require_seeds (bool): Also return ``seeds`` (see Returns). Passed
                through to a transform whose class name contains ``"Video"``.
            seeds: Passed through to a transform whose class name contains
                ``"Video"``.
        Returns:
            tuple: ``(image, target, masks, height, width, num_crowds)``, where
            ``image`` is a tensor built from the frame with its axes permuted
            to ``(2, 0, 1)``. When a transform runs on a frame without targets,
            ``target`` and ``masks`` are ``None``. When ``require_seeds`` is
            true the result is ``(seeds, that_tuple)``.
        Raises:
            ValueError: ``"reseed"`` when a "Video" transform leaves a frame
                with no ground truth.
        """
        _, annot_id = frame_annot_id
        path = osp.join(self.root, file_name)
        assert osp.exists(path), 'Image path does not exist: {}'.format(path)

        img = cv2.imread(path)
        height, width, _ = img.shape

        target_is_in_frame = self.target_in_frame(target, annot_id)

        # Bound up front: a frame without targets and no transform would
        # otherwise reach the return statement with `masks` undefined.
        masks = None
        if target_is_in_frame:
            # Pool all the masks for this image into one [num_objects,height,width] matrix
            masks = [
                self.coco.annToMask(obj, annot_id).reshape(-1)
                for obj in target if obj['segmentations'][annot_id] is not None
            ]
            masks = np.vstack(masks)
            masks = masks.reshape(-1, height, width)

        if self.target_transform is not None and target_is_in_frame:
            target = self.target_transform(target, annot_id, width, height)

        if self.transform is not None:
            if "Video" in type(self.transform).__name__:
                if target_is_in_frame:
                    target = np.array(target)
                    return_transform = self.transform(
                        img,
                        masks,
                        target[:, :4], {
                            'num_crowds': num_crowds,
                            'labels': target[:, 4]
                        },
                        require_seeds=require_seeds,
                        seeds=seeds)

                    if require_seeds:
                        seeds, (img, masks, boxes, labels) = return_transform
                    else:
                        img, masks, boxes, labels = return_transform

                    # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                    num_crowds = labels['num_crowds']
                    labels = labels['labels']

                    target = np.hstack((boxes, np.expand_dims(labels, axis=1)))

                    if target.shape[0] == 0:
                        logger = logging.getLogger("yolact.dataset")
                        logger.warning(
                            'Augmentation output an example with no ground truth. Resampling...'
                        )
                        raise ValueError("reseed")
                else:
                    # No targets: run the transform on a placeholder mask, box
                    # and label so the image is still augmented.
                    try:
                        return_transform = self.transform(
                            img,
                            # np.float was removed in NumPy 1.24; it was an
                            # alias of the builtin float (float64).
                            np.zeros((1, height, width), dtype=np.float64),
                            np.array([[0., 0., 1., 1.]]), {
                                'num_crowds': 0,
                                'labels': np.array([0])
                            },
                            require_seeds=require_seeds,
                            seeds=seeds)
                    except ValueError:
                        # Raised explicitly so the check survives `python -O`.
                        raise AssertionError(
                            "Unexpected reseed captured with no-target instances."
                        )

                    if require_seeds:
                        seeds, (img, _, _, _) = return_transform
                    else:
                        img, _, _, _ = return_transform

                    masks = None
                    target = None
            else:
                if target_is_in_frame:
                    target = np.array(target)
                    img, masks, boxes, labels = self.transform(
                        img, masks, target[:, :4], {
                            'num_crowds': num_crowds,
                            'labels': target[:, 4]
                        })

                    # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                    num_crowds = labels['num_crowds']
                    labels = labels['labels']

                    target = np.hstack((boxes, np.expand_dims(labels, axis=1)))
                else:
                    img, _, _, _ = self.transform(
                        img, np.zeros((1, height, width), dtype=np.float64),
                        np.array([[0, 0, 1, 1]]), {
                            'num_crowds': 0,
                            'labels': np.array([0])
                        })
                    masks = None
                    target = None

        return_tuple = torch.from_numpy(img).permute(
            2, 0, 1), target, masks, height, width, num_crowds
        if require_seeds:
            return seeds, return_tuple
        else:
            return return_tuple

    def pull_image(self, index):
        '''Read the untransformed image for entry ``index`` with OpenCV.

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        NOTE(review): the id comes from ``self.ids``, which ``pull_video``
        treats as video ids and resolves with ``loadVids``; confirm that the
        object in ``self.coco`` provides ``loadImgs`` for these ids.

        Argument:
            index (int): index of img to show
        Return:
            cv2 img
        '''
        record = self.coco.loadImgs(self.ids[index])[0]
        image_path = osp.join(self.root, record['file_name'])
        return cv2.imread(image_path, cv2.IMREAD_COLOR)

    def pull_anno(self, index):
        '''Returns the original annotation of image at index

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to get annotation of
        Return:
            list:  [img_id, [(label, bbox coords),...]]
                eg: ('001718', [('dog', (96, 13, 438, 332))])
        '''
        img_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        return self.coco.loadAnns(ann_ids)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(
            tmp,
            self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(
            tmp,
            self.target_transform.__repr__().replace('\n',
                                                     '\n' + ' ' * len(tmp)))
        return fmt_str