Example No. 1
def convert_boxlist(maskrcnn_boxlist):
    box_tensor = maskrcnn_boxlist.bbox
    size = maskrcnn_boxlist.size
    mode = maskrcnn_boxlist.mode
    bbox = BoxList(box_tensor, size, mode)
    for field in maskrcnn_boxlist.fields():
        bbox.add_field(field, maskrcnn_boxlist.get_field(field))
    return bbox
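
A minimal usage sketch (hypothetical setup: it assumes maskrcnn-benchmark is installed, that `BoxList` is importable from `maskrcnn_benchmark.structures.bounding_box`, and that the wrapped and returned box structures are the same class):

import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList  # assumed import path

# One detection with a score field attached.
raw = BoxList(torch.tensor([[10., 20., 50., 80.]]), (640, 480), mode="xyxy")
raw.add_field("scores", torch.tensor([0.9]))

copied = convert_boxlist(raw)
# The copy keeps the boxes, image size, mode, and every extra field.
assert copied.size == raw.size and copied.fields() == raw.fields()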
Example No. 2
    def forward_for_single_feature_map(self, anchors, objectness,
                                       box_regression):
        """
        Arguments:
            anchors: list[BoxList]
            objectness: tensor of size N, A, H, W
            box_regression: tensor of size N, A * 4, H, W
        """
        device = objectness.device
        N, A, H, W = objectness.shape

        # put in the same format as anchors
        objectness = objectness.permute(0, 2, 3, 1).reshape(N, -1)
        objectness = objectness.sigmoid()
        box_regression = box_regression.view(N, -1, 4, H,
                                             W).permute(0, 3, 4, 1, 2)
        box_regression = box_regression.reshape(N, -1, 4)

        num_anchors = A * H * W

        pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)
        objectness, topk_idx = objectness.topk(pre_nms_top_n,
                                               dim=1,
                                               sorted=True)

        batch_idx = torch.arange(N, device=device)[:, None]
        box_regression = box_regression[batch_idx, topk_idx]

        image_shapes = [box.size for box in anchors]
        concat_anchors = torch.cat([a.bbox for a in anchors], dim=0)
        concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx]

        proposals = self.box_coder.decode(box_regression.view(-1, 4),
                                          concat_anchors.view(-1, 4))

        proposals = proposals.view(N, -1, 4)

        result = []
        for proposal, score, im_shape in zip(proposals, objectness,
                                             image_shapes):
            boxlist = BoxList(proposal, im_shape, mode="xyxy")
            boxlist.add_field("objectness", score)
            boxlist = boxlist.clip_to_image(remove_empty=False)
            boxlist = remove_small_boxes(boxlist, self.min_size)
            boxlist = boxlist_nms(
                boxlist,
                self.nms_thresh,
                max_proposals=self.post_nms_top_n,
                score_field="objectness",
            )
            result.append(boxlist)
        return result
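
A standalone sketch of the layout change at the top of this method: the (N, A, H, W) objectness map is permuted to (N, H, W, A) and flattened so anchors are ordered location-major, then the per-image top-k deltas are gathered with a broadcast batch index. Toy sizes, pure torch, no assumptions about the surrounding class:

import torch

N, A, H, W = 2, 3, 4, 4                           # toy sizes
objectness = torch.randn(N, A, H, W)
box_regression = torch.randn(N, A * 4, H, W)

# (N, A, H, W) -> (N, H, W, A) -> (N, H*W*A): one score per anchor, location-major.
flat_scores = objectness.permute(0, 2, 3, 1).reshape(N, -1).sigmoid()

# (N, A*4, H, W) -> (N, H*W*A, 4): one 4-vector of deltas per anchor, same ordering.
flat_deltas = box_regression.view(N, -1, 4, H, W).permute(0, 3, 4, 1, 2).reshape(N, -1, 4)

pre_nms_top_n = min(10, A * H * W)
topk_scores, topk_idx = flat_scores.topk(pre_nms_top_n, dim=1, sorted=True)

# Broadcast (N, 1) batch indices against (N, k) top-k indices to gather per image.
batch_idx = torch.arange(N)[:, None]
topk_deltas = flat_deltas[batch_idx, topk_idx]    # shape (N, pre_nms_top_n, 4)
print(topk_scores.shape, topk_deltas.shape)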
Example No. 3
    def get_objects(self, idx, im_w, im_h):
        obj_boxes = self.return_null_box(im_w, im_h)
        if hasattr(self, 'det_objects'):
            boxes, box_score = self.det_objects[idx]

            if len(box_score) == 0:
                return obj_boxes
            obj_boxes_tensor = torch.as_tensor(boxes).reshape(-1, 4)
            obj_boxes = BoxList(obj_boxes_tensor, (im_w, im_h),
                                mode="xywh").convert("xyxy")

            scores = torch.as_tensor(box_score)
            obj_boxes.add_field("scores", scores)

        return obj_boxes
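
For reference, the xywh to xyxy conversion done by `.convert("xyxy")` essentially amounts to the arithmetic below. This is a minimal torch sketch of the idea, not the library's exact code (maskrcnn-benchmark also accounts for a one-pixel inclusive-coordinate offset):

import torch

boxes_xywh = torch.tensor([[10., 20., 30., 40.]])      # x, y, width, height
x, y, w, h = boxes_xywh.unbind(dim=1)
boxes_xyxy = torch.stack((x, y, x + w, y + h), dim=1)  # x_min, y_min, x_max, y_max
print(boxes_xyxy)                                      # tensor([[10., 20., 40., 60.]])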
Example No. 4
def prepare_pooled_feature(x_pooled, boxes, detach=True):
    image_shapes = [box.size for box in boxes]
    boxes_per_image = [len(box) for box in boxes]
    box_tensors = [a.bbox for a in boxes]

    if detach:
        x_pooled = x_pooled.detach()
    pooled_feature = x_pooled.split(boxes_per_image, dim=0)

    boxes_result = []
    for feature_per_image, box_tensor_per_image, image_shape in zip(
            pooled_feature, box_tensors, image_shapes):
        boxlist = BoxList(box_tensor_per_image, image_shape, mode="xyxy")
        boxlist.add_field("pooled_feature", feature_per_image)
        boxes_result.append(boxlist)
    return boxes_result
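
The key step above is `Tensor.split` with a list of per-image box counts, which turns the batch-flattened pooled features back into one chunk per image. A toy sketch:

import torch

boxes_per_image = [3, 1, 2]                          # detections per image
x_pooled = torch.randn(sum(boxes_per_image), 256)    # flattened pooled features

chunks = x_pooled.split(boxes_per_image, dim=0)
print([chunk.shape[0] for chunk in chunks])          # [3, 1, 2]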
Example No. 5
def construct_mm_proposals(imgs):
    bbox = torch.tensor([[0., 0., imgs.shape[2], imgs.shape[3]]],
                        dtype=torch.float32,
                        device=imgs.device)
    mimicking_proposals = [BoxList(bbox,
                                   [imgs.size(2), imgs.size(3)])] * len(imgs)
    return mimicking_proposals
Example No. 6
    def __getitem__(self, idx):
        img, anno = super(COCODataset, self).__getitem__(idx)

        # filter crowd annotations
        # TODO might be better to add an extra field
        anno = [obj for obj in anno if obj["iscrowd"] == 0]

        boxes = [obj["bbox"] for obj in anno]
        boxes = torch.as_tensor(boxes).reshape(-1, 4)  # guard against no boxes
        target = BoxList(boxes, img.size, mode="xywh").convert("xyxy")

        classes = [obj["category_id"] for obj in anno]
        classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
        classes = torch.tensor(classes)
        target.add_field("labels", classes)

        masks = [obj["segmentation"] for obj in anno]
        masks = SegmentationMask(masks, img.size)
        target.add_field("masks", masks)

        target = target.clip_to_image(remove_empty=True)

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target, idx
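
The "guard against no boxes" comment refers to the empty-annotation case: `torch.as_tensor([])` yields a 1-D tensor of size 0, and `reshape(-1, 4)` turns it into a valid (0, 4) box tensor that `BoxList` can accept. A tiny sketch:

import torch

boxes = []                                  # image with no (non-crowd) annotations
box_tensor = torch.as_tensor(boxes).reshape(-1, 4)
print(box_tensor.shape)                     # torch.Size([0, 4])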
Example No. 7
    def prepare_boxlist(self, boxes, scores, image_shape):
        """
        Returns a BoxList from `boxes` and adds the probability scores as an
        extra field.
        `boxes` has shape (#detections, 4 * #classes), where each row represents
        a list of predicted bounding boxes for each of the object classes in the
        dataset (including the background class). The detections in each row
        originate from the same object proposal.
        `scores` has shape (#detections, #classes), where each row represents a
        list of object detection confidence scores for each of the object classes
        in the dataset (including the background class). `scores[i, j]` corresponds
        to the box at `boxes[i, j * 4:(j + 1) * 4]`.
        """
        boxes = boxes.reshape(-1, 4)
        scores = scores.reshape(-1)
        boxlist = BoxList(boxes, image_shape, mode="xyxy")
        boxlist.add_field("scores", scores)
        return boxlist
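
A quick check of the reshape contract described in the docstring: after flattening, row `i * num_classes + j` of the box tensor lines up with element `i * num_classes + j` of the score vector (toy torch sketch):

import torch

num_det, num_classes = 2, 3
boxes = torch.arange(num_det * num_classes * 4, dtype=torch.float32)
boxes = boxes.reshape(num_det, num_classes * 4)
scores = torch.rand(num_det, num_classes)

flat_boxes = boxes.reshape(-1, 4)            # (num_det * num_classes, 4)
flat_scores = scores.reshape(-1)             # (num_det * num_classes,)

i, j = 1, 2
row = i * num_classes + j
assert torch.equal(flat_boxes[row], boxes[i, j * 4:(j + 1) * 4])
assert flat_scores[row] == scores[i, j]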
Example No. 8
    def __getitem__(self, idx):

        _, clip_info = self.clips_info[idx]

        # mov_id is the id in self.movie_info
        mov_id, timestamp = clip_info
        # movie_id is the human-readable youtube id.
        movie_id, movie_size = self.movie_info[mov_id]
        video_data = self._decode_video_data(movie_id, timestamp)

        im_w, im_h = movie_size

        if self.det_persons is None:
            # Note: during training we only use ground truth, so we should not provide
            # a box file; otherwise only the box file would be used.

            boxes, packed_act = self.anns[idx]

            boxes_tensor = torch.as_tensor(boxes, dtype=torch.float32).reshape(
                -1, 4)  # guard against no boxes
            boxes = BoxList(boxes_tensor, (im_w, im_h),
                            mode="xywh").convert("xyxy")

            # Decode the packed uint8 bits to one-hot. Since AVA has 80 classes, they fit
            # exactly into 10 bytes; otherwise some padding bits would have to be discarded.
            one_hot_label = np.unpackbits(packed_act, axis=1)
            one_hot_label = torch.as_tensor(one_hot_label, dtype=torch.uint8)

            boxes.add_field("labels", one_hot_label)

        else:
            boxes, box_score = self.det_persons[idx]
            boxes_tensor = torch.as_tensor(boxes).reshape(-1, 4)
            boxes = BoxList(boxes_tensor, (im_w, im_h),
                            mode="xywh").convert("xyxy")

        boxes = boxes.clip_to_image(remove_empty=True)
        # extra fields
        extras = {}

        if self.transforms is not None:
            video_data, boxes, transform_randoms = self.transforms(
                video_data, boxes)
            slow_video, fast_video = video_data

            objects = None
            if self.det_objects is not None:
                objects = self.get_objects(idx, im_w, im_h)
            if self.object_transforms is not None:
                objects = self.object_transforms(objects, transform_randoms)

            # add info necessary for the memory feature
            extras["movie_id"] = movie_id
            extras["timestamp"] = timestamp

            return slow_video, fast_video, boxes, objects, extras, idx

        return video_data, boxes, idx, movie_id, timestamp
Example No. 9
    def forward(self, x, boxes):
        """
        Arguments:
            x (Tensor): the mask logits
            boxes (list[BoxList]): bounding boxes that are used as
                reference, one for each image

        Returns:
            results (list[BoxList]): one BoxList for each image, containing
                the extra field mask
        """
        mask_prob = x.sigmoid()

        # select masks corresponding to the predicted classes
        num_masks = x.shape[0]
        labels = [bbox.get_field("labels") for bbox in boxes]
        labels = torch.cat(labels)
        index = torch.arange(num_masks, device=labels.device)
        mask_prob = mask_prob[index, labels][:, None]

        boxes_per_image = [len(box) for box in boxes]
        mask_prob = mask_prob.split(boxes_per_image, dim=0)

        if self.masker:
            mask_prob = self.masker(mask_prob, boxes)

        results = []
        for prob, box in zip(mask_prob, boxes):
            bbox = BoxList(box.bbox, box.size, mode="xyxy")
            for field in box.fields():
                bbox.add_field(field, box.get_field(field))
            bbox.add_field("mask", prob)
            results.append(bbox)

        return results
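
The class-selection step (`mask_prob[index, labels]`) can be isolated as below: for each detection, keep only the mask channel of its predicted label and add back a channel dimension (a minimal torch sketch with made-up sizes):

import torch

num_masks, num_classes, M = 4, 81, 28                  # made-up sizes
mask_logits = torch.randn(num_masks, num_classes, M, M)
labels = torch.tensor([1, 5, 5, 17])                   # predicted class per detection

index = torch.arange(num_masks)
mask_prob = mask_logits.sigmoid()[index, labels][:, None]
print(mask_prob.shape)                                 # torch.Size([4, 1, 28, 28])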
Example No. 10
    def forward(self, image_list, feature_maps):
        grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
        anchors_over_all_feature_maps = self.grid_anchors(grid_sizes)
        anchors = []
        for i, (image_height, image_width) in enumerate(image_list.image_sizes):
            anchors_in_image = []
            for anchors_per_feature_map in anchors_over_all_feature_maps:
                boxlist = BoxList(
                    anchors_per_feature_map, (image_width, image_height), mode="xyxy"
                )
                self.add_visibility_to(boxlist)
                anchors_in_image.append(boxlist)
            anchors.append(anchors_in_image)
        return anchors
Example No. 11
    def get_groundtruth(self, index):
        img_id = self.ids[index]
        anno = ET.parse(self._annopath % img_id).getroot()
        anno = self._preprocess_annotation(anno)

        height, width = anno["im_info"]
        target = BoxList(anno["boxes"], (width, height), mode="xyxy")
        target.add_field("labels", anno["labels"])
        target.add_field("difficult", anno["difficult"])
        return target
Example No. 12
    def _get_target(self, index):
        target = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot())
        objs = target['annotation']['object']
        size = target['annotation']['size']

        bboxs = []
        classes = []
        if not isinstance(objs, list):
            objs = [objs]
        for obj in objs:
            label = obj['name']
            bbox = obj['bndbox']
            xmin, ymin, xmax, ymax = int(bbox['xmin']), int(bbox['ymin']), int(bbox['xmax']), int(
                bbox['ymax'])
            bboxs.append((xmin, ymin, xmax, ymax))
            classes.append(label)

        target_raw = BoxList(bboxs, (int(size['width']), int(size['height'])), mode='xyxy')
        return target_raw, classes
Example No. 13
    def filter_results(self, boxlist, num_classes):
        """Returns bounding-box detection results by thresholding on scores and
        applying non-maximum suppression (NMS).
        """
        # unwrap the boxlist to avoid additional overhead.
        # if we had multi-class NMS, we could perform this directly on the boxlist
        boxes = boxlist.bbox.reshape(-1, num_classes * 4)
        scores = boxlist.get_field("scores").reshape(-1, num_classes)

        device = scores.device
        result = []
        # Apply threshold on detection probabilities and apply NMS
        # Skip j = 0, because it's the background class
        inds_all = scores > self.score_thresh
        for j in range(1, num_classes):
            inds = inds_all[:, j].nonzero().squeeze(1)
            scores_j = scores[inds, j]
            boxes_j = boxes[inds, j * 4:(j + 1) * 4]
            boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy")
            boxlist_for_class.add_field("scores", scores_j)
            boxlist_for_class = boxlist_nms(boxlist_for_class,
                                            self.nms,
                                            score_field="scores")
            num_labels = len(boxlist_for_class)
            boxlist_for_class.add_field(
                "labels",
                torch.full((num_labels, ), j, dtype=torch.int64,
                           device=device))
            result.append(boxlist_for_class)

        result = cat_boxlist(result)
        number_of_detections = len(result)

        # Limit to max_per_image detections **over all classes**
        if number_of_detections > self.detections_per_img > 0:
            cls_scores = result.get_field("scores")
            image_thresh, _ = torch.kthvalue(
                cls_scores.cpu(),
                number_of_detections - self.detections_per_img + 1)
            keep = cls_scores >= image_thresh.item()
            keep = torch.nonzero(keep).squeeze(1)
            result = result[keep]
        return result
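
The per-image cap at the end uses `torch.kthvalue` to find the score of the k-th best detection and keeps everything at or above it. A standalone sketch of that trick:

import torch

scores = torch.tensor([0.9, 0.2, 0.8, 0.4, 0.7])
detections_per_img = 3

if scores.numel() > detections_per_img:
    # The (n - k + 1)-th smallest score equals the k-th largest score.
    image_thresh, _ = torch.kthvalue(scores, scores.numel() - detections_per_img + 1)
    keep = torch.nonzero(scores >= image_thresh.item()).squeeze(1)
    print(scores[keep])    # tensor([0.9000, 0.8000, 0.7000])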
Example No. 14
def evaluate_box_proposals(
    predictions, dataset, thresholds=None, area="all", limit=None
):
    """Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official COCO API recall evaluation code. However,
    it produces slightly different results.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],  # all
        [0 ** 2, 32 ** 2],  # small
        [32 ** 2, 96 ** 2],  # medium
        [96 ** 2, 1e5 ** 2],  # large
        [96 ** 2, 128 ** 2],  # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],
    ]  # 512-inf
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for image_id, prediction in enumerate(predictions):
        original_id = dataset.id_to_img_map[image_id]

        # TODO replace with get_img_info?
        image_width = dataset.coco.imgs[original_id]["width"]
        image_height = dataset.coco.imgs[original_id]["height"]
        prediction = prediction.resize((image_width, image_height))

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = prediction.get_field("objectness").sort(descending=True)[1]
        prediction = prediction[inds]

        ann_ids = dataset.coco.getAnnIds(imgIds=original_id)
        anno = dataset.coco.loadAnns(ann_ids)
        gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert(
            "xyxy"
        )
        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])

        if len(gt_boxes) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        if len(prediction) == 0:
            continue

        if limit is not None and len(prediction) > limit:
            prediction = prediction[:limit]

        overlaps = boxlist_iou(prediction, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(prediction), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)
    gt_overlaps = torch.cat(gt_overlaps, dim=0)
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }
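
The recall computation at the end reduces to counting, for each IoU threshold, how many ground-truth boxes were covered with at least that overlap and dividing by the total number of positives. A toy sketch:

import torch

gt_overlaps = torch.tensor([0.3, 0.55, 0.8, 0.95])   # best IoU recorded per gt box
num_pos = 5                                          # total gt boxes considered

thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05)
recalls = torch.stack([(gt_overlaps >= t).float().sum() / num_pos for t in thresholds])
ar = recalls.mean()                                  # average recall over the thresholds
print(recalls, ar)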
Example No. 15
def calc_detection_voc_prec_rec(gt_boxlists, pred_boxlists, iou_thresh=0.5):
    """Calculate precision and recall based on the evaluation code of PASCAL VOC.
    This function calculates the precision and recall of predicted bounding
    boxes obtained from a dataset which has :math:`N` images.
    The code is based on the evaluation code used in the PASCAL VOC Challenge.
    """
    n_pos = defaultdict(int)
    score = defaultdict(list)
    match = defaultdict(list)
    for gt_boxlist, pred_boxlist in zip(gt_boxlists, pred_boxlists):
        pred_bbox = pred_boxlist.bbox.numpy()
        pred_label = pred_boxlist.get_field("labels").numpy()
        pred_score = pred_boxlist.get_field("scores").numpy()
        gt_bbox = gt_boxlist.bbox.numpy()
        gt_label = gt_boxlist.get_field("labels").numpy()
        gt_difficult = gt_boxlist.get_field("difficult").numpy()

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                match[l].extend((0, ) * pred_bbox_l.shape[0])
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1
            iou = boxlist_iou(
                BoxList(pred_bbox_l, gt_boxlist.size),
                BoxList(gt_bbox_l, gt_boxlist.size),
            ).numpy()
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou

            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        match[l].append(-1)
                    else:
                        if not selec[gt_idx]:
                            match[l].append(1)
                        else:
                            match[l].append(0)
                    selec[gt_idx] = True
                else:
                    match[l].append(0)

    n_fg_class = max(n_pos.keys()) + 1
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class

    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)

        order = score_l.argsort()[::-1]
        match_l = match_l[order]

        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)

        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] is None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]

    return prec, rec
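
Given the per-class match vector built above (1 = true positive, 0 = false positive, -1 = ignored difficult box), precision and recall are cumulative counts over detections sorted by score. A minimal numpy sketch:

import numpy as np

score = np.array([0.9, 0.4, 0.8, 0.6])
match = np.array([1, 0, 0, 1], dtype=np.int8)   # per-detection outcome, any order
n_pos = 3                                       # non-difficult gt boxes of this class

order = score.argsort()[::-1]                   # sort detections by descending score
match_sorted = match[order]

tp = np.cumsum(match_sorted == 1)
fp = np.cumsum(match_sorted == 0)
prec = tp / (fp + tp)                           # precision after each detection
rec = tp / n_pos                                # recall after each detection
print(prec, rec)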
Example No. 16
    def prepare_boxlist(self, boxes, scores, image_shape):
        boxlist = BoxList(boxes, image_shape, mode="xyxy")
        boxlist.add_field("scores", scores)
        return boxlist
Example No. 17
    def _compute_prediction(self):
        '''The main loop of the action prediction worker.

        The main task of this separate process is to compute the action scores.
        However, it behaves differently depending on whether it is in realtime mode.
        In realtime mode, the action scores are computed right after each feature update.
        In video mode, the prediction is not done until compute_prediction() is called explicitly.
        '''

        empty_flag = False

        for i in count():
            if self.stopped:
                print("Avaworker stopped")
                return
            # If all video data has been processed and compute_prediction() has been
            # called, compute the predictions.
            if self.task_done and empty_flag:
                print("The input queue is empty. Start working on prediction")
                for center_timestamp, video_size, ids in tqdm(self.timestamps):
                    predictions = self.ava_predictor.compute_prediction(
                        center_timestamp // self.interval, video_size)
                    self.output_queue.put((predictions, center_timestamp, ids))
                print("Prediction is done.")
                self.output_queue.put("done")
                self._task_done.value = False

            try:
                extra, video_size = self.input_queue.get(timeout=1)
            except queue.Empty:
                continue
            except FileNotFoundError:
                continue

            if extra == "Done":
                empty_flag = True
                continue

            frame, cur_millis, boxes, scores, ids = extra

            self.frame_stack.append(frame)
            self.extra_stack.append((cur_millis, boxes, scores, ids))
            self.frame_stack = self.frame_stack[-self.frame_buffer_numbers:]
            self.extra_stack = self.extra_stack[-self.frame_buffer_numbers:]

            # Predict action once per interval
            if len(
                    self.frame_stack
            ) >= self.frame_buffer_numbers and cur_millis > self.last_milli + self.interval:
                self.last_milli = cur_millis
                frame_arr = np.stack(self.frame_stack)[..., ::-1]
                center_index = self.frame_buffer_numbers // 2
                center_timestamp, person_boxes, person_scores, person_ids = self.extra_stack[
                    center_index]

                if person_boxes is None or len(person_boxes) == 0:
                    continue

                kframe = self.frame_stack[center_index]
                center_timestamp = int(center_timestamp)

                video_data, _, transform_randoms = self.vid_transforms(
                    frame_arr, None)

                kframe_data = self.coco_det.image_preprocess(kframe)
                im_dim_list_k = kframe.shape[1], kframe.shape[0]
                im_dim_list_k = torch.FloatTensor(im_dim_list_k).repeat(1, 2)
                dets = self.coco_det.images_detection(kframe_data,
                                                      im_dim_list_k)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    obj_boxes = torch.zeros((0, 4))
                else:
                    obj_boxes = dets[:, 1:5].cpu()
                obj_boxes = BoxList(obj_boxes, video_size,
                                    "xyxy").clip_to_image()

                person_box = BoxList(person_boxes, video_size,
                                     "xyxy").clip_to_image()

                self.ava_predictor.update_feature(
                    video_data, person_box, obj_boxes,
                    center_timestamp // self.interval, transform_randoms)

                if self.realtime:
                    predictions = self.ava_predictor.compute_prediction(
                        center_timestamp // self.interval, video_size)
                    #print(len(predictions.get_field("scores")), person_ids)
                    self.output_queue.put(
                        (predictions, center_timestamp, person_ids[:, 0]))
                else:
                    # if not realtime, timestamps will be saved and the predictions will be computed later.
                    self.timestamps.append(
                        (center_timestamp, video_size, person_ids[:, 0]))
Example No. 18
    def return_null_box(self, im_w, im_h):
        return BoxList(torch.zeros((0, 4)), (im_w, im_h), mode="xyxy")