Example #1
    def get_ground_truth(self, anchors, bbox_preds, targets):
        # anchors: list[Boxes], one per image, with all feature levels concatenated
        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        N = len(anchors)
        # (N, R, 4) tensor holding every anchor in the batch
        all_anchors = Boxes.cat(anchors).tensor.reshape(N, -1, 4)
        # (N, R, 4) predicted box deltas, concatenated over feature levels
        box_delta = cat(bbox_preds, dim=1)
        # box_pred: xyxy; targets: xyxy
        box_pred = self.box2box_transform.apply_deltas(box_delta, all_anchors)
        indices = self.matcher(box_pred, all_anchors, targets)
        return indices
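A minimal pure-PyTorch sketch of the shape bookkeeping above, with toy tensors standing in for the Boxes objects (names are illustrative):

    import torch

    # two images, three anchors each; each row is a toy stand-in for one anchor box
    anchors_per_image = [torch.rand(3, 4), torch.rand(3, 4)]
    N = len(anchors_per_image)
    all_anchors = torch.cat(anchors_per_image).reshape(N, -1, 4)  # (N, R, 4)
    assert all_anchors.shape == (2, 3, 4)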
Example #2
    def inference_single_image(self,
                               conf_pred_per_image,
                               loc_pred_per_image,
                               default_boxes,
                               image_size):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Args:
            conf_pred_per_image (list[Tensor]): a list with one entry per feature level.
                Each entry is a tensor of size [Hi x Wi x D, C].
            loc_pred_per_image (list[Tensor]): same shape as `conf_pred_per_image` except
                that C becomes 4.
            default_boxes (list[Boxes]): a list of `Boxes` elements. Each element contains
                the default boxes of one image on a specific feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        # predict confidence
        conf_pred = torch.cat(conf_pred_per_image, dim=0)  # [R, C]
        conf_pred = conf_pred.softmax(dim=1)

        # predict boxes
        loc_pred = torch.cat(loc_pred_per_image, dim=0)  # [R, 4]
        default_boxes = Boxes.cat(default_boxes)  # [R, 4]
        boxes_pred = self.box2box_transform.apply_deltas(
            loc_pred, default_boxes.tensor)

        num_boxes, num_classes = conf_pred.shape
        boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(
            num_boxes, num_classes, 4)  # [R, C, 4]
        labels = torch.arange(num_classes, device=self.device)  # [0, ..., C-1]
        labels = labels.view(1, num_classes).expand_as(conf_pred)  # [R, C]

        # remove predictions with the background label
        boxes_pred = boxes_pred[:, :-1]
        conf_pred = conf_pred[:, :-1]
        labels = labels[:, :-1]

        # batch everything, by making every class prediction be a separate instance
        boxes_pred = boxes_pred.reshape(-1, 4)
        conf_pred = conf_pred.reshape(-1)
        labels = labels.reshape(-1)

        # remove low scoring boxes
        indices = torch.nonzero(conf_pred > self.score_threshold, as_tuple=False).squeeze(1)
        boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[indices], labels[indices]

        keep = generalized_batched_nms(boxes_pred, conf_pred, labels,
                                       self.nms_threshold, nms_type=self.nms_type)

        keep = keep[:self.max_detections_per_image]
        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_pred[keep])
        result.scores = conf_pred[keep]
        result.pred_classes = labels[keep]
        return result
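The flatten-then-filter pattern above can be reproduced with plain torch plus torchvision's batched_nms standing in for cvpods' generalized_batched_nms; a rough sketch with made-up threshold values:

    import torch
    from torchvision.ops import batched_nms

    def filter_and_nms(boxes, scores, score_thresh=0.05, nms_thresh=0.5, topk=100):
        # boxes: (R, C, 4) per-class boxes; scores: (R, C), background column already dropped
        R, C = scores.shape
        labels = torch.arange(C).view(1, C).expand(R, C)
        boxes, scores, labels = boxes.reshape(-1, 4), scores.reshape(-1), labels.reshape(-1)
        keep = scores > score_thresh                    # remove low-scoring boxes
        boxes, scores, labels = boxes[keep], scores[keep], labels[keep]
        keep = batched_nms(boxes, scores, labels, nms_thresh)[:topk]  # class-aware NMS
        return boxes[keep], scores[keep], labels[keep]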
Example #3
    def get_ground_truth(self, anchors, targets):
        """
        Args:
            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
                list of #feature level Boxes. The Boxes contains anchors of
                this image on the specific feature level.
            targets (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.  Specify `targets` during training only.

        Returns:
            gt_classes (Tensor):
                An integer tensor of shape (N, R) storing ground-truth
                labels for each anchor.
                R is the total number of anchors, i.e. the sum of Hi x Wi x A for all levels.
                Anchors with an IoU with some target higher than the foreground threshold
                are assigned their corresponding label in the [0, K-1] range.
                Anchors whose IoU is below the background threshold are assigned
                the label "K". Anchors whose IoU falls between the foreground and background
                thresholds are assigned the label "-1", i.e. ignored.
            gt_anchors_deltas (Tensor):
                Shape (N, R, 4).
                The last dimension represents ground-truth box2box transform
                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
                The values in the tensor are meaningful only when the corresponding
                anchor is labeled as foreground.
        """
        gt_classes = []
        gt_anchors_deltas = []
        # anchors: list[Boxes], one per image, with all feature levels concatenated
        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]

        for anchors_per_image, targets_per_image in zip(anchors, targets):
            match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes,
                                                anchors_per_image)
            gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)

            has_gt = len(targets_per_image) > 0
            if has_gt:
                # ground truth box regression
                matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
                gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
                    anchors_per_image.tensor, matched_gt_boxes.tensor
                )

                gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
                # Anchors with label 0 are treated as background.
                gt_classes_i[anchor_labels == 0] = self.num_classes
                # Anchors with label -1 are ignored.
                gt_classes_i[anchor_labels == -1] = -1
            else:
                gt_classes_i = torch.zeros_like(
                    gt_matched_idxs) + self.num_classes
                gt_anchors_reg_deltas_i = torch.zeros_like(
                    anchors_per_image.tensor)

            gt_classes.append(gt_classes_i)
            gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

        return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
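To make the label convention concrete, here is a tiny, self-contained illustration of turning gt_classes (foreground in [0, K-1], background = K, ignore = -1) into one-hot targets for a K-way sigmoid classifier; this mirrors the usual focal-loss target construction and is only a sketch:

    import torch

    K = 3                                        # number of foreground classes
    gt_classes = torch.tensor([0, 2, K, -1])     # fg, fg, background, ignored
    valid = gt_classes >= 0                      # ignored anchors are dropped from the loss
    fg = valid & (gt_classes != K)               # foreground anchors only
    targets = torch.zeros(len(gt_classes), K)
    targets[fg, gt_classes[fg]] = 1              # one-hot rows; background rows stay all zero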
Example #4
    def _get_ground_truth(self):
        """
        Returns:
            gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
                total number of anchors in image i (i.e., len(anchors[i])). Label values are
                in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
            gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4).
        """
        gt_objectness_logits = []
        gt_anchor_deltas = []
        # Concatenate anchors from all feature maps into a single Boxes per image
        anchors = [Boxes.cat(anchors_i) for anchors_i in self.anchors]
        for image_size_i, anchors_i, gt_boxes_i in zip(self.image_sizes,
                                                       anchors, self.gt_boxes):
            """
            image_size_i: (h, w) for the i-th image
            anchors_i: anchors for i-th image
            gt_boxes_i: ground-truth boxes for i-th image
            """
            match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i,
                                                                   anchors_i)
            matched_idxs, gt_objectness_logits_i = retry_if_cuda_oom(
                self.anchor_matcher)(match_quality_matrix)
            # Matching is memory-expensive and may result in CPU tensors. But the result is small
            gt_objectness_logits_i = gt_objectness_logits_i.to(
                device=gt_boxes_i.device)
            del match_quality_matrix

            if self.boundary_threshold >= 0:
                # Discard anchors that go out of the boundaries of the image
                # NOTE: This is legacy functionality that is turned off by default in cvpods
                anchors_inside_image = anchors_i.inside_box(
                    image_size_i, self.boundary_threshold)
                gt_objectness_logits_i[~anchors_inside_image] = -1

            if len(gt_boxes_i) == 0:
                # These values won't be used anyway since the anchor is labeled as background
                gt_anchor_deltas_i = torch.zeros_like(anchors_i.tensor)
            else:
                # TODO wasted computation for ignored boxes
                matched_gt_boxes = gt_boxes_i[matched_idxs]
                gt_anchor_deltas_i = self.box2box_transform.get_deltas(
                    anchors_i.tensor, matched_gt_boxes.tensor)

            gt_objectness_logits.append(gt_objectness_logits_i)
            gt_anchor_deltas.append(gt_anchor_deltas_i)

        return gt_objectness_logits, gt_anchor_deltas
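The {-1, 0, 1} objectness labels come out of the IoU-threshold matcher; a toy sketch of that thresholding, with torchvision's box_iou standing in for pairwise_iou and made-up thresholds:

    import torch
    from torchvision.ops import box_iou

    gt = torch.tensor([[0., 0., 10., 10.]])
    anchors = torch.tensor([[0., 0., 9., 9.], [20., 20., 30., 30.], [0., 0., 5., 12.]])
    iou, _ = box_iou(gt, anchors).max(dim=0)     # best IoU per anchor
    labels = torch.full_like(iou, -1)            # ignore by default
    labels[iou < 0.3] = 0                        # negative (background)
    labels[iou >= 0.7] = 1                       # positive (foreground)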
Example #5
def point_sample_fine_grained_features(features_list, feature_scales, boxes,
                                       point_coords):
    """
    Get features from feature maps in `features_list` that correspond to specific point coordinates
        inside each bounding box from `boxes`.

    Args:
        features_list (list[Tensor]): A list of feature map tensors to get features from.
        feature_scales (list[float]): A list of scales for tensors in `features_list`.
        boxes (list[Boxes]): A list of I Boxes objects that together contain
            R_1 + ... + R_I = R boxes.
        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.

    Returns:
        point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled
            from all feature maps in `features_list` for the P sampled points of all R boxes in `boxes`.
        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level
            coordinates of P points.
    """
    cat_boxes = Boxes.cat(boxes)
    num_boxes = [len(b) for b in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(
        cat_boxes.tensor, point_coords)
    split_point_coords_wrt_image = torch.split(point_coords_wrt_image,
                                               num_boxes)

    point_features = []
    for idx_img, point_coords_wrt_image_per_image in enumerate(
            split_point_coords_wrt_image):
        point_features_per_image = []
        for idx_feature, feature_map in enumerate(features_list):
            h, w = feature_map.shape[-2:]
            scale = torch.tensor(
                [w, h],
                device=feature_map.device) / feature_scales[idx_feature]
            point_coords_scaled = point_coords_wrt_image_per_image / scale
            point_features_per_image.append(
                point_sample(
                    feature_map[idx_img].unsqueeze(0),
                    point_coords_scaled.unsqueeze(0),
                    align_corners=False,
                ).squeeze(0).transpose(1, 0))
        point_features.append(cat(point_features_per_image, dim=1))

    return cat(point_features, dim=0), point_coords_wrt_image
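point_sample is assumed here to be a thin wrapper around F.grid_sample that shifts [0, 1] point coordinates into grid_sample's [-1, 1] convention; a minimal sketch of that assumption (not the actual cvpods helper):

    import torch
    import torch.nn.functional as F

    def point_sample_sketch(feature_map, point_coords):
        # feature_map: (1, C, H, W); point_coords: (1, P, 2) in [0, 1] x [0, 1]
        grid = 2.0 * point_coords.unsqueeze(2) - 1.0                  # (1, P, 1, 2) in [-1, 1]
        out = F.grid_sample(feature_map, grid, align_corners=False)   # (1, C, P, 1)
        return out.squeeze(3)                                         # (1, C, P)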
Example #6
    def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes,
                  images):
        """
        Arguments:
            pred_logits, pred_deltas, pred_masks: Same as the output of
                :meth:`TensorMaskHead.forward`
            anchors, indexes: Same as the input of :meth:`TensorMask.get_ground_truth`
            images (ImageList): the input images
        Returns:
            results (List[Instances]): a list of #images elements.
        """
        assert len(anchors) == len(images)
        results = []

        pred_logits = [
            permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits
        ]
        pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]

        pred_logits = cat(pred_logits, dim=1)
        pred_deltas = cat(pred_deltas, dim=1)

        for img_idx, (anchors_im,
                      indexes_im) in enumerate(zip(anchors, indexes)):
            # Get the size of the current image
            image_size = images.image_sizes[img_idx]

            logits_im = pred_logits[img_idx]
            deltas_im = pred_deltas[img_idx]

            if self.mask_on:
                masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
            else:
                masks_im = [None] * self.num_levels
            results_im = self.inference_single_image(
                logits_im,
                deltas_im,
                masks_im,
                Boxes.cat(anchors_im),
                cat(indexes_im),
                tuple(image_size),
            )
            results.append(results_im)
        return results
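permute_to_N_HWA_K is assumed to reshape a head output of shape (N, A*K, H, W) into (N, H*W*A, K) so that per-level predictions can be concatenated along dim=1; a sketch consistent with that assumption:

    import torch

    def permute_to_N_HWA_K_sketch(tensor, K):
        # (N, A*K, H, W) -> (N, H*W*A, K)
        N, _, H, W = tensor.shape
        tensor = tensor.view(N, -1, K, H, W)       # (N, A, K, H, W)
        tensor = tensor.permute(0, 3, 4, 1, 2)     # (N, H, W, A, K)
        return tensor.reshape(N, -1, K)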
Example #7
    def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
        """
        Args:
            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
                list of #feature level Boxes. The Boxes contains anchors of
                this image on the specific feature level.
            unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a
                list of #feature level Tensor. The tensor contains unit lengths for anchors of
                this image on the specific feature level.
            indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
                list of #feature level Tensor. The tensor contains the 5D index of
                each anchor; the second dimension means (L, N, H, W, A), where L
                is level, N is image, H is height, W is width, and A is anchor.
            targets (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.  Specify `targets` during training only.
        Returns:
            gt_class_info (Tensor, Tensor): A pair of two tensors for classification.
                The first one is an integer tensor of shape (R, #classes) storing ground-truth
                labels for each anchor. R is the total number of anchors in the batch.
                The second one is an integer tensor of shape (R,), indicating which
                anchors are valid for loss computation and which are not.
            gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
                The first one has shape (F, 4), where F = #foreground anchors.
                The last dimension represents ground-truth box2box transform
                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
                Only foreground anchors have values in this tensor. Could be `None` if F=0.
                The second one, of shape (R,), is an integer tensor indicating which anchors
                are foreground ones used for box regression. Could be `None` if F=0.
            gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two lists for masks.
                The first one is a list of P=#feature level elements. Each is a
                list of A=#anchor tensors. Each tensor contains the ground truth
                masks of the same size and for the same feature level. Could be `None`.
                The second one is a list of P=#feature level elements. Each is a
                list of A=#anchor tensors. Each tensor contains the location of the ground truth
                masks of the same size and for the same feature level. The second dimension means
                (N, H, W), where N is image, H is height, and W is width. Could be `None`.
            num_fg (int): F=#foreground anchors, used later for loss normalization.
        """
        gt_classes = []
        gt_deltas = []
        gt_masks = [[[] for _ in range(self.num_anchors)]
                    for _ in range(self.num_levels)]
        gt_mask_inds = [[[] for _ in range(self.num_anchors)]
                        for _ in range(self.num_levels)]

        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
        indexes = [cat(indexes_i) for indexes_i in indexes]

        num_fg = 0
        for i, (anchors_im, unit_lengths_im, indexes_im,
                targets_im) in enumerate(
                    zip(anchors, unit_lengths, indexes, targets)):
            # Initialize all
            gt_classes_i = torch.full_like(unit_lengths_im,
                                           self.num_classes,
                                           dtype=torch.int64,
                                           device=self.device)
            # Ground truth classes
            has_gt = len(targets_im) > 0
            if has_gt:
                # Compute the pairwise matrix
                gt_matched_inds, anchor_labels = _assignment_rule(
                    targets_im.gt_boxes,
                    anchors_im,
                    unit_lengths_im,
                    self.min_anchor_size,
                )
                # Find the foreground instances
                fg_inds = anchor_labels == 1
                fg_anchors = anchors_im[fg_inds]
                num_fg += len(fg_anchors)
                # Find the ground truths for foreground instances
                gt_fg_matched_inds = gt_matched_inds[fg_inds]
                # Assign labels for foreground instances
                gt_classes_i[fg_inds] = targets_im.gt_classes[
                    gt_fg_matched_inds]
                # Anchors with label -1 are ignored, others are left as negative
                gt_classes_i[anchor_labels == -1] = -1

                # Boxes
                # Ground truth box regression, only for foregrounds
                matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
                # Compute box regression offsets for foregrounds only
                gt_deltas_i = self.box2box_transform.get_deltas(
                    fg_anchors.tensor, matched_gt_boxes.tensor)
                gt_deltas.append(gt_deltas_i)

                # Masks
                if self.mask_on:
                    # Compute masks for each level and each anchor
                    matched_indexes = indexes_im[fg_inds, :]
                    for lvl in range(self.num_levels):
                        ids_lvl = matched_indexes[:, 0] == lvl
                        if torch.any(ids_lvl):
                            cur_level_factor = 2**lvl if self.bipyramid_on else 1
                            for anc in range(self.num_anchors):
                                ids_lvl_anchor = ids_lvl & (
                                    matched_indexes[:, 4] == anc)
                                if torch.any(ids_lvl_anchor):
                                    gt_masks[lvl][anc].append(
                                        targets_im[
                                            gt_fg_matched_inds[ids_lvl_anchor]]
                                        .gt_masks.crop_and_resize(
                                            fg_anchors[ids_lvl_anchor].tensor,
                                            self.mask_sizes[anc] *
                                            cur_level_factor,
                                        ))
                                    # Select (N, H, W) dimensions
                                    gt_mask_inds_lvl_anc = matched_indexes[
                                        ids_lvl_anchor, 1:4]
                                    # Set the image index to the current image
                                    gt_mask_inds_lvl_anc[:, 0] = i
                                    gt_mask_inds[lvl][anc].append(
                                        gt_mask_inds_lvl_anc)
            gt_classes.append(gt_classes_i)

        # Classes and boxes
        gt_classes = cat(gt_classes)
        gt_valid_inds = gt_classes >= 0
        gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
        gt_classes_target = torch.zeros(
            (gt_classes.shape[0], self.num_classes),
            dtype=torch.float32,
            device=self.device,
        )
        gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
        gt_deltas = cat(gt_deltas) if gt_deltas else None

        # Masks
        gt_masks = [[cat(mla) if mla else None for mla in ml]
                    for ml in gt_masks]
        gt_mask_inds = [[cat(ila) if ila else None for ila in il]
                        for il in gt_mask_inds]
        return (
            (gt_classes_target, gt_valid_inds),
            (gt_deltas, gt_fg_inds),
            (gt_masks, gt_mask_inds),
            num_fg,
        )
Example #8
    def losses(self, indices, gt_instances, anchors, pred_class_logits,
               pred_anchor_deltas):
        pred_class_logits = cat(pred_class_logits,
                                dim=1).view(-1, self.num_classes)
        pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

        # anchors: list[Boxes], one per image, with all feature levels concatenated
        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        N = len(anchors)
        # (N*R, 4) tensor holding every anchor in the batch
        all_anchors = Boxes.cat(anchors).tensor
        predicted_boxes = self.box2box_transform.apply_deltas(
            pred_anchor_deltas, all_anchors)
        predicted_boxes = predicted_boxes.reshape(N, -1, 4)

        ious = []
        pos_ious = []
        for i in range(N):
            src_idx, tgt_idx = indices[i]
            iou, _ = box_iou(predicted_boxes[i, ...],
                             gt_instances[i].gt_boxes.tensor)
            if iou.numel() == 0:
                max_iou = iou.new_full((iou.size(0), ), 0)
            else:
                max_iou = iou.max(dim=1)[0]
            a_iou, _ = box_iou(anchors[i].tensor,
                               gt_instances[i].gt_boxes.tensor)
            if a_iou.numel() == 0:
                pos_iou = a_iou.new_full((0, ), 0)
            else:
                pos_iou = a_iou[src_idx, tgt_idx]
            ious.append(max_iou)
            pos_ious.append(pos_iou)
        ious = torch.cat(ious)
        ignore_idx = ious > self.neg_ignore_thresh
        pos_ious = torch.cat(pos_ious)
        pos_ignore_idx = pos_ious < self.pos_ignore_thresh

        src_idx = torch.cat([
            src + idx * anchors[0].tensor.shape[0]
            for idx, (src, _) in enumerate(indices)
        ])
        gt_classes = torch.full(pred_class_logits.shape[:1],
                                self.num_classes,
                                dtype=torch.int64,
                                device=pred_class_logits.device)
        gt_classes[ignore_idx] = -1
        target_classes_o = torch.cat(
            [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)])
        target_classes_o[pos_ignore_idx] = -1
        gt_classes[src_idx] = target_classes_o

        valid_idxs = gt_classes >= 0
        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
        num_foreground = foreground_idxs.sum()

        gt_classes_target = torch.zeros_like(pred_class_logits)
        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

        if comm.get_world_size() > 1:
            dist.all_reduce(num_foreground)
        num_foreground = num_foreground * 1.0 / comm.get_world_size()

        # cls loss
        loss_cls = sigmoid_focal_loss_jit(
            pred_class_logits[valid_idxs],
            gt_classes_target[valid_idxs],
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )
        # reg loss
        target_boxes = torch.cat(
            [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)],
            dim=0)
        target_boxes = target_boxes[~pos_ignore_idx]
        matched_predicted_boxes = predicted_boxes.reshape(
            -1, 4)[src_idx[~pos_ignore_idx]]
        loss_box_reg = (1 - torch.diag(
            generalized_box_iou(matched_predicted_boxes, target_boxes))).sum()

        return {
            "loss_cls": loss_cls / max(1, num_foreground),
            "loss_box_reg": loss_box_reg / max(1, num_foreground),
        }
Example #9
    def losses(self, anchors, gt_instances, box_cls, box_delta):
        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]

        box_cls_flattened = [
            permute_to_N_HWA_K(x, self.num_classes) for x in box_cls
        ]
        box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
        pred_class_logits = cat(box_cls_flattened, dim=1)
        pred_anchor_deltas = cat(box_delta_flattened, dim=1)

        pred_class_probs = pred_class_logits.sigmoid()
        pred_box_probs = []
        num_foreground = 0
        positive_losses = []
        for anchors_per_image, \
            gt_instances_per_image, \
            pred_class_probs_per_image, \
            pred_anchor_deltas_per_image in zip(
                anchors, gt_instances, pred_class_probs, pred_anchor_deltas):
            gt_classes_per_image = gt_instances_per_image.gt_classes

            with torch.no_grad():
                # predicted_boxes_per_image: a_{j}^{loc}, shape: [j, 4]
                predicted_boxes_per_image = self.box2box_transform.apply_deltas(
                    pred_anchor_deltas_per_image, anchors_per_image.tensor)
                # gt_pred_iou: IoU_{ij}^{loc}, shape: [i, j]
                gt_pred_iou = pairwise_iou(gt_instances_per_image.gt_boxes,
                                           Boxes(predicted_boxes_per_image))

                t1 = self.bbox_threshold
                t2 = gt_pred_iou.max(dim=1, keepdim=True).values.clamp_(
                    min=t1 + torch.finfo(torch.float32).eps)
                # gt_pred_prob: P{a_{j} -> b_{i}}, shape: [i, j]
                gt_pred_prob = ((gt_pred_iou - t1) / (t2 - t1)).clamp_(min=0,
                                                                       max=1)

                # pred_box_prob_per_image: P{a_{j} \in A_{+}}, shape: [j, c]
                nonzero_idxs = torch.nonzero(gt_pred_prob, as_tuple=True)
                pred_box_prob_per_image = torch.zeros_like(
                    pred_class_probs_per_image)
                pred_box_prob_per_image[nonzero_idxs[1], gt_classes_per_image[nonzero_idxs[0]]] \
                    = gt_pred_prob[nonzero_idxs]
                pred_box_probs.append(pred_box_prob_per_image)

            # construct bags for objects
            match_quality_matrix = pairwise_iou(
                gt_instances_per_image.gt_boxes, anchors_per_image)
            _, foreground_idxs = torch.topk(match_quality_matrix,
                                            self.pos_anchor_topk,
                                            dim=1,
                                            sorted=False)

            # matched_pred_class_probs_per_image: P_{ij}^{cls}
            matched_pred_class_probs_per_image = torch.gather(
                pred_class_probs_per_image[foreground_idxs], 2,
                gt_classes_per_image.view(-1, 1,
                                          1).repeat(1, self.pos_anchor_topk,
                                                    1)).squeeze(2)

            # matched_gt_anchor_deltas_per_image: P_{ij}^{loc}
            matched_gt_anchor_deltas_per_image = self.box2box_transform.get_deltas(
                anchors_per_image.tensor[foreground_idxs],
                gt_instances_per_image.gt_boxes.tensor.unsqueeze(1))
            loss_box_reg = smooth_l1_loss(
                pred_anchor_deltas_per_image[foreground_idxs],
                matched_gt_anchor_deltas_per_image,
                beta=self.smooth_l1_loss_beta,
                reduction="none").sum(dim=-1) * self.reg_weight
            matched_pred_reg_probs_per_image = (-loss_box_reg).exp()

            # positive_losses: { -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) }
            num_foreground += len(gt_instances_per_image)
            positive_losses.append(
                positive_bag_loss(matched_pred_class_probs_per_image *
                                  matched_pred_reg_probs_per_image,
                                  dim=1))

        # positive_loss: \sum_{i}{ -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } / ||B||
        positive_loss = torch.cat(positive_losses).sum() / max(
            1, num_foreground)

        # pred_box_probs: P{a_{j} \in A_{+}}
        pred_box_probs = torch.stack(pred_box_probs, dim=0)
        # negative_loss: \sum_{j}{ FL( (1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}) ) } / n||B||
        negative_loss = negative_bag_loss(
            pred_class_probs *
            (1 - pred_box_probs), self.focal_loss_gamma).sum() / max(
                1, num_foreground * self.pos_anchor_topk)

        loss_pos = positive_loss * self.focal_loss_alpha
        loss_neg = negative_loss * (1 - self.focal_loss_alpha)

        return {"loss_pos": loss_pos, "loss_neg": loss_neg}
Example #10
    def get_ground_truth(self, default_boxes, targets):
        """
        Args:
            default_boxes (list[Boxes]): a list of 'Boxes' elements.
                The Boxes contains default boxes of one image on the specific feature level.
            targets (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.  Specify `targets` during training only.

        Returns:
            gt_conf (Tensor):
                An integer tensor of shape [N, R] storing ground-truth labels for each default box.
                R is the total number of default boxes, i.e. the sum of Hi x Wi x D for all levels.

                * Default boxes with an IoU with some target higher than the foreground threshold
                  are assigned their corresponding label in the [0, C-1] range.
                * Default boxes whose IoU is below the background threshold are assigned
                  the label "C".
                * Default boxes whose IoU falls between the foreground and background
                  thresholds are assigned the label "-1", i.e. ignored.

            gt_default_boxes_deltas (Tensor): Shape [N, R, 4].
                The last dimension represents ground-truth box2box transform targets
                (g^cx, g^cy, g^w, g^h) that map each default box to its matched ground-truth box.
                The values in the tensor are meaningful only when the corresponding default box
                is labeled as foreground.
        """
        gt_conf = list()
        gt_default_boxes_deltas = list()
        # a single Boxes holding the R default boxes of all feature levels
        default_boxes_per_image = Boxes.cat(default_boxes)

        # each Instances (for one image)
        for targets_per_image in targets:
            match_quality_matrix = pairwise_iou(
                targets_per_image.gt_boxes, default_boxes_per_image)  # M * N
            gt_matched_idxs, default_box_labels = self.matcher(
                match_quality_matrix)

            has_gt = len(targets_per_image) > 0
            if has_gt:
                # ground truth box regression
                matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]

                # meaningful only when the corresponding default box is labeled as foreground.
                gt_default_boxes_deltas_i = self.box2box_transform.get_deltas(
                    default_boxes_per_image.tensor, matched_gt_boxes.tensor)

                gt_conf_i = targets_per_image.gt_classes[gt_matched_idxs]
                # Default boxes with label 0 are treated as background.
                gt_conf_i[default_box_labels == 0] = self.num_classes
                # Default boxes with label -1 are ignored.
                gt_conf_i[default_box_labels == -1] = -1
            else:
                gt_conf_i = torch.zeros_like(
                    gt_matched_idxs) + self.num_classes
                gt_default_boxes_deltas_i = torch.zeros_like(
                    default_boxes_per_image.tensor)

            gt_conf.append(gt_conf_i)
            gt_default_boxes_deltas.append(gt_default_boxes_deltas_i)

        return torch.stack(gt_conf), torch.stack(gt_default_boxes_deltas)
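The (g^cx, g^cy, g^w, g^h) encoding referred to above is the standard SSD/Faster R-CNN box-to-box transform; a sketch assuming xyxy boxes and no per-coordinate weights (the real box2box_transform may scale each component):

    import torch

    def get_deltas_sketch(src_boxes, target_boxes):
        # both (R, 4) in xyxy; returns deltas relative to the source (default) boxes
        src_w = src_boxes[:, 2] - src_boxes[:, 0]
        src_h = src_boxes[:, 3] - src_boxes[:, 1]
        src_cx = src_boxes[:, 0] + 0.5 * src_w
        src_cy = src_boxes[:, 1] + 0.5 * src_h
        tgt_w = target_boxes[:, 2] - target_boxes[:, 0]
        tgt_h = target_boxes[:, 3] - target_boxes[:, 1]
        tgt_cx = target_boxes[:, 0] + 0.5 * tgt_w
        tgt_cy = target_boxes[:, 1] + 0.5 * tgt_h
        dx = (tgt_cx - src_cx) / src_w
        dy = (tgt_cy - src_cy) / src_h
        dw = torch.log(tgt_w / src_w)
        dh = torch.log(tgt_h / src_h)
        return torch.stack((dx, dy, dw, dh), dim=1)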
Example #11
    def get_ground_truth(self, anchors, targets):
        """
        Args:
            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
                list of #feature level Boxes. The Boxes contains anchors of
                this image on the specific feature level.
            targets (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.  Specify `targets` during training only.

        Returns:
            gt_classes (Tensor):
                An integer tensor of shape (N, R) storing ground-truth
                labels for each anchor.
                R is the total number of anchors, i.e. the sum of Hi x Wi for all levels.
                Anchors with an IoU with some target higher than the foreground threshold
                are assigned their corresponding label in the [0, K-1] range.
                Anchors whose IoU is below the background threshold are assigned
                the label "K". Anchors whose IoU falls between the foreground and background
                thresholds are assigned the label "-1", i.e. ignored.
            gt_anchors_deltas (Tensor):
                Shape (N, R, 4).
                The last dimension represents ground-truth box2box transform
                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
                The values in the tensor are meaningful only when the corresponding
                anchor is labeled as foreground.
        """
        gt_classes = []
        gt_anchors_deltas = []

        num_fg = 0
        num_gt = 0

        for anchors_per_image, targets_per_image in zip(anchors, targets):
            anchors_per_image = Boxes.cat(anchors_per_image)

            gt_boxes = targets_per_image.gt_boxes

            match_quality_matrix = pairwise_iou(gt_boxes, anchors_per_image)

            _, is_positive = match_quality_matrix.topk(self.iou_topk, dim=1)
            is_foreground = torch.zeros_like(match_quality_matrix,
                                             dtype=torch.bool).scatter_(
                                                 1, is_positive, True)

            match_quality_matrix[~is_foreground] = -1

            # if there are still more than one objects for a position,
            # we choose the one with maximum quality
            anchor_labels, gt_matched_idxs = match_quality_matrix.max(dim=0)

            num_fg += (anchor_labels != -1).sum().item()
            num_gt += len(targets_per_image)

            # ground truth box regression
            gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
                anchors_per_image.tensor, gt_boxes[gt_matched_idxs].tensor)

            # ground truth classes
            has_gt = len(targets_per_image) > 0
            if has_gt:
                gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
                # Anchors with label -1 are treated as background.
                gt_classes_i[anchor_labels == -1] = self.num_classes
            else:
                gt_classes_i = torch.zeros_like(
                    gt_matched_idxs) + self.num_classes

            gt_classes.append(gt_classes_i)
            gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

        get_event_storage().put_scalar("num_fg_per_gt", num_fg / num_gt)

        return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
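The top-k assignment above keeps, for every ground-truth box, the iou_topk best anchors and then resolves anchors claimed by several boxes via the maximum IoU; a toy illustration with a hand-written IoU matrix:

    import torch

    iou = torch.tensor([[0.6, 0.1, 0.3],     # gt 0 vs anchors 0..2
                        [0.2, 0.7, 0.5]])    # gt 1 vs anchors 0..2
    _, is_positive = iou.topk(2, dim=1)
    is_foreground = torch.zeros_like(iou, dtype=torch.bool).scatter_(1, is_positive, True)
    iou[~is_foreground] = -1
    anchor_labels, gt_matched_idxs = iou.max(dim=0)   # per anchor: best surviving gt
    # anchor_labels == -1 marks anchors that are in no ground truth's top-k, i.e. background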
Example #12
    def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits,
               pred_anchor_deltas, anchors):
        """
        Args:
            For `gt_classes` and `gt_anchors_deltas` parameters, see
                :meth:`FCOS.get_ground_truth`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi)
            For `pred_class_logits` and `pred_anchor_deltas`, see
                :meth:`FCOSHead.forward`.

        Returns:
            dict[str, Tensor]:
                mapping from a named loss to a scalar tensor
                storing the loss. Used during training only. The dict keys are:
                "loss_cls" and "loss_box_reg"
        """
        pred_class_logits, pred_anchor_deltas = \
            permute_all_cls_and_box_to_N_HWA_K_and_concat(
                pred_class_logits, pred_anchor_deltas, self.num_classes
            )  # Shapes: (N x R, K) and (N x R, 4), respectively.

        gt_classes = gt_classes.flatten()
        gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)

        valid_idxs = gt_classes >= 0
        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
        num_foreground = foreground_idxs.sum()

        gt_classes_target = torch.zeros_like(pred_class_logits)
        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

        num_foreground = comm.all_reduce(num_foreground) / float(
            comm.get_world_size())

        # logits loss
        loss_cls = sigmoid_focal_loss_jit(
            pred_class_logits[valid_idxs],
            gt_classes_target[valid_idxs],
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        ) / max(1.0, num_foreground)

        anchors = Boxes.cat([Boxes.cat(anchors_i) for anchors_i in anchors])
        pred_anchor_deltas = self.box2box_transform.apply_deltas(
            pred_anchor_deltas, anchors.tensor)
        gt_anchors_deltas = self.box2box_transform.apply_deltas(
            gt_anchors_deltas, anchors.tensor)

        # regression loss
        loss_box_reg = iou_loss(
            pred_anchor_deltas[foreground_idxs],
            gt_anchors_deltas[foreground_idxs],
            box_mode="xyxy",
            loss_type=self.iou_loss_type,
            reduction="sum",
        ) / max(1.0, num_foreground) * self.reg_weight

        return {
            "loss_cls": loss_cls,
            "loss_box_reg": loss_box_reg,
        }
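With loss_type="iou", the regression term above reduces, roughly, to 1 - IoU over matched box pairs (both sides already decoded into xyxy boxes by apply_deltas); a minimal sketch with torchvision's box_iou standing in for the cvpods iou_loss:

    import torch
    from torchvision.ops import box_iou

    def iou_loss_sketch(pred_boxes, gt_boxes):
        # both (F, 4) in xyxy and already paired row-by-row; "sum" reduction of 1 - IoU
        ious = torch.diag(box_iou(pred_boxes, gt_boxes))
        return (1.0 - ious).sum()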
Example #13
    def test_empty_cat(self):
        # concatenating an empty list should yield an empty (0, 4) Boxes
        x = Boxes.cat([])
        self.assertEqual(tuple(x.tensor.shape), (0, 4))