Example #1
def show_mined_patches(image_id, class_ids, dataloader, hardnegdata):
    # show mined patches
    image_to_show = get_image_from_dataloader(image_id, dataloader)
    # collect data
    boxes_one_image, labels_one_image, scores_one_image, anchor_boxes_one_image, transform_corners_one_image = [], [], [], [], []
    for a in hardnegdata:
        boxes_one_image.append(a["crop_position_xyxy"])
        anchor_boxes_one_image.append(a["anchor_position_xyxy"])
        scores_one_image.append(a["score"])
        labels_one_image.append(a["label_local"] *
                                (-1 if a["role"] == "neg" else 1))
        transform_corners_one_image.append(a["transform_corners"])
    scores_one_image = torch.tensor(scores_one_image, dtype=torch.float)
    boxes_one_image = cat_boxlist(boxes_one_image)
    anchor_boxes_one_image = cat_boxlist(anchor_boxes_one_image)
    labels_one_image = torch.tensor(labels_one_image, dtype=torch.long)
    transform_corners_one_image = torch.stack(transform_corners_one_image,
                                              dim=0)
    # show
    show_annotated_image(
        img=image_to_show,
        boxes=boxes_one_image,
        labels=labels_one_image,
        scores=scores_one_image,
        default_boxes=anchor_boxes_one_image,
        transform_corners=transform_corners_one_image,
        class_ids=class_ids,
        score_threshold=cfg.visualization.mining.score_threshold,
        max_dets=cfg.visualization.mining.max_detections,
        showfig=True)
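
# A minimal sketch (hypothetical helper, not part of the codebase) of how the
# sign convention used for labels_one_image above can be decoded back:
# a negative label marks a patch mined with the "neg" role.
def decode_signed_labels(labels):
    # labels: 1D torch.LongTensor with the role encoded in the sign
    roles = ["neg" if label < 0 else "pos" for label in labels.tolist()]
    return labels.abs(), roles

# e.g. decode_signed_labels(torch.tensor([3, -3, 7]))
# -> (tensor([3, 3, 7]), ['pos', 'neg', 'pos'])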
def evaluate(dataloader,
             detector,
             cfg_maskrcnn,
             retrievalnet,
             opt,
             cfg_eval,
             cfg_visualization,
             is_cuda=False,
             logger_prefix="detector-retrieval"):
    logger = logging.getLogger(f"{logger_prefix}.evaluate")
    dataset_name = dataloader.get_name()
    dataset_scale = dataloader.get_eval_scale()
    logger.info("Starting to eval on {0}, scale {1}".format(
        dataset_name, dataset_scale))

    t_start_eval = time.time()
    detector.eval()
    retrievalnet.eval()

    ## setup retrievalnet
    # setting up the multi-scale parameters
    ms = [1]
    msp = 1
    if opt.retrieval_multiscale:
        ms = [1, 1. / math.sqrt(2), 1. / 2]
        if retrievalnet.meta["pooling"] == "gem" and retrievalnet.whiten is None:
            msp = retrievalnet.pool.p.data.tolist()[0]
    # setup whitening
    if opt.retrieval_whitening_path is not None:
        logger.info("Whitening is precomputed, loading it from {0}".format(
            opt.retrieval_whitening_path))
        whitening_data = torch.load(opt.retrieval_whitening_path)

        if (opt.retrieval_multiscale and "ms" in whitening_data) or \
           (not opt.retrieval_multiscale and "ss" in whitening_data):

            if opt.retrieval_multiscale:
                Lw = copy.deepcopy(whitening_data["ms"])
            else:
                Lw = copy.deepcopy(whitening_data["ss"])
        else:
            raise RuntimeError(
                "Whitening should be precomputed with the network")

        # convert whitening data to torch tensors
        Lw["m"], Lw["P"] = torch.from_numpy(Lw["m"]), torch.from_numpy(Lw["P"])
        if is_cuda:
            Lw["m"], Lw["P"] = Lw["m"].cuda(), Lw["P"].cuda()
    else:
        Lw = None

    with torch.no_grad():  # do evaluation in forward mode only (for speed and memory)
        # extract features from query images
        query_images, _, _ = dataloader.get_all_class_images(do_resize=False)
        if is_cuda:
            query_images = [img.cuda() for img in query_images]
        query_images = [img[0] for img in query_images]  # get rid of the batch dimension
        query_images = [
            resize_image_tensor(img, opt.retrieval_image_size)
            for img in query_images
        ]
        query_images = [dataloader.unnorm_image(img) for img in query_images]

        query_images_with_aug = []
        for im in query_images:
            query_images_with_aug.append(im)
            if not cfg_eval.class_image_augmentation:
                num_class_views = 1
            elif cfg_eval.class_image_augmentation == "rotation90":
                im90 = im.rot90(1, [1, 2])
                im180 = im90.rot90(1, [1, 2])
                im270 = im180.rot90(1, [1, 2])
                query_images_with_aug.append(im90)
                query_images_with_aug.append(im180)
                query_images_with_aug.append(im270)
                num_class_views = 4
            elif cfg_eval.class_image_augmentation == "horflip":
                im_flipped = im.flip(2)
                query_images_with_aug.append(im_flipped)
                num_class_views = 2
            else:
                raise RuntimeError(
                    f"Unknown value of class_image_augmentation: {cfg_eval.class_image_augmentation}"
                )
        query_images = query_images_with_aug

        query_vectors = extract_vectors_from_images(retrievalnet,
                                                    query_images,
                                                    ms=ms,
                                                    msp=msp)
        # apply whitening if defined
        if Lw is not None:
            query_vectors = whitenapply(query_vectors, Lw["m"], Lw["P"])
        query_vectors = torch.transpose(query_vectors, 0, 1)

        # prepare to loop over all images
        iterator = make_iterator_extract_scores_from_images_batched(
            dataloader,
            detector,
            cfg_maskrcnn,
            logger,
            image_batch_size=cfg_eval.batch_size,
            is_cuda=is_cuda)

        boxes, labels, scores = [], [], []
        gt_boxes = []
        image_ids = []
        losses = OrderedDict()

        # loop over all dataset images
        num_evaluated_images = 0
        for data in iterator:
            image_id, boxes_one_image, image_pyramid, query_img_sizes, class_ids, initial_img_size = data
            image_ids.append(image_id)
            logger.info(f"Image {num_evaluted_images}: id {image_id}")

            num_evaluted_images += 1
            img_size_pyramid = [
                FeatureMapSize(img=img) for img in image_pyramid
            ]

            gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(
                image_id)
            gt_boxes.append(gt_boxes_one_image)

            # visualize GT for debug
            if cfg_visualization.show_gt_boxes:
                visualizer.show_gt_boxes(image_id, gt_boxes_one_image,
                                         class_ids, dataloader)

            # decode image predictions
            # merge boxes_one_image, labels_one_image, scores_one_image from different pyramid layers
            boxes_one_image = cat_boxlist(boxes_one_image)
            # do NMS
            good_indices = nms(
                boxes_one_image,
                opt.nms_iou_threshold_detector_score,
                nms_score_threshold=opt.nms_score_threshold_detector_score)
            boxes_one_image = boxes_one_image[good_indices]

            # extract feature vectors from the predictions
            image_original = dataloader._transform_image(image_id,
                                                         do_augmentation=True,
                                                         hflip=False,
                                                         vflip=False)[0]
            if is_cuda:
                image_original = image_original.cuda()
            image_patches = crop_resize_image_patches(
                image_original,
                boxes_one_image,
                opt.retrieval_image_size,
                logger,
                unnorm_image=dataloader.unnorm_image,
                is_cuda=is_cuda)
            # filter out cases where cropping a box failed: the box is outside of the image
            good_indices = [
                i for i, p in enumerate(image_patches) if p is not None
            ]
            if good_indices:
                # non empty
                image_patches = [p for p in image_patches if p is not None]
                boxes_one_image = boxes_one_image[good_indices]

                image_vectors = extract_vectors_from_images(retrievalnet,
                                                            image_patches,
                                                            ms=ms,
                                                            msp=msp)

                # compute class scores from image_vectors and query_vectors (already transposed)
                if Lw is not None:
                    # apply whitening if defined
                    image_vectors = whitenapply(image_vectors, Lw["m"],
                                                Lw["P"])
                scores_retrieval = torch.mm(query_vectors, image_vectors)

                num_queries = scores_retrieval.size(0)
                num_detections = scores_retrieval.size(1)
                list_of_active_label = torch.LongTensor(class_ids)
                if cfg_eval.class_image_augmentation:
                    list_of_active_label = torch.stack(
                        [list_of_active_label] * num_class_views, 1).view(-1)

                # take all labels for all boxes - will sort them by scores at eval
                scores_one_image = scores_retrieval.view(-1)
                boxes_one_image = cat_boxlist([boxes_one_image] * num_queries)
                labels_one_image = torch.stack([list_of_active_label] *
                                               num_detections,
                                               1).contiguous().view(-1)
                # add scores and labels: overwrite if existed
                boxes_one_image.add_field("labels", labels_one_image)
                boxes_one_image.add_field("scores", scores_one_image)

                # NMS using the retrieval scores
                good_indices = nms(
                    boxes_one_image,
                    cfg_eval.nms_iou_threshold,
                    nms_score_threshold=cfg_eval.nms_score_threshold,
                    do_separate_per_label=not cfg_eval.nms_across_classes)
                boxes_one_image = boxes_one_image[good_indices]
            else:
                boxes_one_image.add_field(
                    "labels",
                    torch.zeros(0,
                                dtype=torch.long,
                                device=boxes_one_image.bbox_xyxy.device))
                boxes_one_image.add_field(
                    "scores",
                    torch.zeros(0,
                                dtype=torch.float,
                                device=boxes_one_image.bbox_xyxy.device))

            boxes.append(boxes_one_image.cpu())

            if cfg_visualization.show_detections:
                # do not pass class_ids - this is already taken care of
                visualizer.show_detections(boxes_one_image,
                                           image_id,
                                           dataloader,
                                           cfg_visualization,
                                           class_ids=None)

    # normalize by the number of evaluated images
    for k in losses:
        losses[k] /= num_evaluated_images

    # Save detection if requested
    if cfg_visualization.path_to_save_detections:
        data = {
            "image_ids": image_ids,
            "boxes_xyxy": [bb.bbox_xyxy for bb in boxes],
            "labels": [bb.get_field("labels") for bb in boxes],
            "scores": [bb.get_field("scores") for bb in boxes],
            "gt_boxes_xyxy": [bb.bbox_xyxy for bb in gt_boxes],
            "gt_labels": [bb.get_field("labels") for bb in gt_boxes],
            "gt_difficults": [bb.get_field("difficult") for bb in gt_boxes]
        }

        dataset_name = dataloader.get_name()
        os.makedirs(cfg_visualization.path_to_save_detections, exist_ok=True)
        save_path = os.path.join(cfg_visualization.path_to_save_detections,
                                 dataset_name + "_detections.pth")
        torch.save(data, save_path)

    # compute mAP
    for mAP_iou_threshold in cfg_eval.mAP_iou_thresholds:
        logger.info("Evaluating at IoU th {:0.2f}".format(mAP_iou_threshold))
        ap_data = do_voc_evaluation(boxes,
                                    gt_boxes,
                                    iou_thresh=mAP_iou_threshold,
                                    use_07_metric=False)
        losses["mAP@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map"]
        losses["mAPw@{:0.2f}".format(
            mAP_iou_threshold)] = ap_data["map_weighted"]
        losses["recall@{:0.2f}".format(mAP_iou_threshold)] = ap_data["recall"]
        losses["AP_joint_classes@{:0.2f}".format(
            mAP_iou_threshold)] = ap_data["ap_joint_classes"]

        # per class AP information
        for i_class, (ap, recall, n_pos) in enumerate(
                zip(ap_data["ap_per_class"], ap_data["recall_per_class"],
                    ap_data["n_pos"])):
            if not np.isnan(ap):
                assert i_class in class_ids, "Could not find class_id in the list of ids"
                logger.info(
                    "Class {0} (local {3}), AP {1:0.4f}, #obj {2}, recall {4:0.4f}"
                    .format(i_class, ap, n_pos, class_ids.index(i_class),
                            recall))

    # save timing
    losses["eval_time"] = (time.time() - t_start_eval)
    logger.info("Evaluated on {0}, scale {1}".format(dataset_name,
                                                     dataset_scale))
    print_meters(losses, logger)
    return losses
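
# A minimal sketch of the whitening step used in evaluate, assuming whitenapply
# follows the usual PCA-whitening recipe from the cirtorch codebase: center the
# d x n descriptor matrix with the mean m, project with P, then L2-normalize
# each column. This is an illustration of the math, not the exact library call.
def whitenapply_sketch(X, m, P):
    # X: d x n (one descriptor per column), m: d x 1 mean, P: d' x d projection
    X = P @ (X - m)
    return X / (X.norm(dim=0, keepdim=True) + 1e-6)

# With P = torch.eye(d) and m = X.mean(dim=1, keepdim=True), every column of
# the output has (approximately) unit L2 norm.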
Example #3
    def forward(self, feature_maps):
        """
        Args:
            feature_maps (Tensor[float], size b^A x d x h^A x w^A) - contains the feature map of the input image
            b^A - batch size
            d - feature dimensionality
            h^A - height of the feature map
            w^A - width of the feature map
​
        Returns:
                # here b^C is the class batch size, i.e., the number of class images contained in self.class_batch_size passed when creating this object
            output_localization (Tensor[float], size b^A x b^C x 4 x h^A x w^A) - the localization output w.r.t. the standard box encoding - computed by DetectionBoxCoder.build_loc_targets
            output_recognition (Tensor[float], size size b^A x b^C x 1 x h^A x w^A) - the recognition output for each of the classes - the correlation, linearly converted to [0, 1] segment, the higher the better match to the class
            output_recognition_transform_detached (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - same to output_recognition, but with the computational graph detached from the transformation (for backward  that does not update the transofrmation - intended for the negatives)
            corner_coordinates (Tensor[float], size size b^A x b^C x 8 x h^A x w^A) - the corners of the default boxes after the transofrmation, datached from the computational graph, for visualisation only
        """
        # get dims
        batch_size = feature_maps.size(0)
        feature_dim = feature_maps.size(1)
        image_fm_size = FeatureMapSize(img=feature_maps)
        class_fm_size = FeatureMapSize(img=self.class_feature_maps)
        feature_dim_for_regression = class_fm_size.h * class_fm_size.w

        class_feature_dim = self.class_feature_maps.size(1)
        assert feature_dim == class_feature_dim, "Feature dimensionality of input={0} and class={1} feature maps must be equal".format(
            feature_dim, class_feature_dim)

        # L2-normalize the feature map
        feature_maps = normalize_feature_map_L2(feature_maps, 1e-5)

        # get correlations all to all
        corr_maps = torch.einsum("bfhw,afxy->abwhxy", self.class_feature_maps,
                                 feature_maps)
        # need to try to optimize this with opt_einsum: https://optimized-einsum.readthedocs.io/en/latest/
        # CAUTION: note the switch of dimensions hw to wh. This is done for compatibility with the FeatureCorrelation class by Ignacio Rocco https://github.com/ignacio-rocco/ncnet/blob/master/lib/model.py (to be able to load their models)

        # reshape to have the correlation map of dimensions similar to the standard tensor for image feature maps
        corr_maps = corr_maps.contiguous().view(
            batch_size * self.class_batch_size, feature_dim_for_regression,
            image_fm_size.h, image_fm_size.w)

        # compute the grids to resample corr maps
        resampling_grids_local_coord = self.aligner(corr_maps)

        # build classifications outputs
        cor_maps_for_recognition = corr_maps.contiguous().view(
            batch_size, self.class_batch_size, feature_dim_for_regression,
            image_fm_size.h, image_fm_size.w)
        resampling_grids_local_coord = resampling_grids_local_coord.contiguous().view(
            batch_size, self.class_batch_size, image_fm_size.h,
            image_fm_size.w, self.aligner.out_grid_size.h,
            self.aligner.out_grid_size.w, 2)

        # need to recompute resampling_grids to [-1, 1] coordinates w.r.t. the feature maps to sample points with F.grid_sample
        # first get the list of boxes that corresponds to the receptive fields of the parameter regression network: box sizes are the receptive field sizes, stride is the network stride
        default_boxes_xyxy_wrt_fm = self.box_grid_generator_feature_map_level.create_strided_boxes_columnfirst(
            fm_size=image_fm_size)

        default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.view(
            1, 1, image_fm_size.h, image_fm_size.w, 4)
        # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x  box_grid_height x box_grid_width x 4
        default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.to(
            resampling_grids_local_coord.device)
        resampling_grids_fm_coord = convert_box_coordinates_local_to_global(
            resampling_grids_local_coord, default_boxes_xyxy_wrt_fm)

        # convert to coordinates normalized to [-1, 1] (to be compatible with torch.nn.functional.grid_sample)
        resampling_grids_fm_coord_x = resampling_grids_fm_coord.narrow(
            -1, 0, 1)
        resampling_grids_fm_coord_y = resampling_grids_fm_coord.narrow(
            -1, 1, 1)
        resampling_grids_fm_coord_unit = torch.cat(
            [resampling_grids_fm_coord_x / (image_fm_size.w - 1) * 2 - 1,
             resampling_grids_fm_coord_y / (image_fm_size.h - 1) * 2 - 1],
            dim=-1)
        # clamp to fit the image plane
        resampling_grids_fm_coord_unit = resampling_grids_fm_coord_unit.clamp(
            -1, 1)

        # extract and pool matches
        # # slower code:
        # matches_summed = self.resample_of_correlation_map_simple(cor_maps_for_recognition,
        #                                                          resampling_grids_fm_coord_unit,
        #                                                          self.class_pool_mask)

        # we use a faster but somewhat more obscure version
        matches_summed = self.resample_of_correlation_map_fast(
            cor_maps_for_recognition, resampling_grids_fm_coord_unit,
            self.class_pool_mask)
        if matches_summed.requires_grad:
            matches_summed_transform_detached = self.resample_of_correlation_map_fast(
                cor_maps_for_recognition,
                resampling_grids_fm_coord_unit.detach(), self.class_pool_mask)
        else:
            # Optimization to make eval faster
            matches_summed_transform_detached = matches_summed

        # build localization targets
        default_boxes_xyxy_wrt_image = self.box_grid_generator_image_level.create_strided_boxes_columnfirst(
            fm_size=image_fm_size)

        default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.view(
            1, 1, image_fm_size.h, image_fm_size.w, 4)
        # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x  box_grid_height x box_grid_width x 4
        default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.to(
            resampling_grids_local_coord.device)
        resampling_grids_image_coord = convert_box_coordinates_local_to_global(
            resampling_grids_local_coord, default_boxes_xyxy_wrt_image)

        num_pooled_points = self.aligner.out_grid_size.w * self.aligner.out_grid_size.h
        resampling_grids_x = resampling_grids_image_coord.narrow(
            -1, 0, 1).contiguous().view(-1, num_pooled_points)
        resampling_grids_y = resampling_grids_image_coord.narrow(
            -1, 1, 1).contiguous().view(-1, num_pooled_points)
        class_boxes_xyxy = torch.stack([
            resampling_grids_x.min(dim=1)[0],
            resampling_grids_y.min(dim=1)[0],
            resampling_grids_x.max(dim=1)[0],
            resampling_grids_y.max(dim=1)[0]
        ], 1)

        # extract rectangle borders to draw complete boxes
        corner_coordinates = resampling_grids_image_coord[:, :, :, :, [0, -1]]
        corner_coordinates = corner_coordinates[:, :, :, :, :, [0, -1]]  # only the corners
        corner_coordinates = corner_coordinates.detach_()
        corner_coordinates = corner_coordinates.view(
            batch_size, self.class_batch_size, image_fm_size.h,
            image_fm_size.w,
            8)  # batch_size x label_batch_size x fm_height x fm_width x 8
        corner_coordinates = corner_coordinates.transpose(3, 4).transpose(
            2, 3)  # batch_size x label_batch_size x 8 x fm_height x fm_width

        class_boxes = BoxList(class_boxes_xyxy.view(-1, 4),
                              image_fm_size,
                              mode="xyxy")
        default_boxes_wrt_image = BoxList(default_boxes_xyxy_wrt_image.view(
            -1, 4),
                                          image_fm_size,
                                          mode="xyxy")
        default_boxes_with_image_batches = cat_boxlist(
            [default_boxes_wrt_image] * batch_size * self.class_batch_size)

        output_localization = Os2dBoxCoder.build_loc_targets(
            class_boxes, default_boxes_with_image_batches)  # num_boxes x 4
        output_localization = output_localization.view(
            batch_size, self.class_batch_size, image_fm_size.h,
            image_fm_size.w,
            4)  # batch_size x label_batch_size x fm_height x fm_width x 4
        output_localization = output_localization.transpose(3, 4).transpose(
            2, 3)  # batch_size x label_batch_size x 4 x fm_height x fm_width

        output_recognition = (matches_summed - 1.0) / 2.0
        output_recognition_transform_detached = (
            matches_summed_transform_detached - 1.0) / 2.0
        return output_localization, output_recognition, output_recognition_transform_detached, corner_coordinates
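
# A small standalone check of the [-1, 1] coordinate normalization used in
# forward above (the torch.nn.functional.grid_sample convention with
# align_corners=True): pixel x in [0, W-1] maps to x / (W - 1) * 2 - 1, and
# similarly for y. The helper name is illustrative only.
def to_unit_coords(x, y, w, h):
    return x / (w - 1) * 2 - 1, y / (h - 1) * 2 - 1

# e.g. to_unit_coords(0.0, 0.0, 5, 3) -> (-1.0, -1.0)   (top-left pixel)
#      to_unit_coords(4.0, 2.0, 5, 3) -> (1.0, 1.0)     (bottom-right pixel)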
Example #4
def mine_hard_patches(dataloader, net, cfg, criterion):
    """Mine patches that are hard: classification false positives and negative, localization errors
    At each level of sampled image pyramid, we need to cut out a piece of size appropriate for training
    (levels are defined by cfg.train.mining.num_random_pyramid_scales, cfg.train.mining.num_random_negative_classes)

    Args:
        dataloader - dataloader to use (often the same as the one for training)
        net - the network to use
        cfg - config with all the parameters
        criterion - criterion (usually the same one as used for training)

    Returns:
        hardnegdata_per_imageid (OrderedDict) - mined data, keys are the image ids;
            further used in dataloader.set_hard_negative_data(hardnegdata_per_imageid) when preparing batches
    """
    logger = logging.getLogger("OS2D.mining_hard_patches")
    logger.info("Starting to mine hard patches")
    t_start_mining = time.time()
    net.eval()
    num_batches = len(dataloader)
    hardnegdata_per_imageid = OrderedDict()

    iterator = make_iterator_extract_scores_from_images_batched(dataloader, net, logger,
                                                                image_batch_size=cfg.eval.batch_size,
                                                                is_cuda=cfg.is_cuda,
                                                                num_random_pyramid_scales=cfg.train.mining.num_random_pyramid_scales,
                                                                num_random_negative_labels=cfg.train.mining.num_random_negative_classes)

    boxes = []
    gt_boxes = []
    losses = OrderedDict()

    # loop over all dataset images
    for data in iterator:
        t_item_start = time.time()

        image_id, image_loc_scores_pyramid, image_class_scores_pyramid, \
                    image_pyramid, query_img_sizes, \
                    batch_class_ids, box_reverse_transform_pyramid, image_fm_sizes_p, transform_corners_pyramid \
                = data

        img_size_pyramid = [FeatureMapSize(img=image) for image in image_pyramid]

        gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(image_id)
        gt_boxes.append(gt_boxes_one_image)

        # compute losses
        # change labels to the ones local to the current image
        dataloader.update_box_labels_to_local(gt_boxes_one_image, batch_class_ids)
        num_labels = len(batch_class_ids)

        loc_targets_pyramid, class_targets_pyramid = \
                dataloader.box_coder.encode_pyramid(gt_boxes_one_image, img_size_pyramid, num_labels,
                                                    default_box_transform_pyramid=box_reverse_transform_pyramid)

        # visualize GT for debug
        if cfg.visualization.mining.show_gt_boxes:
            visualizer.show_gt_boxes(image_id, gt_boxes_one_image, batch_class_ids, dataloader)

        # compute losses
        if cfg.is_cuda:
            loc_targets_pyramid = [loc_targets.cuda() for loc_targets in loc_targets_pyramid]
            class_targets_pyramid = [class_targets.cuda() for class_targets in class_targets_pyramid]

        add_batch_dim = lambda list_of_tensors: [t.unsqueeze(0) for t in list_of_tensors]
        loc_scores_pyramid = add_batch_dim(image_loc_scores_pyramid)
        
        cls_targets_remapped_pyramid = []
        for loc_scores, img_size, box_reverse_transform in zip(loc_scores_pyramid, img_size_pyramid, box_reverse_transform_pyramid):
            # loop over the pyramid levels
            cls_targets_remapped, ious_anchor, ious_anchor_corrected = \
                dataloader.box_coder.remap_anchor_targets(loc_scores, [img_size], query_img_sizes, [gt_boxes_one_image],
                                                          box_reverse_transform=[box_reverse_transform])
            cls_targets_remapped_pyramid.append(cls_targets_remapped)

        losses_iter, losses_per_anchor = criterion(loc_scores_pyramid,
                                                    add_batch_dim(loc_targets_pyramid),
                                                    add_batch_dim(image_class_scores_pyramid),
                                                    add_batch_dim(class_targets_pyramid),
                                                    cls_targets_remapped=cls_targets_remapped_pyramid,
                                                    patch_mining_mode=True)

        if cfg.visualization.mining.show_class_heatmaps:
            visualizer.show_class_heatmaps(image_id, batch_class_ids, image_fm_sizes_p, class_targets_pyramid, image_class_scores_pyramid,
                                            cfg_local=cfg.visualization.mining)

        assert dataloader.data_augmentation is not None, "Can mine hard patches only through data augmentation"
        crop_size = dataloader.data_augmentation.random_crop_size

        # convert to floats
        for l in losses_iter:
            losses_iter[l] = losses_iter[l].mean().item()
        # printing
        print_meters(losses_iter, logger)
        # update logs
        add_to_meters_in_dict(losses_iter, losses)

        # construct crop boxes for all the anchors and NMS them - NMS pos and neg anchors separately
        query_fm_sizes = [dataloader.box_coder._get_feature_map_size_per_image_size(sz) for sz in query_img_sizes]
        
        crops = []
        anchors = []
        labels_of_anchors = []
        pyramid_level_of_anchors = []
        losses_of_anchors = []
        corners_of_anchors = []
        losses_loc_of_anchors = []
        pos_mask_of_anchors = []
        pos_loc_mask_of_anchors = []
        neg_mask_of_anchors = []
        anchor_indices = []
        i_image_in_batch = 0  # only one image comes here
        for i_p, img_size in enumerate(img_size_pyramid):
            for i_label, query_fm_size in enumerate(query_fm_sizes):
                crop_position, anchor_position, anchor_index = \
                    dataloader.box_coder.output_box_grid_generator.get_box_to_cut_anchor(img_size,
                                                                                         crop_size,
                                                                                         image_fm_sizes_p[i_p],
                                                                                         box_reverse_transform_pyramid[i_p])
                cur_corners = transform_corners_pyramid[i_p][i_label].transpose(0,1)
                cur_corners = dataloader.box_coder.apply_transform_to_corners(cur_corners, box_reverse_transform_pyramid[i_p], img_size)
                if cfg.is_cuda:
                    crop_position, anchor_position = crop_position.cuda(), anchor_position.cuda()
                crops.append(crop_position)
                anchors.append(anchor_position)
                device = crop_position.bbox_xyxy.device
                losses_of_anchors.append(losses_per_anchor["cls_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy))
                pos_mask_of_anchors.append(losses_per_anchor["pos_mask"][i_p][i_image_in_batch, i_label].to(device=device))
                neg_mask_of_anchors.append(losses_per_anchor["neg_mask"][i_p][i_image_in_batch, i_label].to(device=device))
                losses_loc_of_anchors.append(losses_per_anchor["loc_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy))
                pos_loc_mask_of_anchors.append(losses_per_anchor["pos_for_regression"][i_p][i_image_in_batch, i_label].to(device=device))
                corners_of_anchors.append(cur_corners.to(crop_position.bbox_xyxy))

                num_anchors = len(crop_position)
                labels_of_anchors.append(torch.full([num_anchors], i_label, dtype=torch.long))
                pyramid_level_of_anchors.append(torch.full([num_anchors], i_p, dtype=torch.long))
                anchor_indices.append(anchor_index)

        # stack all
        crops = cat_boxlist(crops)
        anchors = cat_boxlist(anchors)
        labels_of_anchors = torch.cat(labels_of_anchors, 0)
        pyramid_level_of_anchors = torch.cat(pyramid_level_of_anchors, 0)
        losses_of_anchors = torch.cat(losses_of_anchors, 0)
        losses_loc_of_anchors = torch.cat(losses_loc_of_anchors, 0)
        pos_mask_of_anchors = torch.cat(pos_mask_of_anchors, 0)
        pos_loc_mask_of_anchors = torch.cat(pos_loc_mask_of_anchors, 0)
        neg_mask_of_anchors = torch.cat(neg_mask_of_anchors, 0)
        anchor_indices = torch.cat(anchor_indices, 0)
        corners_of_anchors = torch.cat(corners_of_anchors, 0)

        def nms_masked_and_collect_data(mask, crops_xyxy, scores, nms_iou_threshold_in_mining, max_entries=None):
            mask_ids = torch.nonzero(mask).squeeze(1)
            boxes_selected = copy.deepcopy(crops_xyxy[mask])
            boxes_selected.add_field("scores", scores[mask])
            remaining_boxes = nms(boxes_selected, nms_iou_threshold_in_mining)
            remaining_boxes = mask_ids[remaining_boxes]

            # sort and take the top-k, because NMS does not sort by default
            ids = torch.argsort(scores[remaining_boxes], descending=True)
            if max_entries is not None:
                ids = ids[:max_entries]
            remaining_boxes = remaining_boxes[ids]

            return remaining_boxes

        nms_iou_threshold_in_mining = cfg.train.mining.nms_iou_threshold_in_mining
        num_hard_patches_per_image = cfg.train.mining.num_hard_patches_per_image

        # hard negatives
        hard_negs = nms_masked_and_collect_data(neg_mask_of_anchors, crops, losses_of_anchors,
                                                nms_iou_threshold_in_mining,
                                                num_hard_patches_per_image)

        # hard positives for classification
        hard_pos = nms_masked_and_collect_data(pos_mask_of_anchors, crops, losses_of_anchors,
                                               nms_iou_threshold_in_mining,
                                               num_hard_patches_per_image)

        # hard positives for localization
        hard_pos_loc = nms_masked_and_collect_data(pos_loc_mask_of_anchors, crops, losses_loc_of_anchors,
                                                   nms_iou_threshold_in_mining,
                                                   num_hard_patches_per_image)

        # merge all together
        def standardize(v):
            return v.item() if isinstance(v, torch.Tensor) else v
        def add_item(data, role, pyramid_level, label_local, anchor_index, crop_position_xyxy, anchor_position_xyxy, transform_corners):
            new_item = OrderedDict()
            new_item["pyramid_level"] = standardize(pyramid_level)
            new_item["label_local"] = standardize(label_local)
            new_item["anchor_index"] = standardize(anchor_index)
            new_item["role"] = role
            new_item["crop_position_xyxy"] = crop_position_xyxy
            new_item["anchor_position_xyxy"] = anchor_position_xyxy
            new_item["transform_corners"] = transform_corners
            data.append(new_item)

        hardnegdata = []
        for i in hard_negs:
            add_item(hardnegdata, "neg", pyramid_level_of_anchors[i],
                        labels_of_anchors[i], anchor_indices[i],
                        crops[i].cpu(), anchors[i].cpu(), corners_of_anchors[i].cpu())
        for i in hard_pos:
            add_item(hardnegdata, "pos", pyramid_level_of_anchors[i],
                        labels_of_anchors[i], anchor_indices[i],
                        crops[i].cpu(), anchors[i].cpu(), corners_of_anchors[i].cpu())
        for i in hard_pos_loc:
            add_item(hardnegdata, "pos_loc", pyramid_level_of_anchors[i],
                        labels_of_anchors[i], anchor_indices[i],
                        crops[i].cpu(), anchors[i].cpu(), corners_of_anchors[i].cpu())

        # extract loss values and compute the box positions to crop
        for a in hardnegdata:
            a["label_global"] = standardize(batch_class_ids[ a["label_local"] ])
            a["loss"] = standardize(losses_per_anchor["cls_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]])
            a["loss_loc"] = standardize(losses_per_anchor["loc_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]])
            a["score"] = standardize(image_class_scores_pyramid[a["pyramid_level"]][a["label_local"], a["anchor_index"]])
            a["image_id"] = standardize(image_id)

        hardnegdata_per_imageid[image_id] = hardnegdata

        if cfg.visualization.mining.show_mined_patches:
            visualizer.show_mined_patches(image_id, batch_class_ids, dataloader, hardnegdata)

        logger.info("Item time: {0}, since mining start: {1}".format(time_since(t_item_start), time_since(t_start_mining)))
    logger.info("Hard negative mining finished in {0}".format(time_since(t_start_mining)))
    return hardnegdata_per_imageid
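
# A hedged usage sketch: per the docstring of mine_hard_patches, the mined data
# is fed back into the dataloader before preparing the next training batches;
# set_hard_negative_data is the hook named in that docstring.
def refresh_hard_patches(dataloader, net, cfg, criterion):
    hardnegdata_per_imageid = mine_hard_patches(dataloader, net, cfg, criterion)
    dataloader.set_hard_negative_data(hardnegdata_per_imageid)
    return hardnegdata_per_imageid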
def save_cropped_boxes(dataset, tgt_image_path, extension=".jpg", num_random_crops_per_image=0):
    # crop all the boxes
    db = {"cids":[], "cluster":[], "gtbboxid":[], "classid":[], "imageid":[], "difficult":[], "type":[], "size":[], "bbox":[]}

    for image_id in tqdm(dataset.image_ids):
        img = dataset._get_dataset_image_by_id(image_id)
        boxes = dataset.get_image_annotation_for_imageid(image_id)

        assert boxes.has_field("labels"), "GT boxes need a field 'labels'"
        # remove all fields except "labels" and "difficult"
        for f in boxes.fields():
            if f not in ["labels", "difficult"]:
                boxes.remove_field(f)
        if not boxes.has_field("difficult"):
            boxes.add_field("difficult", torch.zeros(len(boxes), dtype=torch.bool))

        num_gt_boxes = len(boxes)
        im_size = FeatureMapSize(img=img)
        assert im_size == boxes.image_size

        eval_scale = dataset.get_eval_scale()

        # sample random boxes if needed
        if num_random_crops_per_image > 0:
            boxes_random = torch.rand(num_random_crops_per_image, 4)
            x1 = torch.min(boxes_random[:, 0], boxes_random[:, 2]) * im_size.w
            x2 = torch.max(boxes_random[:, 0], boxes_random[:, 2]) * im_size.w
            y1 = torch.min(boxes_random[:, 1], boxes_random[:, 3]) * im_size.h
            y2 = torch.max(boxes_random[:, 1], boxes_random[:, 3]) * im_size.h
            boxes_random = torch.stack([x1, y1, x2, y2], 1).floor()

            # remove boxes that are too small
            min_size = 10.0 / eval_scale * max(im_size.w, im_size.h)
            mask_bad_boxes = (boxes_random[:, 0] + min_size > boxes_random[:, 2]) | (boxes_random[:, 1] + min_size > boxes_random[:, 3])
            good_boxes = torch.nonzero(~mask_bad_boxes).view(-1)
            boxes_random = boxes_random[good_boxes]

            boxes_random = BoxList(boxes_random, im_size, mode="xyxy")

            boxes_random.add_field("labels", torch.full([len(boxes_random)], -1, dtype=torch.long))
            boxes_random.add_field("difficult", torch.zeros(len(boxes_random), dtype=torch.bool))
            boxes = cat_boxlist([boxes, boxes_random])

        if boxes is not None:
            for i_box in range(len(boxes)):
                # box format: left, top, right, bottom
                box = boxes[i_box].bbox_xyxy.view(-1)
                box = [b.item() for b in box]
                cropped_img = img.crop(box)

                if i_box < num_gt_boxes:
                    lbl = boxes[i_box].get_field("labels").item()
                    dif_flag = boxes[i_box].get_field("difficult").item()
                    box_id = i_box
                    box_type = "GT"
                else:
                    lbl = -1
                    dif_flag = 0
                    box_id = i_box
                    box_type = "RN"

                # create the file name to be used with cirtorch.datasets.datahelpers.cid2filename and their dataloader
                cid = "box{box_id:05d}_lbl{label:05d}_dif{dif:01d}_im{image_id:05d}{box_type}".format(box_id=box_id, image_id = image_id, label = lbl, dif = dif_flag, box_type=box_type)
                file_name = cid2filename(cid, prefix=tgt_image_path)

                # save the image
                image_path, _ = os.path.split(file_name)
                mkdir(image_path)
                if extension:
                    cropped_img.save("{}{}".format(file_name, extension))
                else:
                    # cirtorch uses files with empty extension for training for some reason, need to support that
                    cropped_img.save("{}".format(file_name), format="jpeg")

                # add to the db structure
                db["cids"].append(cid)
                db["cluster"].append(lbl)  # use labels as clusters not to sample negatives from the same object
                db["classid"].append(lbl)
                db["gtbboxid"].append(box_id)
                db["imageid"].append(image_id)
                db["difficult"].append(dif_flag)
                if i_box < num_gt_boxes:
                    db["type"].append("gtproposal")
                else:
                    db["type"].append("randomcrop")
                db["size"].append(cropped_img.size)
                db["bbox"].append(box)  # format (x1,y1,x2,y2)

    return db
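
# A hedged usage sketch: save_cropped_boxes returns a cirtorch-style index into
# the saved crops; persisting it with pickle and the file name used here are
# assumptions for illustration, not taken from the codebase.
def build_crop_database(dataset, tgt_image_path, db_path):
    import pickle
    db = save_cropped_boxes(dataset, tgt_image_path, num_random_crops_per_image=5)
    with open(db_path, "wb") as f:
        pickle.dump(db, f)
    return db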
Example #6
    def evaluate_detections(self,
                            all_boxes,
                            output_dir,
                            mAP_iou_threshold=0.5):
        predictions = []
        gt_boxes = []
        roidb = self.roidb
        for i_image, roi in enumerate(roidb):
            image_size = FeatureMapSize(w=roi["width"], h=roi["height"])
            if roi["boxes"].size > 0:
                roi_gt_boxes = BoxList(roi["boxes"], image_size, mode="xyxy")
            else:
                roi_gt_boxes = BoxList.create_empty(image_size)
            roi_gt_boxes.add_field(
                "labels", torch.as_tensor(roi["gt_classes"],
                                          dtype=torch.int32))
            roi_gt_boxes.add_field(
                "difficult",
                torch.as_tensor(roi["gt_ishard"], dtype=torch.int32))

            gt_boxes.append(roi_gt_boxes)

            roi_detections = []
            for i_class, class_boxes in enumerate(all_boxes):
                assert len(class_boxes) == len(roidb), \
                    "Number of detections for class {0}, image {1} ({2}) is inconsistent with the length of roidb ({3})".format(i_class, i_image, len(class_boxes), len(roidb))
                boxes = class_boxes[i_image]
                if len(boxes) > 0:
                    assert boxes.shape[1] == 5, \
                        "Detections should be of shape (:, 5), but are {0} for class {1}, image {2}".format(
                            boxes.shape, i_class, i_image)
                    bbox = BoxList(boxes[:, :4], image_size, mode="xyxy")
                    scores = boxes[:, -1]
                    bbox.add_field(
                        "scores", torch.as_tensor(scores, dtype=torch.float32))
                    bbox.add_field(
                        "labels",
                        torch.full(scores.shape, i_class, dtype=torch.int32))
                    roi_detections.append(bbox)

            if roi_detections:
                roi_detections = cat_boxlist(roi_detections)
            else:
                roi_detections = BoxList.create_empty(image_size)
                roi_detections.add_field(
                    "scores", torch.zeros((0, ), dtype=torch.float32))
                roi_detections.add_field("labels",
                                         torch.zeros((0, ), dtype=torch.int32))
            predictions.append(roi_detections)

            if False:  # flip to True to visualize detections while debugging
                self.visualize_detections(i_image,
                                          gt=roi_gt_boxes,
                                          dets=roi_detections)

        ap_data = do_voc_evaluation(predictions,
                                    gt_boxes,
                                    iou_thresh=mAP_iou_threshold,
                                    use_07_metric=False)
        print("mAP@{:0.2f}: {:0.4f}".format(mAP_iou_threshold, ap_data["map"]))
        print("mAPw@{:0.2f}: {:0.4f}".format(mAP_iou_threshold,
                                             ap_data["map_weighted"]))
        print("recall@{:0.2f}: {:0.4f}".format(mAP_iou_threshold,
                                               ap_data["recall"]))

        return ap_data["map"]
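
# A minimal sketch of the all_boxes layout expected by evaluate_detections,
# inferred from the asserts above: all_boxes[i_class][i_image] is an (N, 5)
# array of [x1, y1, x2, y2, score] rows; classes with no detections hold empty
# (0, 5) arrays. The numpy usage and sizes are assumptions for illustration.
import numpy as np

num_classes, num_images = 3, 2
all_boxes = [[np.zeros((0, 5), dtype=np.float32) for _ in range(num_images)]
             for _ in range(num_classes)]
all_boxes[1][0] = np.array([[10.0, 20.0, 50.0, 80.0, 0.9]], dtype=np.float32)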