Example #1
    def detr_probabilistic_inference(self, input_im):

        outputs = self.model(input_im,
                             return_raw_results=True,
                             is_mc_dropout=self.mc_dropout_enabled)

        image_width = input_im[0]['image'].shape[2]
        image_height = input_im[0]['image'].shape[1]

        # Handle logits and classes
        predicted_logits = outputs['pred_logits'][0]
        if 'pred_logits_var' in outputs.keys():
            predicted_logits_var = outputs['pred_logits_var'][0]
            box_cls_dists = torch.distributions.normal.Normal(
                predicted_logits,
                scale=torch.sqrt(torch.exp(predicted_logits_var)))
            predicted_logits = box_cls_dists.rsample(
                (self.model.cls_var_num_samples, ))
            predicted_prob_vectors = F.softmax(predicted_logits, dim=-1)
            predicted_prob_vectors = predicted_prob_vectors.mean(0)
        else:
            predicted_prob_vectors = F.softmax(predicted_logits, dim=-1)

        predicted_prob, classes_idxs = predicted_prob_vectors[:, :-1].max(-1)
        # Handle boxes and covariance matrices
        predicted_boxes = outputs['pred_boxes'][0]

        # Rescale boxes to inference image size (not COCO original size)
        pred_boxes = Boxes(box_cxcywh_to_xyxy(predicted_boxes))
        pred_boxes.scale(scale_x=image_width, scale_y=image_height)
        predicted_boxes = pred_boxes.tensor

        # Transform and rescale covariance matrices to the inference image size
        if 'pred_boxes_cov' in outputs.keys():
            predicted_boxes_covariance = covariance_output_to_cholesky(
                outputs['pred_boxes_cov'][0])
            predicted_boxes_covariance = torch.matmul(
                predicted_boxes_covariance,
                predicted_boxes_covariance.transpose(1, 2))

            transform_mat = torch.tensor([[[1.0, 0.0, -0.5, 0.0],
                                           [0.0, 1.0, 0.0, -0.5],
                                           [1.0, 0.0, 0.5, 0.0],
                                           [0.0, 1.0, 0.0,
                                            0.5]]]).to(self.model.device)
            predicted_boxes_covariance = torch.matmul(
                torch.matmul(transform_mat, predicted_boxes_covariance),
                transform_mat.transpose(1, 2))

            scale_mat = torch.diag_embed(
                torch.as_tensor(
                    (image_width, image_height, image_width, image_height),
                    dtype=torch.float32)).to(self.model.device).unsqueeze(0)
            predicted_boxes_covariance = torch.matmul(
                torch.matmul(scale_mat, predicted_boxes_covariance),
                torch.transpose(scale_mat, 2, 1))
        else:
            predicted_boxes_covariance = []

        return predicted_boxes, predicted_boxes_covariance, predicted_prob, classes_idxs, predicted_prob_vectors
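
A minimal downstream sketch, not part of the snippet above: it assumes the (N, 4, 4) covariance tensor returned by detr_probabilistic_inference and summarizes the predicted uncertainty per box. The helper name box_uncertainty_summary is hypothetical.

import torch

def box_uncertainty_summary(predicted_boxes_covariance):
    # Per-coordinate standard deviations for (x1, y1, x2, y2), taken from the
    # diagonal of each box's 4x4 covariance matrix.
    per_coord_std = predicted_boxes_covariance.diagonal(dim1=-2, dim2=-1).sqrt()
    # Total positional variance per box (trace of the covariance matrix),
    # a simple scalar uncertainty score.
    total_var = per_coord_std.pow(2).sum(dim=-1)
    return per_coord_std, total_var
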
Example #2

    def predict_boxes(self, images, boxes):
        assert images.shape[0] == 1
        img = images
        img = self.to_numpy(img)[:, :, ::-1]  # to BGR
        original_size = img.shape[:2]
        img, proposals = self._transform_image_and_boxes(img, boxes)

        device = self.model.device
        img = img.to(device)
        proposals = [proposals.to(device)]
        inputs = [{
            'image': img,
            'proposals': proposals,
            'height': original_size[0],
            'width': original_size[1]
        }]

        model = self.model
        roi_heads = self.model.roi_heads

        images = model.preprocess_image(inputs)
        features = model.backbone(images.tensor)
        features = [features[f] for f in roi_heads.in_features]
        box_features = roi_heads.box_pooler(
            features, [x.proposal_boxes for x in proposals])
        box_features = roi_heads.box_head(box_features)
        pred_class_logits, pred_proposal_deltas = roi_heads.box_predictor(
            box_features)

        outputs = FastRCNNOutputs(
            roi_heads.box2box_transform,
            pred_class_logits,
            pred_proposal_deltas,
            proposals,
            roi_heads.smooth_l1_beta,
        )
        pred_boxes = outputs.predict_boxes()[0]

        c = self.person_class

        if self.softmax_only_person:
            scores = pred_class_logits[:, [c, -1]].detach()
            scores = F.softmax(scores, -1)[:, 0]
        else:
            scores = F.softmax(pred_class_logits, -1)
            scores = scores[:, c].detach()
        boxes = pred_boxes[:, c * 4:(c + 1) * 4].detach()

        scale_y = original_size[0] / img.shape[1]
        scale_x = original_size[1] / img.shape[2]
        boxes = Boxes(boxes)
        boxes.scale(scale_x, scale_y)
        boxes = boxes.tensor
        return boxes, scores
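
A minimal usage sketch for the boxes and scores returned by predict_boxes, assuming detectron2's Boxes and Instances structures. The score threshold and the helper name to_instances are illustrative, not part of the original code.

from detectron2.structures import Boxes, Instances

def to_instances(boxes, scores, image_size, score_thresh=0.5):
    # boxes: (N, 4) tensor in original-image coordinates, scores: (N,) tensor,
    # image_size: (height, width) of the original image.
    keep = scores > score_thresh
    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes[keep])
    result.scores = scores[keep]
    return result
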
Example #3
    def regress_and_classify(self, image: np.ndarray, tracklets: List[Tracklet]) -> Tuple[np.ndarray, np.ndarray]:
        # Convert boxes to proposals
        height, width = image.shape[:2]
        image = self.transform_gen.get_transform(image).apply_image(image)
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        # Size of the transformed image that is fed to the detector
        feat_height, feat_width = image.shape[1:3]
        scale_x = feat_width / width
        scale_y = feat_height / height
        proposal_boxes = Boxes(torch.tensor([tracklet.last_detection.box for tracklet in tracklets]))

        # Scale proposals to the transformed image size
        proposal_boxes.scale(scale_x, scale_y)
        proposals = Instances((feat_height, feat_width), proposal_boxes=proposal_boxes)

        inputs = {"image": image, "height": height, "width": width, "proposals": proposals}

        images = self.model.preprocess_image([inputs])
        features = self.model.backbone(images.tensor)
        proposals = [inputs["proposals"].to(self.model.device)]

        # Extract features, perform RoI pooling and perform regression/classification for each RoI
        features_list = [features[f] for f in self.model.roi_heads.in_features]

        box_features = self.model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
        box_features = self.model.roi_heads.box_head(box_features)
        pred_class_logits, pred_proposal_deltas = self.model.roi_heads.box_predictor(box_features)
        del box_features

        raw_outputs = FastRCNNOutputs(
            self.model.roi_heads.box_predictor.box2box_transform,
            pred_class_logits,
            pred_proposal_deltas,
            proposals,
            self.model.roi_heads.box_predictor.smooth_l1_beta,
        )

        # Convert raw outputs to predicted boxes and scores
        boxes = raw_outputs.predict_boxes()[0]
        scores = raw_outputs.predict_probs()[0]

        num_bbox_reg_classes = boxes.shape[1] // 4
        boxes = Boxes(boxes.reshape(-1, 4))
        # Clip to the transformed image size, then scale back to the original image size
        boxes.clip((feat_height, feat_width))
        boxes.scale(1 / scale_x, 1 / scale_y)
        boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)
        boxes = boxes[:, 0, :]
        scores = scores[:, 0]

        pred_boxes = boxes.detach().cpu().numpy()
        scores = scores.detach().cpu().numpy()
        return pred_boxes, scores
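
A small sketch of how the regressed boxes might be gated by their classification scores before updating the tracklets. The threshold and the helper name gate_regressed_boxes are assumptions; only the shapes of pred_boxes and scores come from regress_and_classify above.

import numpy as np

def gate_regressed_boxes(pred_boxes, scores, score_thresh=0.5):
    # pred_boxes: (N, 4) array, scores: (N,) array; indices line up with the
    # tracklet list passed to regress_and_classify.
    keep = np.flatnonzero(scores >= score_thresh)
    return keep, pred_boxes[keep], scores[keep]
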
Example #4
    def add_pseudo_label(self, targets, image_path, flip):
        new_targets = []
        if self.pseudo_gt is None:
            return targets
        if len(targets) > 0 and targets[
                0].gt_boxes.tensor.device != self.pseudo_gt.device:
            self.pseudo_gt = self.pseudo_gt.to(
                targets[0].gt_boxes.tensor.device)
        for i, (targets_per_image, path) in enumerate(zip(targets,
                                                          image_path)):
            H, W = targets_per_image._image_size
            gt_boxes = targets_per_image.gt_boxes
            gt_classes = targets_per_image.gt_classes
            p = int(path.split('/')[-1].split('.')[0])
            data = self.pseudo_gt[self.pseudo_gt[:, 0] == p]
            ld = len(data)
            if len(data) == 0:
                new_targets.append(targets_per_image)
                continue
            label = data[:, 1].long()
            boxes = data[:, 2:].clone()
            if flip[i] == 1:
                boxes[:, 0] = 1 - boxes[:, 0]
                boxes[:, 2] = 1 - boxes[:, 2]
                boxes = torch.index_select(
                    boxes, -1,
                    torch.as_tensor([2, 1, 0, 3], device=boxes.device))
            boxes = Boxes(boxes)
            boxes.scale(scale_x=W, scale_y=H)
            new_gt_boxes = gt_boxes.cat([gt_boxes, boxes])

            new_gt_masks = PolygonMasks([[]])
            if hasattr(targets_per_image, 'gt_masks'):
                gt_masks = targets_per_image.gt_masks
                new_gt_masks = new_gt_masks.cat([gt_masks] +
                                                [new_gt_masks] * ld)
            else:
                new_gt_masks = new_gt_masks.cat([new_gt_masks] * ld)
            new_gt_classes = torch.cat((gt_classes, label))

            new_target = Instances((H, W))
            new_target.gt_classes = new_gt_classes
            new_target.gt_masks = new_gt_masks
            new_target.gt_boxes = new_gt_boxes
            new_targets.append(new_target)
            lbl, cnt = label.unique(return_counts=True)  # per-class counts of the added pseudo labels (unused)
        return new_targets
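
A standalone sketch of the horizontal-flip transform used in the flip branch above, for normalized X1Y1X2Y2 pseudo-label boxes. The function name hflip_normalized_xyxy is hypothetical.

import torch

def hflip_normalized_xyxy(boxes):
    # boxes: (N, 4) tensor of [x1, y1, x2, y2] in normalized [0, 1] coordinates.
    # Mirroring horizontally maps a box to [1 - x2, y1, 1 - x1, y2], which is
    # what the in-place update plus index_select([2, 1, 0, 3]) above computes.
    flipped = boxes.clone()
    flipped[:, 0] = 1 - boxes[:, 2]
    flipped[:, 2] = 1 - boxes[:, 0]
    return flipped
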
Example #5
def seg_det_postprocess_bk(segmap, contour, emb, img_size, output_height,
                           output_width):
    """
    Translate segmentation predictions into detection results.

    The input images are often resized when entering the semantic segmentor. Moreover, in some
    cases, they are also padded inside the segmentor to be divisible by the maximum network stride.
    As a result, we often need the predictions of the segmentor at a different
    resolution from its inputs.

    Args:
        segmap (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        contour (Tensor): contour prediction logits. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        emb (Tensor): embedding prediction. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        img_size (tuple): the (height, width) of the resized input image, without padding.
        output_height, output_width: the desired output resolution.

    Returns:
        result (Instances): detection results at the (output_height, output_width) resolution,
            with pred_boxes, scores and pred_classes fields.
    """
    segmap = segmap[:, :img_size[0], :img_size[1]].cpu().numpy()
    contour = contour[:, :img_size[0], :img_size[1]].cpu().numpy()
    emb = emb[:, :img_size[0], :img_size[1]].cpu().numpy()

    ncls = segmap.shape[0] - 1  # remove the background
    assert (contour.shape[0] == ncls)
    assert (emb.shape[0] == ncls)
    H = segmap.shape[1]
    W = segmap.shape[2]

    pred_boxes = []
    pred_scores = []
    pred_classes = []
    pred_masks = []
    # Step 1: segment the foreground (according to segmap) into superpixels (separated by contours)
    for c in range(ncls):
        cont_c = contour[c]
        seg_c = segmap[c]
        emb_c = emb[c]
        # TODO: we may need to turn the contour map and segmap into binary images.
        # For now we combine the contour map and the segmentation before connecting superpixels, for simplicity.
        bw = (1 - cont_c) * seg_c > 0.2  # alternatives tried: threshold 0.05, or 1 - cont_c > 0.7
        retval, labels, stats, centroids = cv2.connectedComponentsWithStats(
            bw.astype(np.uint8))
        nseg = retval  # np.max(labels)
        # Note: the background is labeled 0 and should be ignored
        avg_embed = np.zeros(nseg)
        avg_scores = np.zeros(nseg)
        bboxes = np.zeros((nseg, 4))
        for s in range(nseg):
            seg_size = stats[s, cv2.CC_STAT_AREA]
            if seg_size < H * W * 0.0001:  # alternative thresholds tried: 0.008, 0.0002
                continue

            # calculate the average embedding of each superpixel
            superpixel = labels == s
            superpixel = superpixel.astype(float)
            npixel = np.sum(superpixel)
            avg_scores[s] = np.sum(seg_c * superpixel) / npixel
            avg_embed[s] = np.sum(emb_c * superpixel) / npixel

            # get the bounding boxes of superpixels in X1Y1X2Y2 format
            bboxes[s, 0] = stats[s, cv2.CC_STAT_LEFT]
            bboxes[s, 1] = stats[s, cv2.CC_STAT_TOP]
            bboxes[s, 2] = stats[s, cv2.CC_STAT_WIDTH] + bboxes[s, 0]
            bboxes[s, 3] = stats[s, cv2.CC_STAT_HEIGHT] + bboxes[s, 1]

        # remove small segments and low-confident segments
        idx = [s for s in range(nseg) if avg_scores[s] >= 0.2]  # 0.2
        avg_embed = avg_embed[idx]
        avg_scores = avg_scores[idx]
        bboxes = bboxes[idx, :]
        nseg = len(avg_scores)

        # Step 2: group the super-pixels of the same object according to embedding
        bmerged = np.zeros(nseg, dtype=bool)
        merged_bboxes = bboxes
        areas = np.zeros(nseg, dtype=float)
        for s in range(nseg):
            if bmerged[s]:
                continue
            areas[s] = (merged_bboxes[s, 3] - merged_bboxes[s, 1]) * (
                merged_bboxes[s, 2] - merged_bboxes[s, 0])
            for t in range(s + 1, nseg):
                # TODO: we may take spatial distance as an auxiliary criterion
                if abs(avg_embed[s] - avg_embed[t]) < 0.2:  # 0.2
                    #                     idx = np.where(labels==t)
                    #                     labels[idx] = s

                    # merge the bounding boxes
                    merged_bboxes[s, 0] = min(merged_bboxes[s, 0],
                                              merged_bboxes[t, 0])
                    merged_bboxes[s, 1] = min(merged_bboxes[s, 1],
                                              merged_bboxes[t, 1])
                    merged_bboxes[s, 2] = max(merged_bboxes[s, 2],
                                              merged_bboxes[t, 2])
                    merged_bboxes[s, 3] = max(merged_bboxes[s, 3],
                                              merged_bboxes[t, 3])

                    areas[s] = (merged_bboxes[s, 3] - merged_bboxes[s, 1]) * (
                        merged_bboxes[s, 2] - merged_bboxes[s, 0])

                    # merge the scores
                    avg_scores[s] = max(avg_scores[s], avg_scores[t])

                    bmerged[t] = True

        ileft = [
            s for s in range(nseg)
            if not bmerged[s] and areas[s] > H * W * 0.002
        ]
        avg_scores = avg_scores[ileft]
        bboxes = merged_bboxes[ileft, :].astype(np.int32)
        nseg = len(avg_scores)
        masks = []
        for i in range(nseg):
            mask = np.zeros(img_size, dtype=float)
            # bboxes are in X1Y1X2Y2 order, so index rows by y and columns by x
            mask[bboxes[i, 1]:bboxes[i, 3],
                 bboxes[i, 0]:bboxes[i, 2]] = seg_c[bboxes[i, 1]:bboxes[i, 3],
                                                    bboxes[i, 0]:bboxes[i, 2]]
            masks.append(mask)

        pred_boxes.append(bboxes)
        pred_scores.append(avg_scores)
        pred_classes += [c] * len(avg_scores)
        pred_masks += masks

    # rescale the bounding boxes to match the output resolution
    scale_x, scale_y = (output_width / img_size[1],
                        output_height / img_size[0])
    result = Instances((output_height, output_width))  # img_size
    output_boxes = Boxes(torch.tensor(np.concatenate(pred_boxes).astype(int)))
    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(result.image_size)
    result.pred_boxes = output_boxes
    result.scores = torch.tensor(np.concatenate(pred_scores))
    result.pred_classes = torch.tensor(pred_classes)
    #     result.pred_masks = torch.tensor(np.concatenate(pred_masks))  #TODO: we have to rescale to the output size

    return result
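
A standalone sketch of Step 1 above (foreground map to superpixel bounding boxes via connected components). Only cv2.connectedComponentsWithStats and the X1Y1X2Y2 convention come from the snippet; the helper name superpixels_to_boxes is hypothetical.

import cv2
import numpy as np

def superpixels_to_boxes(binary_map):
    # binary_map: (H, W) boolean foreground mask. Component 0 is the background
    # and is skipped, per the note in the function above.
    retval, labels, stats, centroids = cv2.connectedComponentsWithStats(
        binary_map.astype(np.uint8))
    boxes = np.zeros((retval - 1, 4))
    for s in range(1, retval):
        x1 = stats[s, cv2.CC_STAT_LEFT]
        y1 = stats[s, cv2.CC_STAT_TOP]
        boxes[s - 1] = [x1, y1,
                        x1 + stats[s, cv2.CC_STAT_WIDTH],
                        y1 + stats[s, cv2.CC_STAT_HEIGHT]]
    return boxes, labels
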
Example #6
def seg_det_postprocess(segmap, contour, emb, img_size, output_height,
                        output_width):
    """
    Translate segmentation predictions into detection results.

    The input images are often resized when entering the semantic segmentor. Moreover, in some
    cases, they are also padded inside the segmentor to be divisible by the maximum network stride.
    As a result, we often need the predictions of the segmentor at a different
    resolution from its inputs.

    Args:
        segmap (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        contour (Tensor): contour prediction logits. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        emb (Tensor): embedding prediction. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        img_size (tuple): the (height, width) of the resized input image, without padding.
        output_height, output_width: the desired output resolution.

    Returns:
        result (Instances): detection results at the (output_height, output_width) resolution,
            with pred_boxes, scores and pred_classes fields.
    """
    segmap = segmap[:, :img_size[0], :img_size[1]].cpu().numpy()
    contour = contour[:, :img_size[0], :img_size[1]].cpu().numpy()
    emb = emb[:, :img_size[0], :img_size[1]].cpu().numpy()

    ncls = segmap.shape[0] - 1  # remove the background
    assert (contour.shape[0] == ncls)
    assert (emb.shape[0] == ncls)
    H = segmap.shape[1]
    W = segmap.shape[2]

    pred_boxes = []
    pred_scores = []
    pred_classes = []
    pred_masks = []
    # Step 1: segment the foreground (according to segmap) into superpixels (separated by contours)
    for c in range(ncls):
        cont_c = contour[c]
        seg_c = segmap[c]
        emb_c = emb[c]
        # TODO: we may need to turn the contour map and segmap into binary images.
        # For now we combine the contour map and the segmentation before connecting superpixels, for simplicity.
        bw = (1 - 1.5 * cont_c) * seg_c > 0.2  # alternatives tried: threshold 0.05, or 1 - cont_c > 0.5
        retval, labels, stats, centroids = cv2.connectedComponentsWithStats(
            bw.astype(np.uint8))
        nseg = np.max(labels)
        # Note: the background is labeled 0, which should be ignored
        assert (retval == nseg + 1)
        avg_embed = np.zeros(nseg)
        avg_scores = np.zeros(nseg)
        bboxes = np.zeros((nseg, 4))
        bboxes_size = np.zeros(nseg)
        sizes = np.zeros(nseg)
        for s in range(nseg):
            sizes[s] = stats[s + 1, cv2.CC_STAT_AREA]
            if sizes[s] < H * W * 0.0001:
                continue

            # calculate the average embedding of each superpixel
            superpixel = labels == s + 1
            superpixel = superpixel.astype(float)
            npixel = np.sum(superpixel)
            avg_scores[s] = np.sum(seg_c * superpixel) / npixel

            # use the median embedding of the segment
            # (an alternative is the mean: np.sum(emb_c * superpixel) / npixel)
            ipixels = np.nonzero(labels == s + 1)
            avg_embed[s] = np.median(emb_c[ipixels])

            # get the bounding boxes of superpixels in X1Y1X2Y2 format
            bboxes[s, 0] = stats[s + 1, cv2.CC_STAT_LEFT]
            bboxes[s, 1] = stats[s + 1, cv2.CC_STAT_TOP]
            bboxes[s, 2] = stats[s + 1, cv2.CC_STAT_WIDTH] + bboxes[s, 0]
            bboxes[s, 3] = stats[s + 1, cv2.CC_STAT_HEIGHT] + bboxes[s, 1]
            bboxes_size[s] = (bboxes[s, 3] - bboxes[s, 1]) * (bboxes[s, 2] -
                                                              bboxes[s, 0])

        # Step 2: remove low-confident segments
        idx = [s for s in range(nseg) if avg_scores[s] >= 0.2]  # 0.05
        avg_embed = avg_embed[idx]
        avg_scores = avg_scores[idx]
        bboxes = bboxes[idx, :]
        bboxes_size = bboxes_size[idx]
        sizes = sizes[idx]
        nseg = len(avg_scores)

        # Step 3: Sort the segments by size
        sorted_idx = np.flip(np.argsort(sizes))
        avg_embed = avg_embed[sorted_idx]
        avg_scores = avg_scores[sorted_idx]
        bboxes = bboxes[sorted_idx, :]
        bboxes_size = bboxes_size[sorted_idx]
        sizes = sizes[sorted_idx]

        # Step 4: calculate the similarity between each pair of segments
        sim = np.zeros((nseg, nseg))
        if nseg >= 2:
            emb_sigma = avg_embed.std()
        else:
            emb_sigma = 0.5

        # alternatives tried for SIM_EMB_FACTOR: 1.5, 5 * (avg_embed.max() - avg_embed.min()) / nseg,
        # and 1.0 / (emb_sigma * np.sqrt(2 * np.pi))
        SIM_EMB_FACTOR = 0.8
        for s in range(nseg):
            for t in range(s + 1, nseg):
                # embedding similarity: exponential decay with the embedding distance
                # (alternative kernels that were tried:)
                # sim_emb = np.exp(-SIM_EMB_FACTOR * np.abs(avg_embed[s] - avg_embed[t]) / emb_var)
                # sim_emb = SIM_EMB_FACTOR * np.exp(-(avg_embed[s] - avg_embed[t])**2 / (2 * 2 * emb_sigma**2))
                # sim_emb = np.exp(-4 * np.abs(avg_embed[s] - avg_embed[t]) / (np.abs(avg_embed[s]) + np.abs(avg_embed[t])))
                sim_emb = np.exp(-np.abs(avg_embed[s] - avg_embed[t]) /
                                 SIM_EMB_FACTOR)

                # spatial similarity based on a GIoU-like ratio: union area over enclosing-box area
                merged_bbox = np.zeros(4)
                merged_bbox[0] = min(bboxes[s, 0], bboxes[t, 0])
                merged_bbox[1] = min(bboxes[s, 1], bboxes[t, 1])
                merged_bbox[2] = max(bboxes[s, 2], bboxes[t, 2])
                merged_bbox[3] = max(bboxes[s, 3], bboxes[t, 3])

                merged_area = (merged_bbox[3] - merged_bbox[1]) * (
                    merged_bbox[2] - merged_bbox[0])

                overlap_bbox = np.zeros(4)
                overlap_bbox[0] = max(bboxes[s, 0], bboxes[t, 0])
                overlap_bbox[1] = max(bboxes[s, 1], bboxes[t, 1])
                overlap_bbox[2] = min(bboxes[s, 2], bboxes[t, 2])
                overlap_bbox[3] = min(bboxes[s, 3], bboxes[t, 3])

                overlap_area = max(0, overlap_bbox[2] - overlap_bbox[0]) * max(
                    0, overlap_bbox[3] - overlap_bbox[1])

                sim_spatial = (bboxes_size[s] + bboxes_size[t] -
                               overlap_area) / merged_area

                #TODO: calculate contour-based distance

                sim[s, t] = sim_spatial * sim_emb  #
                sim[t, s] = sim[s, t]

        #TODO: calculate the keypoint-based similarity

        # Step 5: group the segments of the same object according to the similarity matrix
        bmerged = np.zeros(nseg, dtype=bool)
        group_IDs = np.ones(nseg, dtype=int) * -1
        ngroups = 0
        THR_SIM = 0.5
        while any(group_IDs < 0):
            for s in range(nseg):
                if group_IDs[s] < 0:
                    # find out the closest segment
                    assigned = np.nonzero(group_IDs >= 0)
                    if assigned[0].size == 0:
                        group_IDs[s] = ngroups
                        ngroups += 1
                    else:
                        sim_group = sim[s, assigned[0]]
                        t = np.argmax(sim_group)
                        if sim_group[t] > THR_SIM:
                            group_IDs[s] = group_IDs[assigned[0][t]]
                        else:
                            group_IDs[s] = ngroups
                            ngroups += 1

        # merge the groups
        group_bboxes = np.zeros((ngroups, 4))
        group_scores = np.zeros(ngroups)
        group_areas = np.zeros(ngroups)
        for g in range(ngroups):
            assigned = np.nonzero(group_IDs == g)
            assigned = assigned[0]
            group_bboxes[g, :] = bboxes[assigned[0], :]
            group_scores[g] = avg_scores[assigned[0]]
            group_areas[g] = sizes[assigned[0]]
            for s in range(1, len(assigned)):
                # merge the bounding boxes
                group_bboxes[g, 0] = min(bboxes[assigned[s], 0],
                                         group_bboxes[g, 0])
                group_bboxes[g, 1] = min(bboxes[assigned[s], 1],
                                         group_bboxes[g, 1])
                group_bboxes[g, 2] = max(bboxes[assigned[s], 2],
                                         group_bboxes[g, 2])
                group_bboxes[g, 3] = max(bboxes[assigned[s], 3],
                                         group_bboxes[g, 3])

                group_areas[g] += sizes[assigned[s]]

                # alternative: merge the scores by taking the max over the group
                # group_scores[g] = max(avg_scores[assigned[s]], group_scores[g])

        if nseg:
            THR_AREA = max(sizes[0] * 0.1, H * W * 0.001)
        else:
            THR_AREA = H * W * 0.001  # 0.002
        ileft = np.nonzero(group_areas > THR_AREA)
        ileft = ileft[0]
        avg_scores = group_scores[ileft]
        bboxes = group_bboxes[ileft, :].astype(np.int32)
        nseg = len(avg_scores)
        masks = []
        for i in range(nseg):
            mask = np.zeros((H, W), dtype=float)
            # bboxes are in X1Y1X2Y2 order, so index rows by y and columns by x
            mask[bboxes[i, 1]:bboxes[i, 3],
                 bboxes[i, 0]:bboxes[i, 2]] = seg_c[bboxes[i, 1]:bboxes[i, 3],
                                                    bboxes[i, 0]:bboxes[i, 2]]
            masks.append(mask)

        pred_boxes.append(bboxes)
        pred_scores.append(avg_scores)
        pred_classes += [c] * len(avg_scores)
        pred_masks += masks

    # rescale the bounding boxes to match the output resolution
    scale_x, scale_y = (output_width / img_size[1],
                        output_height / img_size[0])
    result = Instances((output_height, output_width))  # img_size
    output_boxes = Boxes(torch.tensor(np.concatenate(pred_boxes).astype(int)))
    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(result.image_size)
    result.pred_boxes = output_boxes
    result.scores = torch.tensor(np.concatenate(pred_scores))
    result.pred_classes = torch.tensor(pred_classes)
    #     result.pred_masks = torch.tensor(np.concatenate(pred_masks))  #TODO: we have to rescale to the output size

    return result
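
A standalone sketch of the Step 4 pairwise similarity above (embedding term times a GIoU-like spatial term). The box format and the SIM_EMB_FACTOR default come from the snippet; the function name pairwise_similarity is hypothetical.

import numpy as np

def pairwise_similarity(box_s, box_t, emb_s, emb_t, sim_emb_factor=0.8):
    # box_*: [x1, y1, x2, y2]; emb_*: scalar median embeddings of the two segments.
    # Embedding similarity: exponential decay with the embedding distance.
    sim_emb = np.exp(-abs(emb_s - emb_t) / sim_emb_factor)

    # Spatial similarity: union area of the two boxes over their enclosing box.
    area_s = (box_s[2] - box_s[0]) * (box_s[3] - box_s[1])
    area_t = (box_t[2] - box_t[0]) * (box_t[3] - box_t[1])
    enclosing = ((max(box_s[2], box_t[2]) - min(box_s[0], box_t[0])) *
                 (max(box_s[3], box_t[3]) - min(box_s[1], box_t[1])))
    overlap = (max(0, min(box_s[2], box_t[2]) - max(box_s[0], box_t[0])) *
               max(0, min(box_s[3], box_t[3]) - max(box_s[1], box_t[1])))
    sim_spatial = (area_s + area_t - overlap) / enclosing
    return sim_spatial * sim_emb
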