Example #1
    def test_get_bounding_box(self):
        masks = torch.tensor([
            [
                [False, False, False, True],
                [False, False, True, True],
                [False, True, True, False],
                [False, True, True, False],
            ],
            [
                [False, False, False, False],
                [False, False, True, False],
                [False, True, True, False],
                [False, True, True, False],
            ],
            torch.zeros(4, 4),
        ])
        bitmask = BitMasks(masks)
        box_true = torch.tensor([[1, 0, 4, 4], [1, 1, 3, 4], [0, 0, 0, 0]],
                                dtype=torch.float32)
        box = bitmask.get_bounding_boxes()
        self.assertTrue(torch.all(box.tensor == box_true).item())

        for box in box_true:
            poly = box[[0, 1, 2, 1, 2, 3, 0, 3]].numpy()
            mask = polygons_to_bitmask([poly], 4, 4)
            reconstruct_box = BitMasks(
                mask[None, :, :]).get_bounding_boxes()[0].tensor
            self.assertTrue(torch.all(box == reconstruct_box).item())

            reconstruct_box = PolygonMasks([[poly]
                                            ]).get_bounding_boxes()[0].tensor
            self.assertTrue(torch.all(box == reconstruct_box).item())
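
A minimal, self-contained sketch of the round trip this test exercises: a box written as a rectangular polygon, rasterized with polygons_to_bitmask, and recovered through BitMasks.get_bounding_boxes (the box value is illustrative):

import torch
from detectron2.structures import BitMasks
from detectron2.structures.masks import polygons_to_bitmask

# An axis-aligned box (x0, y0, x1, y1) written as a rectangular polygon.
box = torch.tensor([1.0, 0.0, 4.0, 4.0])
poly = box[[0, 1, 2, 1, 2, 3, 0, 3]].numpy()  # x0,y0, x1,y0, x1,y1, x0,y1

# Rasterize the polygon into a 4x4 boolean mask, then recover its bounding box.
mask = polygons_to_bitmask([poly], 4, 4)  # (H, W) bool ndarray
recovered = BitMasks(mask[None, :, :]).get_bounding_boxes().tensor[0]
assert torch.all(recovered == box)
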
Example #2
    def add_bitmasks(self, instances, im_h, im_w):
        for per_im_gt_inst in instances:
            if not per_im_gt_inst.has("gt_masks"):
                continue
            start = int(self.mask_out_stride // 2)
            if isinstance(per_im_gt_inst.get("gt_masks"), PolygonMasks):
                polygons = per_im_gt_inst.get("gt_masks").polygons
                per_im_bitmasks = []
                per_im_bitmasks_full = []
                for per_polygons in polygons:
                    bitmask = polygons_to_bitmask(per_polygons, im_h, im_w)
                    bitmask = torch.from_numpy(bitmask).to(self.device).float()
                    start = int(self.mask_out_stride // 2)
                    bitmask_full = bitmask.clone()
                    bitmask = bitmask[start::self.mask_out_stride, start::self.mask_out_stride]

                    assert bitmask.size(0) * self.mask_out_stride == im_h
                    assert bitmask.size(1) * self.mask_out_stride == im_w

                    per_im_bitmasks.append(bitmask)
                    per_im_bitmasks_full.append(bitmask_full)

                per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0)
                per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0)
            else: # RLE format bitmask
                bitmasks = per_im_gt_inst.get("gt_masks").tensor
                h, w = bitmasks.size()[1:]
                # pad to new size
                bitmasks_full = F.pad(bitmasks, (0, im_w - w, 0, im_h - h), "constant", 0)
                bitmasks = bitmasks_full[:, start::self.mask_out_stride, start::self.mask_out_stride]
                per_im_gt_inst.gt_bitmasks = bitmasks
                per_im_gt_inst.gt_bitmasks_full = bitmasks_full
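
Both branches above shrink the full-resolution mask by plain stride slicing so that gt_bitmasks lines up with feature maps produced at mask_out_stride. A small standalone sketch of that subsampling (the stride and polygon are illustrative):

import numpy as np
from detectron2.structures.masks import polygons_to_bitmask

mask_out_stride = 4
start = mask_out_stride // 2  # sample near the center of each stride-sized cell

# A square polygon on a 16x16 canvas, rasterized to a full-resolution bitmask.
poly = np.array([2.0, 2.0, 14.0, 2.0, 14.0, 14.0, 2.0, 14.0])
bitmask_full = polygons_to_bitmask([poly], 16, 16)

# Keep every mask_out_stride-th pixel, offset by start.
bitmask = bitmask_full[start::mask_out_stride, start::mask_out_stride]
assert bitmask.shape[0] * mask_out_stride == 16
assert bitmask.shape[1] * mask_out_stride == 16
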
Example #3
    def process_annotation(self, ann, mask_side_len=28):
        # Parse annotation data
        img_info = self.coco.loadImgs(ids=[ann["image_id"]])[0]
        height, width = img_info["height"], img_info["width"]
        gt_polygons = [
            np.array(p, dtype=np.float64) for p in ann["segmentation"]
        ]
        gt_bbox = BoxMode.convert(ann["bbox"], BoxMode.XYWH_ABS,
                                  BoxMode.XYXY_ABS)
        gt_bit_mask = polygons_to_bitmask(gt_polygons, height, width)

        # Run rasterize ..
        torch_gt_bbox = torch.tensor(gt_bbox).to(dtype=torch.float32).reshape(
            -1, 4)
        box_bitmasks = {
            "polygon":
            PolygonMasks([gt_polygons
                          ]).crop_and_resize(torch_gt_bbox, mask_side_len)[0],
            "gridsample":
            rasterize_polygons_with_grid_sample(gt_bit_mask, gt_bbox,
                                                mask_side_len),
            "roialign":
            BitMasks(torch.from_numpy(
                gt_bit_mask[None, :, :])).crop_and_resize(
                    torch_gt_bbox, mask_side_len)[0],
        }

        # Run paste ..
        results = defaultdict(dict)
        for k, box_bitmask in box_bitmasks.items():
            padded_bitmask, scale = pad_masks(box_bitmask[None, :, :], 1)
            scaled_boxes = scale_boxes(torch_gt_bbox, scale)

            r = results[k]
            r["old"] = paste_mask_in_image_old(padded_bitmask[0],
                                               scaled_boxes[0],
                                               height,
                                               width,
                                               threshold=0.5)
            r["aligned"] = paste_masks_in_image(box_bitmask[None, :, :],
                                                Boxes(torch_gt_bbox),
                                                (height, width))[0]

        table = []
        for rasterize_method, r in results.items():
            for paste_method, mask in r.items():
                mask = np.asarray(mask)
                iou = iou_between_full_image_bit_masks(
                    gt_bit_mask.astype("uint8"), mask)
                table.append((rasterize_method, paste_method, iou))
        return table
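
The helper above compares several rasterization paths and then pastes each crop back to full resolution. A reduced sketch of the same rasterize-then-paste cycle for a single box, using only the detectron2 calls that appear in the snippet (the project-local helpers pad_masks, scale_boxes and the IoU function are omitted; values are illustrative):

import numpy as np
import torch
from detectron2.structures import BitMasks, Boxes
from detectron2.structures.masks import polygons_to_bitmask
from detectron2.layers.mask_ops import paste_masks_in_image

height, width, mask_side_len = 32, 32, 28
poly = np.array([4.0, 4.0, 28.0, 4.0, 28.0, 28.0, 4.0, 28.0])
gt_bit_mask = polygons_to_bitmask([poly], height, width)
box = torch.tensor([[4.0, 4.0, 28.0, 28.0]])

# Rasterize the full-image mask into a mask_side_len x mask_side_len box crop ...
crop = BitMasks(torch.from_numpy(gt_bit_mask[None, :, :])).crop_and_resize(box, mask_side_len)

# ... then paste it back into full-image coordinates and compare with the original.
pasted = paste_masks_in_image(crop, Boxes(box), (height, width))[0]
inter = (pasted.numpy().astype(bool) & gt_bit_mask).sum()
union = (pasted.numpy().astype(bool) | gt_bit_mask).sum()
print("IoU:", inter / union)
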
Example #4
def get_single_image_pcd(plane_params, segmentations, height=480, width=640):
    plane_params = np.array(plane_params)
    offsets = np.maximum(np.linalg.norm(plane_params, ord=2, axis=1), 1e-5)
    norms = plane_params / offsets.reshape(-1, 1)

    if type(segmentations[0]) == dict:
        poly_segmentations = rle2polygon(segmentations)
    else:
        poly_segmentations = segmentations
    verts_list = []

    for segm, normal, offset in zip(poly_segmentations, norms, offsets):
        if len(segm) == 0:
            verts_list.append(torch.tensor([[0, 0, 0]], dtype=torch.float32))
            continue
        verts_3d = []
        bitmask = polygons_to_bitmask(segm, height=height, width=width)
        verts = np.transpose(bitmask.nonzero())
        verts_3d = get_pcd(verts[:, ::-1], normal, offset)
        verts_list.append(torch.tensor(verts_3d, dtype=torch.float32))
    return verts_list
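
The key step in this example is turning mask pixels into 2D coordinates: bitmask.nonzero() yields (row, col) indices, np.transpose stacks them into an N x 2 array, and verts[:, ::-1] flips them to (x, y) order before back-projection with the project-specific get_pcd. A tiny sketch of just that coordinate handling (get_pcd itself is not reproduced):

import numpy as np
from detectron2.structures.masks import polygons_to_bitmask

# A small triangle rasterized on an 8x8 canvas.
poly = np.array([1.0, 1.0, 6.0, 1.0, 1.0, 6.0])
bitmask = polygons_to_bitmask([poly], 8, 8)

verts = np.transpose(bitmask.nonzero())  # (N, 2) array of (row, col) = (y, x)
xy = verts[:, ::-1]                      # flip to (x, y), the order get_pcd expects
print(xy[:3])
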
Example #5
File: condinst.py Project: zhubinQAQ/Ins
    def add_bitmasks(self, instances, im_h, im_w):
        for per_im_gt_inst in instances:
            if not per_im_gt_inst.has("gt_masks"):
                continue
            polygons = per_im_gt_inst.get("gt_masks").polygons
            per_im_bitmasks = []
            per_im_bitmasks_full = []
            for per_polygons in polygons:
                bitmask = polygons_to_bitmask(per_polygons, im_h, im_w)
                bitmask = torch.from_numpy(bitmask).to(self.device).float()
                start = int(self.mask_out_stride // 2)
                bitmask_full = bitmask.clone()
                bitmask = bitmask[start::self.mask_out_stride,
                                  start::self.mask_out_stride]

                assert bitmask.size(0) * self.mask_out_stride == im_h
                assert bitmask.size(1) * self.mask_out_stride == im_w

                per_im_bitmasks.append(bitmask)
                per_im_bitmasks_full.append(bitmask_full)

            per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0)
            per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full,
                                                          dim=0)
Example #6
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        # Old version
        # else:
        #     assert mask_format == "bitmask", mask_format
        #     masks = BitMasks.from_polygon_masks(polygons, *image_size)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim)
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm)))
            # torch.from_numpy does not support array with negative stride.
            masks = BitMasks(
                torch.stack([
                    torch.from_numpy(np.ascontiguousarray(x)) for x in masks
                ]))
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
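
A small usage sketch for annotations_to_instances as defined above, with a single made-up polygon annotation in XYWH_ABS format (field values are illustrative, not from any dataset):

from detectron2.structures import BoxMode

annos = [
    {
        "bbox": [10.0, 10.0, 20.0, 20.0],  # x, y, w, h
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 0,
        # one polygon ring: x0, y0, x1, y1, ... (a 20x20 square)
        "segmentation": [[10.0, 10.0, 30.0, 10.0, 30.0, 30.0, 10.0, 30.0]],
    }
]

target = annotations_to_instances(annos, image_size=(64, 64), mask_format="bitmask")
print(target.gt_boxes)               # XYXY_ABS: [[10., 10., 30., 30.]]
print(target.gt_masks.tensor.shape)  # torch.Size([1, 64, 64])
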
Example #7
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                * "image": Tensor, image in (C, H, W) format.
                * "instances": Instances
                * "sem_seg": semantic segmentation ground truth.
                * Other information that's included in the original dicts, such as:
                  "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                each dict is the results for one image. The dict contains the following keys:

                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
                * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
                  See the return value of
                  :func:`combine_semantic_and_instance_outputs` for its format.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, SIZE_DIVISIBILITY)
        score_sem, score_inst, score_conf = self.seg_model(images.tensor)

        h, w = images.tensor.size(2), images.tensor.size(3)
        score_inst = F.upsample(input=score_inst, size=(h, w), mode='bilinear')
        score_sem = F.upsample(input=score_sem, size=(h, w), mode='bilinear')

        score_conf_softmax = self.softmax_layer(score_conf)

        score_inst_sig = self.sigmoid_layer(score_inst)
        score_inst_sig_stuff = score_inst_sig[:, :BACKGROUND_NUM]
        score_inst_sig_thing = score_inst_sig[:, BACKGROUND_NUM:]

        if "sem_seg" in batched_inputs[0]:
            gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
            gt_sem_seg = ImageList.from_tensors(gt_sem_seg, SIZE_DIVISIBILITY,
                                                IGNORE_LABEL_SEM).tensor
        else:
            gt_sem_seg = None

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None
        #pdb.set_trace()
        if self.training:
            assert (gt_sem_seg - 1 < 0).sum() == 0
            sem_seg_losses = self.criterion_sem(score_sem, gt_sem_seg - 1)

            gt_sem_seg[gt_sem_seg > BACKGROUND_NUM] = 0
            gt_stuff = F.one_hot(gt_sem_seg,
                                 num_classes=BACKGROUND_NUM + 1).permute(
                                     0, 3, 1, 2)
            gt_stuff = gt_stuff[:, 1:]

            num_inst = sum(
                [len(gt_instances[i]) for i in range(len(gt_instances))])
            num_inst = torch.as_tensor([num_inst],
                                       dtype=torch.float,
                                       device=self.device)
            if is_dist_avail_and_initialized():
                torch.distributed.all_reduce(num_inst)
            num_inst = torch.clamp(num_inst / get_world_size(), min=1).item()

            loss_stuff_dice = 0.
            loss_thing_dice = 0.
            loss_stuff_focal = 0.
            loss_conf = 0.

            for i in range(len(batched_inputs)):
                gt_inst = gt_instances[i]
                gt_classes = gt_inst.gt_classes

                if gt_inst.has('gt_masks'):
                    gt_masks = gt_inst.gt_masks
                    masks = torch.stack([
                        torch.from_numpy(
                            polygons_to_bitmask(poly, gt_inst.image_size[0],
                                                gt_inst.image_size[1])).to(
                                                    self.device)
                        for poly in gt_masks.polygons
                    ], 0)
                    masks_pad = masks.new_full(
                        (masks.shape[0], images.tensor.shape[-2],
                         images.tensor.shape[-1]), False)
                    masks_pad[:, :masks.shape[-2], :masks.shape[-1]].copy_(
                        masks)
                else:
                    masks_pad = torch.zeros(
                        [0, images.tensor.shape[-2], images.tensor.shape[-1]],
                        dtype=torch.bool,
                        device=self.device)

                row_ind, col_ind = MatchDice(score_inst_sig_thing[i:i + 1],
                                             torch.unsqueeze(masks_pad, 0),
                                             score_conf_softmax[i:i + 1],
                                             gt_classes)
                col_ind_empty = np.setdiff1d(
                    np.arange(score_inst_sig_thing[i:i + 1].shape[1]), col_ind)

                score_inst_sig_perm = torch.cat(
                    (score_inst_sig_stuff[i],
                     score_inst_sig_thing[i, col_ind, :, :]), 0)

                target_inst_perm = torch.cat(
                    (gt_stuff[i].float(), masks_pad[row_ind].float()), 0)

                loss_stuff_dice_tmp, loss_thing_dice_tmp = dice_loss(
                    score_inst_sig_perm,
                    target_inst_perm,
                    num_inst,
                    background_channels=BACKGROUND_NUM,
                    valid_mask=None,
                    sigmoid_clip=True)
                loss_stuff_dice += loss_stuff_dice_tmp
                loss_thing_dice += loss_thing_dice_tmp

                target_conf = gt_classes.new_full((score_conf.shape[1], ),
                                                  FOREGROUND_NUM)
                target_conf[:len(gt_classes[row_ind])] = gt_classes[row_ind]
                loss_conf_tmp = conf_loss(torch.cat(
                    (score_conf[i, col_ind], score_conf[i, col_ind_empty]), 0),
                                          target_conf.long(),
                                          neg_factor=10,
                                          neg_idx=FOREGROUND_NUM)
                loss_conf += loss_conf_tmp

                loss_stuff_focal_tmp = focal_loss(score_inst_sig_stuff[i],
                                                  gt_stuff[i].float(),
                                                  valid_mask=None,
                                                  sigmoid_clip=True)
                loss_stuff_focal += loss_stuff_focal_tmp

            loss_stuff_focal = loss_stuff_focal / len(batched_inputs)
            loss_stuff_dice = loss_stuff_dice / len(batched_inputs)
            loss_conf = loss_conf / len(batched_inputs)

            loss_stuff_focal = loss_stuff_focal * 100.
            loss_conf = loss_conf * 5

            losses = {}
            losses.update({"loss_sem_seg": sem_seg_losses})
            losses.update({"loss_stuff_focal": loss_stuff_focal})
            losses.update({"loss_stuff_dice": loss_stuff_dice})
            losses.update({"loss_thing_dice": loss_thing_dice})
            losses.update({"loss_conf": loss_conf})
            return losses

        score_sem_null = score_sem.new_full(
            (score_sem.shape[0], 1, score_sem.shape[-2], score_sem.shape[-1]),
            -1000.)
        processed_results = []
        for i in range(len(batched_inputs)):
            height = batched_inputs[i].get("height", images.image_sizes[i][0])
            width = batched_inputs[i].get("width", images.image_sizes[i][1])

            score_inst_sig_stuff_b = F.interpolate(score_inst_sig_stuff[
                i:i +
                1, :, :images.image_sizes[i][0], :images.image_sizes[i][1]],
                                                   size=(height, width),
                                                   mode="bilinear",
                                                   align_corners=False)
            score_inst_sig_thing_b = F.interpolate(score_inst_sig_thing[
                i:i +
                1, :, :images.image_sizes[i][0], :images.image_sizes[i][1]],
                                                   size=(height, width),
                                                   mode="bilinear",
                                                   align_corners=False)

            img_name = os.path.basename(batched_inputs[i]['file_name'])
            img_name_split = img_name.split('.')
            save_dir = '/home/yz9244/detectron2/output/vis_inst_sig'
            for j in range(80):
                pred_inst_tmp = np.asarray(
                    255 * (score_inst_sig_thing_b[0, j].cpu().numpy()),
                    dtype=np.uint8)
                img = Image.fromarray(pred_inst_tmp)
                save_img = Image.new('RGB', (img.width, 2 * img.height))

                img = Image.fromarray(pred_inst_tmp)
                save_img.paste(img, (0, 0))

                pred_inst_tmp = np.asarray(255 * (pred_inst_tmp > 127),
                                           dtype=np.uint8)
                img = Image.fromarray(pred_inst_tmp)
                save_img.paste(img, (0, img.height))
                save_img.save(
                    os.path.join(save_dir,
                                 img_name_split[0] + '_%02d.png' % (j)))

            res = {}

            score_sem_foreground = torch.log(
                torch.exp(score_sem[i:i + 1,
                                    BACKGROUND_NUM:]).sum(dim=1, keepdim=True))
            sem_seg_result = torch.cat(
                (score_sem_foreground, score_sem[i:i + 1, :BACKGROUND_NUM]), 1)
            sem_seg_r = sem_seg_postprocess(sem_seg_result[0],
                                            images.image_sizes[i], height,
                                            width)

            res.update({"sem_seg": sem_seg_r})

            result = Instances((height, width))
            inst_sem_id = torch.argmax(score_conf_softmax[i], dim=1)
            scores = score_conf_softmax[i,
                                        range(score_conf.shape[1]),
                                        inst_sem_id]
            scores = scores[inst_sem_id != FOREGROUND_NUM]
            pred_classes = inst_sem_id[inst_sem_id != FOREGROUND_NUM]
            pred_masks = score_inst_sig_thing_b[0,
                                                inst_sem_id != FOREGROUND_NUM]

            pred_mask_sum = torch.sum(pred_masks > 0.5, (1, 2))
            result.pred_masks = pred_masks[pred_mask_sum > 0] > 0.5
            result.pred_classes = pred_classes[pred_mask_sum > 0]
            result.scores = scores[pred_mask_sum > 0]

            box_tmp = torch.zeros(result.pred_masks.shape[0], 4)
            for j in range(result.pred_masks.shape[0]):
                nonzero_idx = torch.nonzero(result.pred_masks[j])
                box_tmp[j, 0] = nonzero_idx[:, 1].min().item()
                box_tmp[j, 2] = nonzero_idx[:, 1].max().item()
                box_tmp[j, 1] = nonzero_idx[:, 0].min().item()
                box_tmp[j, 3] = nonzero_idx[:, 0].max().item()
            result.pred_boxes = Boxes(box_tmp)

            #detector_r = detector_postprocess(result, height, width)
            detector_r = result
            res.update({"instances": detector_r})

            panoptic_r = combine_semantic_and_instance_outputs(
                result.scores, result.pred_classes,
                pred_masks[pred_mask_sum > 0], score_inst_sig_stuff_b[0])
            res.update({"panoptic_seg": panoptic_r})

            processed_results.append(res)

        return processed_results
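
The box_tmp loop near the end of forward() derives a tight box for each predicted mask from the min/max of its nonzero pixel coordinates. The same quantity is what BitMasks.get_bounding_boxes computes in the earlier examples; a sketch of the correspondence on toy masks (mask contents are illustrative):

import torch
from detectron2.structures import BitMasks

pred_masks = torch.zeros(2, 8, 8, dtype=torch.bool)
pred_masks[0, 2:5, 3:7] = True
pred_masks[1, 0:4, 0:2] = True

# Loop version, as in forward() above: (x_min, y_min, x_max, y_max) per mask.
box_tmp = torch.zeros(pred_masks.shape[0], 4)
for j in range(pred_masks.shape[0]):
    nonzero_idx = torch.nonzero(pred_masks[j])
    box_tmp[j, 0] = nonzero_idx[:, 1].min().item()
    box_tmp[j, 1] = nonzero_idx[:, 0].min().item()
    box_tmp[j, 2] = nonzero_idx[:, 1].max().item()
    box_tmp[j, 3] = nonzero_idx[:, 0].max().item()

# BitMasks.get_bounding_boxes uses an exclusive max (x1 = max col + 1, y1 = max row + 1),
# so it differs from the loop above by +1 on the right and bottom edges.
boxes = BitMasks(pred_masks).get_bounding_boxes().tensor
print(box_tmp)
print(boxes)
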
Example #8
def get_single_image_mesh_depth(
    depth, segmentations, img_file, height=480, width=640, webvis=True
):
    if type(segmentations[0]) == dict:
        poly_segmentations = rle2polygon(segmentations)
    else:
        poly_segmentations = segmentations
    verts_list = []
    faces_list = []
    verts_uvs = []
    img_files = []
    imgs = []

    for segm in poly_segmentations:
        if len(segm) == 0:
            continue
        verts_3d = []
        faces = []
        uvs = []
        bitmask = polygons_to_bitmask(segm, height=height, width=width)
        verts = np.transpose(bitmask.nonzero())
        vert_id_map = defaultdict(dict)
        for idx, vert in enumerate(verts):
            vert_id_map[vert[0]][vert[1]] = idx + len(verts_3d)
        pcd = get_pcd_depth(verts[:, ::-1], depth.T)
        if webvis:
            # Rotate by 11 degree around x axis to push things on the ground.
            pcd = (
                np.array([[-1, 0, 0], [0, 1, 0], [0, 0, -1]])
                @ np.array(
                    [[1, 0, 0], [0, 0.9816272, -0.1908090], [0, 0.1908090, 0.9816272]]
                )
                @ np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
                @ pcd.T
            ).T
        triangles = []
        for vert in verts:
            # upper right triangle
            if (
                vert[0] < height - 1
                and vert[1] < width - 1
                and bitmask[vert[0]][vert[1] + 1]
                and bitmask[vert[0] + 1][vert[1] + 1]
            ):
                triangles.append(
                    [
                        vert_id_map[vert[0]][vert[1]],
                        vert_id_map[vert[0] + 1][vert[1] + 1],
                        vert_id_map[vert[0]][vert[1] + 1],
                    ]
                )
            # bottom left triangle
            if (
                vert[0] < height - 1
                and vert[1] < width - 1
                and bitmask[vert[0] + 1][vert[1]]
                and bitmask[vert[0] + 1][vert[1] + 1]
            ):
                triangles.append(
                    [
                        vert_id_map[vert[0]][vert[1]],
                        vert_id_map[vert[0] + 1][vert[1]],
                        vert_id_map[vert[0] + 1][vert[1] + 1],
                    ]
                )
        triangles = np.array(triangles)
        verts_3d.extend(pcd)
        faces.extend(triangles)
        uvs.extend(
            np.array([0, 1])
            + np.array([1, -1]) * verts[:, ::-1] / np.array([width, height])
        )
        verts_list.append(torch.tensor(verts_3d, dtype=torch.float32))
        faces_list.append(torch.tensor(faces, dtype=torch.int32))
        verts_uvs.append(torch.tensor(uvs, dtype=torch.float32))
        img_files.append(img_file)
        imgs.append(torch.FloatTensor(imageio.imread(img_file)))
    verts_uvs = pad_sequence(verts_uvs, batch_first=True)
    faces_uvs = pad_sequence(faces_list, batch_first=True, padding_value=-1)
    tex = Textures(verts_uvs=verts_uvs, faces_uvs=faces_uvs, maps=imgs)

    # Initialise the mesh with textures
    meshes = Meshes(verts=verts_list, faces=faces_list, textures=tex)
    return meshes, img_files
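
The mesh above is built by turning every mask pixel into up to two triangles with its right/bottom neighbours, using vert_id_map to translate (row, col) positions into vertex indices. A compact sketch of just that neighbourhood test on a tiny bitmask (no depth or texture handling):

import numpy as np
from collections import defaultdict
from detectron2.structures.masks import polygons_to_bitmask

height = width = 6
poly = np.array([1.0, 1.0, 4.0, 1.0, 4.0, 4.0, 1.0, 4.0])
bitmask = polygons_to_bitmask([poly], height, width)

verts = np.transpose(bitmask.nonzero())  # (N, 2) pixel coordinates
vert_id_map = defaultdict(dict)
for idx, (r, c) in enumerate(verts):
    vert_id_map[r][c] = idx              # pixel -> vertex index

triangles = []
for r, c in verts:
    # upper right triangle
    if r < height - 1 and c < width - 1 and bitmask[r][c + 1] and bitmask[r + 1][c + 1]:
        triangles.append([vert_id_map[r][c], vert_id_map[r + 1][c + 1], vert_id_map[r][c + 1]])
    # bottom left triangle
    if r < height - 1 and c < width - 1 and bitmask[r + 1][c] and bitmask[r + 1][c + 1]:
        triangles.append([vert_id_map[r][c], vert_id_map[r + 1][c], vert_id_map[r + 1][c + 1]])
print(np.array(triangles).shape)         # (num_triangles, 3)
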
Example #9
def get_single_image_mesh(
    plane_params,
    segmentations,
    img_file,
    height=480,
    width=640,
    focal_length=517.97,
    webvis=False,
    reduce_size=True,
):
    plane_params = np.array(plane_params)
    offsets = np.linalg.norm(plane_params, ord=2, axis=1)
    norms = plane_params / offsets.reshape(-1, 1)

    if type(segmentations[0]) == dict:
        poly_segmentations = rle2polygon(segmentations)
    else:
        poly_segmentations = segmentations
    verts_list = []
    faces_list = []
    verts_uvs = []
    img_files = []
    imgs = []

    for segm, normal, offset in zip(poly_segmentations, norms, offsets):
        if len(segm) == 0:
            continue
        verts_3d = []
        faces = []
        uvs = []
        if reduce_size:
            for ring in segm:
                verts = np.array(ring).reshape(-1, 2)
                # get 3d pointcloud
                pcd = get_pcd(verts, normal, offset, focal_length)
                if webvis:
                    # Rotate by 11 degree around x axis to push things on the ground.
                    pcd = (
                        np.array([[-1, 0, 0], [0, 1, 0], [0, 0, -1]])
                        @ np.array(
                            [
                                [1, 0, 0],
                                [0, 0.9816272, -0.1908090],
                                [0, 0.1908090, 0.9816272],
                            ]
                        )
                        @ np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
                        @ pcd.T
                    ).T
                # triangulate polygon using earcut algorithm
                triangles = earcut.triangulate_float32(verts, [len(verts)])
                # add base index of vertice
                triangles += len(verts_3d)
                triangles = triangles.reshape(-1, 3)
                # convert to counter-clockwise
                triangles[:, [0, 2]] = triangles[:, [2, 0]]
                verts_3d.extend(pcd)
                faces.extend(triangles)
                uvs.extend(
                    np.array([0, 1])
                    + np.array([1, -1]) * verts / np.array([width, height])
                )

        else:
            bitmask = polygons_to_bitmask(segm, height=height, width=width)
            verts = np.transpose(bitmask.nonzero())
            vert_id_map = defaultdict(dict)
            for idx, vert in enumerate(verts):
                vert_id_map[vert[0]][vert[1]] = idx + len(verts_3d)

            verts_3d = get_pcd(verts[:, ::-1], normal, offset)
            if webvis:
                # Rotate by 11 degree around x axis to push things on the ground.
                verts_3d = (
                    np.array([[-1, 0, 0], [0, 1, 0], [0, 0, -1]])
                    @ np.array(
                        [
                            [1, 0, 0],
                            [0, 0.9816272, -0.1908090],
                            [0, 0.1908090, 0.9816272],
                        ]
                    )
                    @ np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
                    @ verts_3d.T
                ).T
            triangles = []
            for vert in verts:
                # upper right triangle
                if (
                    vert[0] < height - 1
                    and vert[1] < width - 1
                    and bitmask[vert[0]][vert[1] + 1]
                    and bitmask[vert[0] + 1][vert[1] + 1]
                ):
                    triangles.append(
                        [
                            vert_id_map[vert[0]][vert[1]],
                            vert_id_map[vert[0] + 1][vert[1] + 1],
                            vert_id_map[vert[0]][vert[1] + 1],
                        ]
                    )
                # bottom left triangle
                if (
                    vert[0] < height - 1
                    and vert[1] < width - 1
                    and bitmask[vert[0] + 1][vert[1]]
                    and bitmask[vert[0] + 1][vert[1] + 1]
                ):
                    triangles.append(
                        [
                            vert_id_map[vert[0]][vert[1]],
                            vert_id_map[vert[0] + 1][vert[1]],
                            vert_id_map[vert[0] + 1][vert[1] + 1],
                        ]
                    )
            triangles = np.array(triangles)
            faces.extend(triangles)
            uvs.extend(
                np.array([0, 1])
                + np.array([1, -1]) * verts[:, ::-1] / np.array([width, height])
            )
        verts_list.append(torch.tensor(verts_3d, dtype=torch.float32))
        faces_list.append(torch.tensor(faces, dtype=torch.int32))
        verts_uvs.append(torch.tensor(uvs, dtype=torch.float32))
        img_files.append(img_file)
        imgs.append(torch.FloatTensor(imageio.imread(img_file)))
    verts_uvs = pad_sequence(verts_uvs, batch_first=True)
    faces_uvs = pad_sequence(faces_list, batch_first=True, padding_value=-1)

    tex = Textures(verts_uvs=verts_uvs, faces_uvs=faces_uvs, maps=imgs)

    # Initialise the mesh with textures
    meshes = Meshes(verts=verts_list, faces=faces_list, textures=tex)
    return meshes, img_files
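
When reduce_size is True, each polygon ring is triangulated directly with earcut instead of being rasterized by polygons_to_bitmask. A minimal sketch of that call, assuming earcut refers to the mapbox_earcut binding (an assumption; the snippet above does not show its import):

import numpy as np
import mapbox_earcut as earcut  # assumed binding for the earcut.* calls above

# One square ring as an (N, 2) float32 array of x, y vertices.
verts = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]], dtype=np.float32)

# The second argument lists the end index of each ring (a single ring of 4 vertices here).
triangles = earcut.triangulate_float32(verts, [len(verts)]).reshape(-1, 3)
print(triangles)  # two triangles covering the square
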