Example No. 1
    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns: entry dict with the transformed image, its size, the
            ground-truth boxes and classes, and bookkeeping fields
            (scale, index, image_id, flipped, fn)
        """
        img_id = self.ids[index]
        path = self.coco.loadImgs(img_id)[0]['file_name']
        image_unpadded = Image.open(os.path.join(self.root,
                                                 path)).convert('RGB')
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        gt_classes = np.array([self.id_to_ind[x['category_id']] for x in anns],
                              dtype=np.int64)

        if np.any(gt_classes >= len(self.ind_to_classes)):
            raise ValueError("OH NO {}".format(index))

        if len(anns) == 0:
            raise ValueError("Annotations should not be empty for index {}".format(index))

        # COCO boxes are stored as [x, y, w, h]
        gt_boxes = np.array([x['bbox'] for x in anns], dtype=np.float32)

        if np.any(gt_boxes[:, [0, 1]] < 0):
            raise ValueError("GT boxes have negative x/y coordinates")
        if np.any(gt_boxes[:, [2, 3]] < 0):
            raise ValueError("GT boxes have negative width/height")
        # Convert [x, y, w, h] to [x1, y1, x2, y2]
        gt_boxes[:, [2, 3]] += gt_boxes[:, [0, 1]]

        # Rescale so that the boxes are at BOX_SCALE
        if self.is_train:
            image_unpadded, gt_boxes = random_crop(
                image_unpadded,
                gt_boxes * BOX_SCALE / max(image_unpadded.size),
                BOX_SCALE,
                round_boxes=False,
            )
        else:
            # At test time the boxes are not cropped, only rescaled to BOX_SCALE.
            gt_boxes = gt_boxes * BOX_SCALE / max(image_unpadded.size)
        w, h = image_unpadded.size
        box_scale_factor = BOX_SCALE / max(w, h)

        # During training, flip the image horizontally with probability 0.5
        flipped = self.is_train and np.random.random() > 0.5
        if flipped:
            scaled_w = int(box_scale_factor * float(w))
            image_unpadded = image_unpadded.transpose(Image.FLIP_LEFT_RIGHT)
            # Mirror the box x-coordinates around the scaled image width
            gt_boxes[:, [0, 2]] = scaled_w - gt_boxes[:, [2, 0]]

        # Final (height, width, scale) after resizing the longer side to IM_SCALE
        img_scale_factor = IM_SCALE / max(w, h)
        if h > w:
            im_size = (IM_SCALE, int(w * img_scale_factor), img_scale_factor)
        elif h < w:
            im_size = (int(h * img_scale_factor), IM_SCALE, img_scale_factor)
        else:
            im_size = (IM_SCALE, IM_SCALE, img_scale_factor)

        entry = {
            'img': self.transform_pipeline(image_unpadded),
            'img_size': im_size,
            'gt_boxes': gt_boxes,
            'gt_classes': gt_classes,
            'scale': IM_SCALE / BOX_SCALE,
            'index': index,
            'image_id': img_id,
            'flipped': flipped,
            'fn': path,
        }

        return entry
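
The box handling above can be exercised on its own: COCO stores boxes as [x, y, w, h], the snippet converts them to [x1, y1, x2, y2] corners, rescales them so that the longer image side maps to BOX_SCALE, and mirrors the x-coordinates when the image is flipped horizontally. The sketch below reproduces those steps for the test-time path (no random crop) in plain NumPy; the function name prepare_boxes and the value 1024 for BOX_SCALE are illustrative assumptions, not values taken from the snippet.

import numpy as np

BOX_SCALE = 1024  # assumed value; the snippet imports this constant from its own config


def prepare_boxes(coco_boxes, img_w, img_h, flip=False):
    """Mimic the box handling in __getitem__: xywh -> xyxy, rescale, optional flip."""
    boxes = np.asarray(coco_boxes, dtype=np.float32)
    # COCO stores [x, y, w, h]; convert to [x1, y1, x2, y2]
    boxes[:, [2, 3]] += boxes[:, [0, 1]]
    # Rescale so the longer image side maps to BOX_SCALE
    scale = BOX_SCALE / max(img_w, img_h)
    boxes = boxes * scale
    if flip:
        scaled_w = int(scale * float(img_w))
        # Mirror x-coordinates, swapping x1/x2 so that x1 <= x2 still holds
        boxes[:, [0, 2]] = scaled_w - boxes[:, [2, 0]]
    return boxes


# A 640x480 image with one 100x50 box at (10, 20)
print(prepare_boxes([[10, 20, 100, 50]], img_w=640, img_h=480))
print(prepare_boxes([[10, 20, 100, 50]], img_w=640, img_h=480, flip=True))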
Example No. 2
    def __getitem__(self, index):
        """
    Get the pixels of an image, and a random synthetic scene graph for that
    image constructed on-the-fly from its COCO object annotations. We assume
    that the image will have height H, width W, C channels; there will be O
    object annotations, each of which will have both a bounding box and a
    segmentation mask of shape (M, M). There will be T triples in the scene
    graph.

    Returns a tuple of:
    - image: FloatTensor of shape (C, H, W)
    - objs: LongTensor of shape (O,)
    - boxes: FloatTensor of shape (O, 4) giving boxes for objects in
      (x0, y0, x1, y1) format, in a [0, 1] coordinate system
    - masks: LongTensor of shape (O, M, M) giving segmentation masks for
      objects, where 0 is background and 1 is object.
    - triples: LongTensor of shape (T, 3) where triples[t] = [i, p, j]
      means that (objs[i], p, objs[j]) is a triple.
    """
        image_id = self.ids[index]

        filename = self.image_id_to_filename[image_id]
        image_path = os.path.join(self.image_dir, filename)
        with open(image_path, 'rb') as f:
            with PIL.Image.open(f) as image:
                image_unpadded = image.convert('RGB')

        objs, boxes = [], []
        for object_data in self.image_id_to_objects[image_id]:
            objs.append(object_data['category_id'])
            # COCO boxes are stored as [x, y, w, h]; keep pixel coordinates and
            # convert to [x0, y0, x1, y1] corners.
            x, y, w, h = object_data['bbox']
            boxes.append([x, y, x + w, y + h])

        # The original loader also built per-object segmentation masks, appended a
        # dummy __image__ object, and converted everything to torch tensors here;
        # this version only keeps the object classes and boxes as numpy arrays.
        gt_classes = np.array(objs, dtype=np.int64)
        gt_boxes = np.array(boxes, dtype=np.float32)
        # The remainder of the original loader (mask stacking, object centres, and
        # geometric relationship triples: surrounding / inside / left of / right of /
        # above / below, plus __in_image__ relations) is likewise unused; only the
        # classes and boxes above are carried forward.

        # Rescale so that the boxes are at BOX_SCALE
        if self.is_train:
            image_unpadded, gt_boxes = random_crop(
                image_unpadded,
                gt_boxes * BOX_SCALE / max(image_unpadded.size),
                BOX_SCALE,
                round_boxes=False,
            )
        else:
            # At test time the boxes are not cropped, only rescaled to BOX_SCALE.
            gt_boxes = gt_boxes * BOX_SCALE / max(image_unpadded.size)
        w, h = image_unpadded.size
        box_scale_factor = BOX_SCALE / max(w, h)

        # During training, flip the image horizontally with probability 0.5
        flipped = self.is_train and np.random.random() > 0.5
        if flipped:
            scaled_w = int(box_scale_factor * float(w))
            image_unpadded = image_unpadded.transpose(
                PIL.Image.FLIP_LEFT_RIGHT)
            # Mirror the box x-coordinates around the scaled image width
            gt_boxes[:, [0, 2]] = scaled_w - gt_boxes[:, [2, 0]]

        # Final (height, width, scale) after resizing the longer side to IM_SCALE
        img_scale_factor = IM_SCALE / max(w, h)
        if h > w:
            im_size = (IM_SCALE, int(w * img_scale_factor), img_scale_factor)
        elif h < w:
            im_size = (int(h * img_scale_factor), IM_SCALE, img_scale_factor)
        else:
            im_size = (IM_SCALE, IM_SCALE, img_scale_factor)

        entry = {
            'img': self.transform_pipeline(image_unpadded),
            'img_size': im_size,
            'gt_boxes': gt_boxes,
            'gt_classes': gt_classes,
            'scale': IM_SCALE / BOX_SCALE,
            'index': index,
            'image_id': image_id,
            'flipped': flipped,
            'fn': image_path,
        }

        return entry
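
Since each entry carries a variable number of ground-truth boxes, a dataset exposing either __getitem__ above would normally be wrapped in a torch DataLoader with a collate function that stacks the image tensors and keeps the per-image arrays as lists. The sketch below is one minimal, hypothetical way to do that, assuming transform_pipeline yields tensors of a common size (otherwise the images would need padding before stacking); the names coco_collate and CocoDetection are placeholders, not taken from the snippets.

import torch


def coco_collate(entries):
    """Stack the image tensors; keep the variable-length per-image fields as lists."""
    batch = {'img': torch.stack([e['img'] for e in entries], dim=0)}
    for key in ('img_size', 'gt_boxes', 'gt_classes', 'scale',
                'index', 'image_id', 'flipped', 'fn'):
        batch[key] = [e[key] for e in entries]
    return batch


# Hypothetical usage; CocoDetection stands for whichever class owns __getitem__ above.
# loader = torch.utils.data.DataLoader(CocoDetection(...), batch_size=4,
#                                      shuffle=True, collate_fn=coco_collate)
# for batch in loader:
#     images = batch['img']        # (B, C, H, W), assuming a fixed transform size
#     boxes = batch['gt_boxes']    # list of (O_i, 4) float32 arrays at BOX_SCALE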