예제 #1
0
 def get_image(self, idx):
     img_filename = os.path.join(self.image_dir, '%06d.jpg' % (idx))
     return sunrgbd_utils.load_image(img_filename)
예제 #2
0
    def __getitem__(self, idx):
        """
        Returns a dict with following keys:
            point_clouds: (N,3+C)
            center_label: (MAX_NUM_OBJ,3) for GT box center XYZ
            heading_class_label: (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
            heading_residual_label: (MAX_NUM_OBJ,)
            size_classe_label: (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
            size_residual_label: (MAX_NUM_OBJ,3)
            sem_cls_label: (MAX_NUM_OBJ,) semantic class index
            box_label_mask: (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
            vote_label: (N,9) with votes XYZ (3 votes: X1Y1Z1, X2Y2Z2, X3Y3Z3)
                if there is only one vote than X1==X2==X3 etc.
            vote_label_mask: (N,) with 0/1 with 1 indicating the point
                is in one of the object's OBB.
            scan_idx: int scan index in scan_names list
            max_gt_bboxes: unused
        """
        scan_name = self.scan_names[idx]
        point_cloud = np.load(os.path.join(self.data_path, scan_name)+'_pc.npz')['pc'] # Nx6
        bboxes = np.load(os.path.join(self.data_path, scan_name)+'_bbox.npy') # K,8
        point_votes = np.load(os.path.join(self.data_path, scan_name)+'_votes.npz')['point_votes'] # Nx10
        if self.use_imvote:
            # Read camera parameters
            calib_lines = [line for line in open(os.path.join(self.raw_data_path, 'calib', scan_name+'.txt')).readlines()]
            calib_Rtilt = np.reshape(np.array([float(x) for x in calib_lines[0].rstrip().split(' ')]), (3,3), 'F')
            calib_K = np.reshape(np.array([float(x) for x in calib_lines[1].rstrip().split(' ')]), (3,3), 'F')
            # Read image
            full_img = sunrgbd_utils.load_image(os.path.join(self.raw_data_path, 'image', scan_name+'.jpg'))
            full_img_height = full_img.shape[0]
            full_img_width = full_img.shape[1]
            
            # ------------------------------- 2D IMAGE VOTES ------------------------------
            cls_id_list = self.cls_id_map[scan_name]
            cls_score_list = self.cls_score_map[scan_name]
            bbox_2d_list = self.bbox_2d_map[scan_name]
            obj_img_list = []
            for i2d, (cls2d, box2d) in enumerate(zip(cls_id_list, bbox_2d_list)):
                xmin, ymin, xmax, ymax = box2d
                # During training we randomly drop 2D boxes to reduce over-fitting
                if self.train and np.random.random()>0.5:
                    continue

                obj_img = full_img[ymin:ymax, xmin:xmax, :]
                obj_h = obj_img.shape[0]
                obj_w = obj_img.shape[1]
                # Bounding box coordinates (4 values), class id, index to the semantic cues
                meta_data = (xmin, ymin, obj_h, obj_w, cls2d, i2d)
                if obj_h == 0 or obj_w == 0:
                    continue

                # Use 2D box center as approximation
                uv_centroid = np.array([int(obj_w/2), int(obj_h/2)])
                uv_centroid = np.expand_dims(uv_centroid, 0)

                v_coords, u_coords = np.meshgrid(range(obj_h), range(obj_w), indexing='ij')
                img_vote = np.transpose(np.array([u_coords, v_coords]), (1,2,0))
                img_vote = np.expand_dims(uv_centroid, 0) - img_vote 

                obj_img_list.append((meta_data, img_vote))

            full_img_votes = np.zeros((full_img_height,full_img_width,self.vote_dims), dtype=np.float32)
            # Empty votes: 2d box index is set to -1
            full_img_votes[:,:,3::4] = -1.

            for obj_img_data in obj_img_list:
                meta_data, img_vote = obj_img_data
                u0, v0, h, w, cls2d, i2d = meta_data
                for u in range(u0, u0+w):
                    for v in range(v0, v0+h):
                        iidx = int(full_img_votes[v,u,0])
                        if iidx >= self.max_imvote_per_pixel: 
                            continue
                        full_img_votes[v,u,(1+iidx*4):(1+iidx*4+2)] = img_vote[v-v0,u-u0,:]
                        full_img_votes[v,u,(1+iidx*4+2)] = cls2d
                        full_img_votes[v,u,(1+iidx*4+3)] = i2d + 1 # add +1 here as we need a dummy feature for pixels outside all boxes
                full_img_votes[v0:(v0+h), u0:(u0+w), 0] += 1

            full_img_votes_1d = np.zeros((MAX_NUM_PIXEL*self.vote_dims), dtype=np.float32)
            full_img_votes_1d[0:full_img_height*full_img_width*self.vote_dims] = full_img_votes.flatten()

            # Semantic cues: one-hot vector for class scores
            cls_score_feats = np.zeros((1+MAX_NUM_2D_DET,NUM_CLS), dtype=np.float32)
            # First row is dumpy feature
            len_obj = len(cls_id_list)
            if len_obj:
                ind_obj = np.arange(1,len_obj+1)
                ind_cls = np.array(cls_id_list)
                cls_score_feats[ind_obj, ind_cls] = np.array(cls_score_list)

            # Texture cues: normalized RGB values
            full_img = (full_img - 128.) / 255.
            # Serialize data to 1D and save image size so that we can recover the original location in the image
            full_img_1d = np.zeros((MAX_NUM_PIXEL*3), dtype=np.float32)
            full_img_1d[:full_img_height*full_img_width*3] = full_img.flatten()

        if not self.use_color:
            point_cloud = point_cloud[:,0:3]
        else:
            point_cloud = point_cloud[:,0:6]
            point_cloud[:,3:] = (point_cloud[:,3:]-MEAN_COLOR_RGB)

        if self.use_height:
            floor_height = np.percentile(point_cloud[:,2],0.99)
            height = point_cloud[:,2] - floor_height
            point_cloud = np.concatenate([point_cloud, np.expand_dims(height, 1)],1) # (N,4) or (N,7)

        # ------------------------------- DATA AUGMENTATION ------------------------------
        scale_ratio = 1.
        if self.augment:
            flip_flag = (np.random.random()>0.5)
            if flip_flag:
                # Flipping along the YZ plane
                point_cloud[:,0] = -1 * point_cloud[:,0]
                bboxes[:,0] = -1 * bboxes[:,0]
                bboxes[:,6] = np.pi - bboxes[:,6]
                point_votes[:,[1,4,7]] = -1 * point_votes[:,[1,4,7]]

            # Rotation along up-axis/Z-axis
            rot_angle = (np.random.random()*np.pi/3) - np.pi/6 # -30 ~ +30 degree
            rot_mat = sunrgbd_utils.rotz(rot_angle)

            point_votes_end = np.zeros_like(point_votes)
            point_votes_end[:,1:4] = np.dot(point_cloud[:,0:3] + point_votes[:,1:4], np.transpose(rot_mat))
            point_votes_end[:,4:7] = np.dot(point_cloud[:,0:3] + point_votes[:,4:7], np.transpose(rot_mat))
            point_votes_end[:,7:10] = np.dot(point_cloud[:,0:3] + point_votes[:,7:10], np.transpose(rot_mat))

            point_cloud[:,0:3] = np.dot(point_cloud[:,0:3], np.transpose(rot_mat))
            bboxes[:,0:3] = np.dot(bboxes[:,0:3], np.transpose(rot_mat))
            bboxes[:,6] -= rot_angle
            point_votes[:,1:4] = point_votes_end[:,1:4] - point_cloud[:,0:3]
            point_votes[:,4:7] = point_votes_end[:,4:7] - point_cloud[:,0:3]
            point_votes[:,7:10] = point_votes_end[:,7:10] - point_cloud[:,0:3]

            if self.use_imvote:
                R_inverse = np.copy(np.transpose(rot_mat))
                if flip_flag:
                    R_inverse[0,:] *= -1
                # Update Rtilt according to the augmentation
                # R_inverse (3x3) * point (3x1) transforms an augmented depth point
                # to original point in upright_depth coordinates
                calib_Rtilt = np.dot(np.transpose(R_inverse), calib_Rtilt) 

            # Augment RGB color
            if self.use_color:
                rgb_color = point_cloud[:,3:6] + MEAN_COLOR_RGB
                rgb_color *= (1+0.4*np.random.random(3)-0.2) # brightness change for each channel
                rgb_color += (0.1*np.random.random(3)-0.05) # color shift for each channel
                rgb_color += np.expand_dims((0.05*np.random.random(point_cloud.shape[0])-0.025), -1) # jittering on each pixel
                rgb_color = np.clip(rgb_color, 0, 1)
                # randomly drop out 30% of the points' colors
                rgb_color *= np.expand_dims(np.random.random(point_cloud.shape[0])>0.3,-1)
                point_cloud[:,3:6] = rgb_color - MEAN_COLOR_RGB

            # Augment point cloud scale: 0.85x-1.15x
            scale_ratio = np.random.random()*0.3+0.85
            if self.use_imvote:
                calib_Rtilt = np.dot(np.array([[scale_ratio,0,0],[0,scale_ratio,0],[0,0,scale_ratio]]), calib_Rtilt)
            scale_ratio_expand = np.expand_dims(np.tile(scale_ratio,3),0)
            point_cloud[:,0:3] *= scale_ratio_expand
            bboxes[:,0:3] *= scale_ratio_expand
            bboxes[:,3:6] *= scale_ratio_expand
            point_votes[:,1:4] *= scale_ratio_expand
            point_votes[:,4:7] *= scale_ratio_expand
            point_votes[:,7:10] *= scale_ratio_expand
            if self.use_height:
                point_cloud[:,-1] *= scale_ratio

        # ------------------------------- LABELS ------------------------------
        box3d_centers = np.zeros((MAX_NUM_OBJ, 3))
        box3d_sizes = np.zeros((MAX_NUM_OBJ, 3))
        angle_classes = np.zeros((MAX_NUM_OBJ,))
        angle_residuals = np.zeros((MAX_NUM_OBJ,))
        size_classes = np.zeros((MAX_NUM_OBJ,))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))
        label_mask = np.zeros((MAX_NUM_OBJ))
        label_mask[0:bboxes.shape[0]] = 1
        max_bboxes = np.zeros((MAX_NUM_OBJ, 8))
        max_bboxes[0:bboxes.shape[0],:] = bboxes

        for i in range(bboxes.shape[0]):
            bbox = bboxes[i]
            semantic_class = bbox[7]
            box3d_center = bbox[0:3]
            angle_class, angle_residual = DC.angle2class(bbox[6])
            # NOTE: The mean size stored in size2class is of full length of box edges,
            # while in sunrgbd_data.py data dumping we dumped *half* length l,w,h.. so have to time it by 2 here 
            box3d_size = bbox[3:6]*2
            size_class, size_residual = DC.size2class(box3d_size, DC.class2type[semantic_class])
            box3d_centers[i,:] = box3d_center
            angle_classes[i] = angle_class
            angle_residuals[i] = angle_residual
            size_classes[i] = size_class
            size_residuals[i] = size_residual
            box3d_sizes[i,:] = box3d_size

        target_bboxes_mask = label_mask 
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        for i in range(bboxes.shape[0]):
            bbox = bboxes[i]
            corners_3d = sunrgbd_utils.my_compute_box_3d(bbox[0:3], bbox[3:6], bbox[6])
            # compute axis aligned box
            xmin = np.min(corners_3d[:,0])
            ymin = np.min(corners_3d[:,1])
            zmin = np.min(corners_3d[:,2])
            xmax = np.max(corners_3d[:,0])
            ymax = np.max(corners_3d[:,1])
            zmax = np.max(corners_3d[:,2])
            target_bbox = np.array([(xmin+xmax)/2, (ymin+ymax)/2, (zmin+zmax)/2, xmax-xmin, ymax-ymin, zmax-zmin])
            target_bboxes[i,:] = target_bbox

        point_cloud, choices = pc_util.random_sampling(point_cloud, self.num_points, return_choices=True)
        point_votes_mask = point_votes[choices,0]
        point_votes = point_votes[choices,1:]

        ret_dict = {}
        ret_dict['point_clouds'] = point_cloud.astype(np.float32)
        ret_dict['center_label'] = target_bboxes.astype(np.float32)[:,0:3]
        ret_dict['heading_class_label'] = angle_classes.astype(np.int64)
        ret_dict['heading_residual_label'] = angle_residuals.astype(np.float32)
        ret_dict['size_class_label'] = size_classes.astype(np.int64)
        ret_dict['size_residual_label'] = size_residuals.astype(np.float32)
        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        target_bboxes_semcls[0:bboxes.shape[0]] = bboxes[:,-1] # from 0 to 9
        ret_dict['sem_cls_label'] = target_bboxes_semcls.astype(np.int64)
        ret_dict['box_label_mask'] = target_bboxes_mask.astype(np.float32)
        ret_dict['vote_label'] = point_votes.astype(np.float32)
        ret_dict['vote_label_mask'] = point_votes_mask.astype(np.int64)
        ret_dict['scan_idx'] = np.array(idx).astype(np.int64)
        ret_dict['max_gt_bboxes'] = max_bboxes
        if self.use_imvote:
            ret_dict['scale'] = np.array(scale_ratio).astype(np.float32)
            ret_dict['calib_Rtilt'] = calib_Rtilt.astype(np.float32)
            ret_dict['calib_K'] = calib_K.astype(np.float32)
            ret_dict['full_img_width'] = np.array(full_img_width).astype(np.int64)
            ret_dict['cls_score_feats'] = cls_score_feats.astype(np.float32)
            ret_dict['full_img_votes_1d'] = full_img_votes_1d.astype(np.float32)
            ret_dict['full_img_1d'] = full_img_1d.astype(np.float32)

        return ret_dict