# Imports assumed by the examples below (numpy and the vcd-python library).
import csv
import os

import numpy as np

from vcd import core, types, utils


def simple_setup_4_cams_pinhole():
    vcd = core.VCD()
    vcd.add_coordinate_system("vehicle-iso8855", cs_type=types.CoordinateSystemType.local_cs)

    # Let's build the cameras
    img_width_px = 960
    img_height_px = 604

    ################################
    # CAM_FRONT
    ################################
    # Intrinsics
    # ------------------------------
    # This matrix converts from scs to ics
    K_3x4 = np.array([[6.038e+002, 0.0, 4.67e+002, 0.0],
                      [0.0, 6.038e+002, 2.94e+002, 0.0],
                      [0.0, 0.0, 1.0, 0.0]])
    d_1x5 = np.array([[-4.0569640920796241e-001, 1.9116055332155032e-001, 0., 0.,
                       -4.7033609773998064e-002]])

    # Extrinsics
    # ------------------------------
    # Extrinsics (1/2): rotation using yaw (Z), pitch (Y), roll (X). This is the rotation of SCS wrt LCS
    pitch_rad = (10.0 * np.pi) / 180.0
    yaw_rad = (0.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[2.3],   # frontal part of the car
                      [0.0],   # centered on the symmetry axis of the car
                      [1.3]])  # at some height over the ground
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    # As explained in core.py, a pose matrix of scs wrt lcs can also be seen as the inverse
    # transformation from lcs to scs
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    # Extrinsics (2/2): additional rotation, because camera scs are usually
    # (x-to-right, y-down, z-to-front) to better match the ics (x-to-right, y-down),
    # while the lcs is (x-to-front, y-to-left, z-up)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    # Now we can compose the final transform
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_FRONT", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_FRONT",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(K_3x4.flatten()),
                                  distortion_coeffs_1xN=list(d_1x5.flatten())))
    vcd.add_coordinate_system("CAM_FRONT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))

    ################################
    # CAM_REAR
    ################################
    K_3x4 = np.array([[6.038e+002, 0.0, 4.67e+002, 0.0],
                      [0.0, 6.038e+002, 2.94e+002, 0.0],
                      [0.0, 0.0, 1.0, 0.0]])
    d_1x5 = np.array([[-4.0569640920796241e-001, 1.9116055332155032e-001, 0., 0.,
                       -4.7033609773998064e-002]])
    pitch_rad = (2.0 * np.pi) / 180.0
    yaw_rad = (180.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[-0.725],  # rear part of the car
                      [0.0],     # centered on the symmetry axis of the car
                      [0.4]])    # at some height over the ground
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_REAR", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_REAR",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(K_3x4.flatten()),
                                  distortion_coeffs_1xN=list(d_1x5.flatten())))
    vcd.add_coordinate_system("CAM_REAR", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))

    ################################
    # CAM_LEFT
    ################################
    K_3x4 = np.array([[6.038e+002, 0.0, 4.67e+002, 0.0],
                      [0.0, 6.038e+002, 2.94e+002, 0.0],
                      [0.0, 0.0, 1.0, 0.0]])
    d_1x5 = np.array([[-4.0569640920796241e-001, 1.9116055332155032e-001, 0., 0.,
                       -4.7033609773998064e-002]])
    pitch_rad = (2.0 * np.pi) / 180.0
    yaw_rad = (90.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[1.2],
                      [0.6],    # at the left
                      [1.45]])  # at the roof
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_LEFT", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_LEFT",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(K_3x4.flatten()),
                                  distortion_coeffs_1xN=list(d_1x5.flatten())))
    vcd.add_coordinate_system("CAM_LEFT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))

    ################################
    # CAM_RIGHT
    ################################
    K_3x4 = np.array([[6.038e+002, 0.0, 4.67e+002, 0.0],
                      [0.0, 6.038e+002, 2.94e+002, 0.0],
                      [0.0, 0.0, 1.0, 0.0]])
    d_1x5 = np.array([[-4.0569640920796241e-001, 1.9116055332155032e-001, 0., 0.,
                       -4.7033609773998064e-002]])
    pitch_rad = (2.0 * np.pi) / 180.0
    yaw_rad = (-90.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[1.2],
                      [-0.6],   # at the right
                      [1.45]])  # at the roof
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_RIGHT", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_RIGHT",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(K_3x4.flatten()),
                                  distortion_coeffs_1xN=list(d_1x5.flatten())))
    vcd.add_coordinate_system("CAM_RIGHT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))
    return vcd
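# Usage sketch (an illustrative addition, not part of the original listing): build the
# four-camera pinhole rig and serialize it to JSON via VCD's save() method; the output
# file name below is hypothetical.
def demo_simple_setup_4_cams_pinhole():
    vcd_scene = simple_setup_4_cams_pinhole()
    vcd_scene.save("setup_4_cams_pinhole.json", pretty=True)  # hypothetical output path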
def parse_sequence(self, seq_number):
    vcd = core.VCD()

    #########################################
    # OPEN files
    #########################################
    calib_file_name = os.path.join(self.kitti_tracking_calib_path,
                                   str(seq_number).zfill(4) + ".txt")
    oxts_file_name = os.path.join(self.kitti_tracking_oxts_path,
                                  str(seq_number).zfill(4) + ".txt")
    object_file_name = os.path.join(self.kitti_tracking_objects_path,
                                    str(seq_number).zfill(4) + '.txt')
    calib_file = open(calib_file_name, newline='')
    oxts_file = open(oxts_file_name, newline='')
    object_file = open(object_file_name, newline='')
    calib_reader = csv.reader(calib_file, delimiter=' ')
    oxts_reader = csv.reader(oxts_file, delimiter=' ')
    object_reader = csv.reader(object_file, delimiter=' ')

    #########################################
    # READ calibration matrices
    #########################################
    img_width_px = 1236
    img_height_px = 366  # these are rectified dimensions
    calib_matrices = {}
    for row in calib_reader:
        # filter out trailing spaces at the end of the row
        calib_matrices[row[0]] = [float(x) for x in row[1:] if len(x) > 0]
    left_camera_K3x4 = np.reshape(calib_matrices["P2:"], (3, 4))
    right_camera_K3x4 = np.reshape(calib_matrices["P3:"], (3, 4))
    camera_rectification_3x3 = np.reshape(calib_matrices["R_rect"], (3, 3))
    transform_velo_to_camleft_3x4 = np.reshape(calib_matrices["Tr_velo_cam"], (3, 4))  # wrt LEFT CAMERA ONLY

    #########################################
    # LIDAR info
    #########################################
    # http://www.cvlibs.net/datasets/kitti/setup.php
    location_velo_wrt_lcs_3x1 = np.array([[0.76], [0.0], [1.73]])  # according to the documentation
    # Create pose (P = [[R|C], [0, 0, 0, 1]])
    pose_velo_wrt_lcs_4x4 = utils.create_pose(
        np.array([[1.0, 0.0, 0.0],
                  [0.0, 1.0, 0.0],
                  [0.0, 0.0, 1.0]]), location_velo_wrt_lcs_3x1)
    transform_lcs_to_velo_4x4 = utils.inv(pose_velo_wrt_lcs_4x4)
    vcd.add_stream(stream_name="VELO_TOP", uri="", description="Velodyne roof",
                   stream_type=core.StreamType.lidar)
    vcd.add_stream_properties(stream_name="VELO_TOP",
                              extrinsics=types.Extrinsics(
                                  pose_scs_wrt_lcs_4x4=list(pose_velo_wrt_lcs_4x4.flatten())))

    #########################################
    # GPS/IMU info
    #########################################
    # Let's also build the pose of the IMU
    location_imu_wrt_lcs_3x1 = np.array([[-0.05], [0.32], [0.93]])  # according to the documentation
    pose_imu_wrt_lcs_4x4 = utils.create_pose(
        np.array([[1.0, 0.0, 0.0],
                  [0.0, 1.0, 0.0],
                  [0.0, 0.0, 1.0]]), location_imu_wrt_lcs_3x1)
    vcd.add_stream(stream_name="IMU", uri="", description="GPS/IMU",
                   stream_type=core.StreamType.other)
    vcd.add_stream_properties(stream_name="IMU",
                              extrinsics=types.Extrinsics(
                                  pose_scs_wrt_lcs_4x4=list(pose_imu_wrt_lcs_4x4.flatten())))

    #########################################
    # CAMERAS
    #########################################
    # From KITTI readme.txt:
    # To project a point from Velodyne coordinates into the left color image,
    # you can use this formula: x = P2 * R0_rect * Tr_velo_to_cam * y
    # For the right color image: x = P3 * R0_rect * Tr_velo_to_cam * y
    # Note: All matrices are stored row-major, i.e., the first values correspond
    # to the first row. R0_rect contains a 3x3 matrix which you need to extend to
    # a 4x4 matrix by adding a 1 as the bottom-right element and 0's elsewhere.
    # Tr_xxx is a 3x4 matrix (R|t), which you need to extend to a 4x4 matrix
    # in the same way!

    # Virtually, cam_left and cam_right are defined over the same coordinate system, so their
    # scs are the same. But their projection matrices (3x4) include a right-most non-zero column
    # which shifts 3D points when projected into the images; that is why projecting from velodyne
    # to left and right uses the same "extrinsics" and differs only in the "intrinsic" matrices
    # P2 and P3.
    # P2 and P3 might be decomposed so that P2 = K2*T2 and P3 = K3*T3, where T2 and T3 could host
    # the extrinsic information while K2 and K3 host the intrinsic information. This way, the
    # pose of cam_left would be T2*R_rect*Tr_velo. However, such a decomposition seems to be
    # non-trivial.
    # x = P2 * R0_rect * Tr_velo_to_cam * y
    # x = P3 * R0_rect * Tr_velo_to_cam * y
    camera_rectification_4x4 = np.vstack((np.hstack((camera_rectification_3x3, [[0], [0], [0]])),
                                          [0, 0, 0, 1]))
    transform_velo_to_camleft_4x4 = np.vstack((transform_velo_to_camleft_3x4, [0, 0, 0, 1]))
    # such that X_cam = transform_velo_to_cam_4x4 * X_velo
    transform_velo_to_camleft_4x4 = np.dot(camera_rectification_4x4, transform_velo_to_camleft_4x4)
    # The pose of the cameras can't be read from the documentation, as these are virtual cameras
    # created via a rectification process; therefore, we need to build them using the velo_to_cam
    # calibration.
    # Pose_camLeft_wrt_ccs = RT_camLeft_to_ccs
    transform_lcs_to_camleft_4x4 = np.dot(transform_velo_to_camleft_4x4, transform_lcs_to_velo_4x4)
    pose_camleft_wrt_lcs_4x4 = utils.inv(transform_lcs_to_camleft_4x4)
    pose_camright_wrt_lcs_4x4 = pose_camleft_wrt_lcs_4x4

    # Create cams and fill scene
    vcd.add_stream(stream_name="CAM_LEFT", uri="", description="Virtual Left color camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_LEFT",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(left_camera_K3x4.flatten()),
                                  distortion_coeffs_1xN=None),
                              extrinsics=types.Extrinsics(
                                  pose_scs_wrt_lcs_4x4=list(pose_camleft_wrt_lcs_4x4.flatten())))
    vcd.add_stream(stream_name="CAM_RIGHT", uri="", description="Virtual Right color camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_RIGHT",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(right_camera_K3x4.flatten()),
                                  distortion_coeffs_1xN=None),
                              extrinsics=types.Extrinsics(
                                  pose_scs_wrt_lcs_4x4=list(pose_camright_wrt_lcs_4x4.flatten())))

    #########################################
    # ODOMETRY
    #########################################
    oxts = []
    for row in oxts_reader:
        row = row[0:len(row) - 1]
        floats = [float(i) for i in row]
        oxts.append(floats)
        '''lat_deg = row[0]  # latitude (deg)
        lon_deg = row[1]     # longitude (deg)
        alt_deg = row[2]     # altitude
        roll_rad = row[3]    # 0 = level, positive = left side up (-pi..pi)
        pitch_rad = row[4]   # 0 = level, positive = front down (-pi/2..pi/2)
        yaw_rad = row[5]     # 0 = east, positive = counter-clockwise (-pi..pi)
        vn = row[6]          # velocity towards north (m/s)
        ve = row[7]          # velocity towards east (m/s)
        vf = row[8]          # forward velocity, i.e. parallel to earth surface (m/s)
        vl = row[9]          # leftward velocity, i.e. parallel to earth surface (m/s)
        vu = row[10]         # upward velocity, i.e. perpendicular to earth surface (m/s)
        ax = row[11]         # acceleration in x, i.e. in direction of vehicle front (m/s^2)
        ay = row[12]         # acceleration in y, i.e. in direction of vehicle left (m/s^2)
        az = row[13]         # acceleration in z, i.e. in direction of vehicle top (m/s^2)
        af = row[14]         # forward acceleration (m/s^2)
        al = row[15]         # leftward acceleration (m/s^2)
        au = row[16]         # upward acceleration (m/s^2)
        wx = row[17]         # angular rate around x (rad/s)
        wy = row[18]         # angular rate around y (rad/s)
        wz = row[19]         # angular rate around z (rad/s)
        wf = row[20]         # angular rate around forward axis (rad/s)
        wl = row[21]         # angular rate around leftward axis (rad/s)
        wu = row[22]         # angular rate around upward axis (rad/s)
        posacc = row[23]     # position accuracy (north/east in m)
        velacc = row[24]     # velocity accuracy (north/east in m/s)
        navstat = row[25]    # navigation status
        numsats = row[26]    # number of satellites tracked by primary GPS receiver
        posmode = row[27]    # position mode of primary GPS receiver
        velmode = row[28]    # velocity mode of primary GPS receiver
        orimode = row[29]    # orientation mode of primary GPS receiver
        '''

    # Convert odometry (GPS) to poses
    odometry_4x4xN = utils.convert_oxts_to_pose(oxts)
    # An odometry entry is a 4x4 pose matrix of the lcs wrt wcs
    # poses_4x4xN_lcs_wrt_wcs = odometry_4x4xN
    frames_1xN = np.arange(0, odometry_4x4xN.shape[2], 1).reshape((1, odometry_4x4xN.shape[2]))
    r, c = frames_1xN.shape
    for i in range(0, c):
        vcd.add_odometry(int(frames_1xN[0, i]),
                         types.Odometry(
                             pose_lcs_wrt_wcs_4x4=list(odometry_4x4xN[:, :, i].flatten())))

    #########################################
    # LABELS
    #########################################
    for row in object_reader:
        frameNum = int(row[0])
        trackID = int(row[1]) + 1  # VCD can't handle negative ids
        semantic_class = row[2]
        truncated = utils.float_2dec(float(row[3]))
        occluded = int(row[4])
        alpha = utils.float_2dec(float(row[5]))
        left = utils.float_2dec(float(row[6]))
        top = utils.float_2dec(float(row[7]))
        width = utils.float_2dec(float(row[8]) - left)
        height = utils.float_2dec(float(row[9]) - top)
        bounding_box = types.bbox(name="", val=(left, top, width, height), stream='CAM_LEFT')
        dimHeight = utils.float_2dec(float(row[10]))
        dimWidth = utils.float_2dec(float(row[11]))
        dimLength = utils.float_2dec(float(row[12]))
        locX = utils.float_2dec(float(row[13]))
        locY = utils.float_2dec(float(row[14]))
        locZ = utils.float_2dec(float(row[15]))
        rotY = utils.float_2dec(float(row[16]))

        # Note KITTI uses (h, w, l, x, y, z, ry) for cuboids, in camera coordinates
        # (X-to-right, Y-to-bottom, Z-to-front), while VCD defines (x, y, z, rx, ry, rz,
        # sx, sy, sz) as a right-handed system.
        # To express the cuboid in LCS (Local Coordinate System), we can add the pose of
        # the camera. Cameras are 1.65 m high wrt the ground, and 1.03 m wrt the rear axle.
        cam_wrt_rear_axle_z = 1.03
        cam_height = 1.65
        cuboid = types.cuboid(name="",
                              val=(utils.float_2dec(locZ + cam_wrt_rear_axle_z),
                                   utils.float_2dec(-locX),
                                   utils.float_2dec(-locY + cam_height),
                                   0, 0, utils.float_2dec(rotY),
                                   utils.float_2dec(dimWidth),
                                   utils.float_2dec(dimLength),
                                   utils.float_2dec(dimHeight)))
        # Note that if no "stream" parameter is given to this cuboid, LCS is assumed

        if not vcd.has(core.ElementType.object, trackID):
            vcd.add_object(name="", semantic_type=semantic_class, uid=trackID)
        vcd.add_object_data(trackID, bounding_box, frameNum)
        if semantic_class != "DontCare":
            vcd.add_object_data(trackID, cuboid, frameNum)
        vcd.add_object_data(trackID, types.num(name="truncated", val=truncated), frameNum)
        vcd.add_object_data(trackID, types.num(name="occluded", val=occluded), frameNum)
        vcd.add_object_data(trackID, types.num(name="alpha", val=alpha), frameNum)

    # Return
    return vcd
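# Worked example (an illustrative addition): the projection chain quoted above from the
# KITTI readme, x = P2 * R0_rect * Tr_velo_to_cam * y, spelled out with numpy. It expects
# the already-extended 4x4 rectification and velo-to-cam matrices built in parse_sequence();
# the helper name is hypothetical.
def project_velo_point_to_image(P_3x4, rect_4x4, velo_to_cam_4x4, xyz_velo):
    # Homogeneous 3D point in velodyne coordinates
    y = np.array([[xyz_velo[0]], [xyz_velo[1]], [xyz_velo[2]], [1.0]])
    x = P_3x4 @ rect_4x4 @ velo_to_cam_4x4 @ y  # 3x1 homogeneous pixel
    return x[0, 0] / x[2, 0], x[1, 0] / x[2, 0]  # de-homogenize to (u, v)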
def simple_setup_4_cams_fisheye():
    vcd = core.VCD()
    vcd.add_coordinate_system("vehicle-iso8855", cs_type=types.CoordinateSystemType.local_cs)

    # Let's build the cameras
    img_width_px = 1280
    img_height_px = 966
    cX = -0.302159995
    cY = -3.44617009

    ################################
    # CAM_FRONT
    ################################
    # Intrinsics
    # ------------------------------
    # Fisheye lens distortion coefficients (together with the projection center cX, cY,
    # these convert from scs to ics)
    d_1x4 = np.array([333.437012, 0.307729989, 2.4235599, 11.0495005])

    # Extrinsics
    # ------------------------------
    # Extrinsics (1/2): rotation using yaw (Z), pitch (Y), roll (X). This is the rotation of SCS wrt LCS
    pitch_rad = (10.0 * np.pi) / 180.0
    yaw_rad = (0.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[2.3],   # frontal part of the car
                      [0.0],   # centered on the symmetry axis of the car
                      [1.3]])  # at some height over the ground
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    # As explained in core.py, a pose matrix of scs wrt lcs can also be seen as the inverse
    # transformation from lcs to scs
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    # Extrinsics (2/2): additional rotation, because camera scs are usually
    # (x-to-right, y-down, z-to-front) to better match the ics (x-to-right, y-down),
    # while the lcs is (x-to-front, y-to-left, z-up)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    # Now we can compose the final transform
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_FRONT", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_FRONT",
                              intrinsics=types.IntrinsicsFisheye(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  lens_coeffs_1x4=list(d_1x4.flatten()),
                                  center_x=cX,
                                  center_y=cY,
                                  fov_deg=0.0,
                                  radius_x=0.0,
                                  radius_y=0.0))
    vcd.add_coordinate_system("CAM_FRONT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))

    ################################
    # CAM_REAR
    ################################
    # Fisheye lens distortion coefficients
    d_1x4 = np.array([333.437012, 0.307729989, 2.4235599, 11.0495005])
    pitch_rad = (2.0 * np.pi) / 180.0
    yaw_rad = (180.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[-0.725],  # rear part of the car
                      [0.0],     # centered on the symmetry axis of the car
                      [0.4]])    # at some height over the ground
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_REAR", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_REAR",
                              intrinsics=types.IntrinsicsFisheye(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  lens_coeffs_1x4=list(d_1x4.flatten()),
                                  center_x=cX,
                                  center_y=cY,
                                  fov_deg=0.0,
                                  radius_x=0.0,
                                  radius_y=0.0))
    vcd.add_coordinate_system("CAM_REAR", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))

    ################################
    # CAM_LEFT
    ################################
    # Fisheye lens distortion coefficients
    d_1x4 = np.array([333.437012, 0.307729989, 2.4235599, 11.0495005])
    pitch_rad = (2.0 * np.pi) / 180.0
    yaw_rad = (90.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[1.2],
                      [0.6],    # at the left
                      [1.45]])  # at the roof
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_LEFT", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_LEFT",
                              intrinsics=types.IntrinsicsFisheye(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  lens_coeffs_1x4=list(d_1x4.flatten()),
                                  center_x=cX,
                                  center_y=cY,
                                  fov_deg=0.0,
                                  radius_x=0.0,
                                  radius_y=0.0))
    vcd.add_coordinate_system("CAM_LEFT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))

    ################################
    # CAM_RIGHT
    ################################
    # Fisheye lens distortion coefficients
    d_1x4 = np.array([333.437012, 0.307729989, 2.4235599, 11.0495005])
    pitch_rad = (2.0 * np.pi) / 180.0
    yaw_rad = (-90.0 * np.pi) / 180.0
    roll_rad = (0.0 * np.pi) / 180.0
    R_scs_wrt_lcs = utils.euler2R([yaw_rad, pitch_rad, roll_rad])  # default is ZYX
    C_lcs = np.array([[1.2],
                      [-0.6],   # at the right
                      [1.45]])  # at the roof
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    T_lcs_to_scs = utils.inv(P_scs_wrt_lcs)
    T_scs_to_scsics = np.array([[0.0, -1.0, 0.0, 0.0],
                                [0.0, 0.0, -1.0, 0.0],
                                [1.0, 0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0, 1.0]])
    T_lcs_to_scs = T_scs_to_scsics.dot(T_lcs_to_scs)
    P_scs_wrt_lcs = utils.inv(T_lcs_to_scs)
    vcd.add_stream(stream_name="CAM_RIGHT", uri="", description="Virtual camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_RIGHT",
                              intrinsics=types.IntrinsicsFisheye(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  lens_coeffs_1x4=list(d_1x4.flatten()),
                                  center_x=cX,
                                  center_y=cY,
                                  fov_deg=0.0,
                                  radius_x=0.0,
                                  radius_y=0.0))
    vcd.add_coordinate_system("CAM_RIGHT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(P_scs_wrt_lcs.flatten()))
    return vcd
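# Sanity-check sketch (an illustrative addition) for the extrinsics composition repeated
# in both setup functions above: the stored pose P_scs_wrt_lcs is, by construction, the
# inverse of the lcs->scs transform, so a point mapped lcs -> scs -> lcs must come back
# unchanged. Assumes utils.euler2R / utils.create_pose / utils.inv behave as used above.
def check_pose_roundtrip():
    R_scs_wrt_lcs = utils.euler2R([0.0, (10.0 * np.pi) / 180.0, 0.0])  # yaw, pitch, roll (ZYX)
    C_lcs = np.array([[2.3], [0.0], [1.3]])
    P_scs_wrt_lcs = utils.create_pose(R_scs_wrt_lcs, C_lcs)
    p_lcs = np.array([[10.0], [1.0], [0.5], [1.0]])  # homogeneous point in vehicle coordinates
    p_scs = utils.inv(P_scs_wrt_lcs) @ p_lcs  # lcs -> scs
    assert np.allclose(P_scs_wrt_lcs @ p_scs, p_lcs)  # scs -> lcs round trip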
def parse_sequence_direct(self, seq_number):
    # This is a variant approach for creating a VCD 4.3.0 file reading the KITTI calibration
    # files, trying to avoid additional computation at this level, and exploiting the ability
    # of VCD 4.3.0 to express arbitrary transforms across coordinate systems
    vcd = core.VCD()

    #########################################
    # OPEN files
    #########################################
    calib_file_name = os.path.join(self.kitti_tracking_calib_path,
                                   str(seq_number).zfill(4) + ".txt")
    oxts_file_name = os.path.join(self.kitti_tracking_oxts_path,
                                  str(seq_number).zfill(4) + ".txt")
    object_file_name = os.path.join(self.kitti_tracking_objects_path,
                                    str(seq_number).zfill(4) + '.txt')
    calib_file = open(calib_file_name, newline='')
    oxts_file = open(oxts_file_name, newline='')
    object_file = open(object_file_name, newline='')
    calib_reader = csv.reader(calib_file, delimiter=' ')
    oxts_reader = csv.reader(oxts_file, delimiter=' ')
    object_reader = csv.reader(object_file, delimiter=' ')

    #########################################
    # CREATE base coordinate system
    #########################################
    # The main coordinate system for the scene: "odom" represents a static cs
    # (which coincides with the first local cs)
    vcd.add_coordinate_system("odom", cs_type=types.CoordinateSystemType.scene_cs)

    #########################################
    # CREATE vehicle coordinate system
    #########################################
    # Local coordinate system, moving with the vehicle, following ISO 8855 (x-front, y-left, z-up)
    vcd.add_coordinate_system("vehicle-iso8855", cs_type=types.CoordinateSystemType.local_cs,
                              parent_name="odom")

    # Sensor coordinate systems are added below.
    # Add transforms for each time instant
    odometry_4x4xN = self.read_odometry_from_oxts(oxts_reader)
    # An odometry entry is a 4x4 pose matrix of the lcs wrt wcs (ergo a transform lcs_to_wcs)
    # poses_4x4xN_lcs_wrt_wcs = odometry_4x4xN
    frames_1xN = np.arange(0, odometry_4x4xN.shape[2], 1).reshape((1, odometry_4x4xN.shape[2]))
    r, c = frames_1xN.shape
    for i in range(0, c):
        vcd.add_transform(int(frames_1xN[0, i]),
                          transform=types.Transform(
                              src_name="vehicle-iso8855",
                              dst_name="odom",
                              transform_src_to_dst_4x4=list(odometry_4x4xN[:, :, i].flatten())))

    #########################################
    # CREATE SENSORS coordinate system: LASER
    #########################################
    # http://www.cvlibs.net/datasets/kitti/setup.php
    location_velo_wrt_vehicle_3x1 = np.array([[0.76], [0.0], [1.73]])  # according to the documentation
    pose_velo_wrt_vehicle_4x4 = utils.create_pose(utils.identity(3), location_velo_wrt_vehicle_3x1)
    vcd.add_stream(stream_name="VELO_TOP", uri="", description="Velodyne roof",
                   stream_type=core.StreamType.lidar)
    vcd.add_coordinate_system("VELO_TOP", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(pose_velo_wrt_vehicle_4x4.flatten()))

    #########################################
    # CREATE SENSORS coordinate system: GPS/IMU
    #########################################
    # Let's also build the pose of the IMU
    location_imu_wrt_vehicle_3x1 = np.array([[-0.05], [0.32], [0.93]])  # according to the documentation
    pose_imu_wrt_vehicle_4x4 = utils.create_pose(utils.identity(3), location_imu_wrt_vehicle_3x1)
    vcd.add_stream(stream_name="IMU", uri="", description="GPS/IMU",
                   stream_type=core.StreamType.other)
    vcd.add_coordinate_system("IMU", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="vehicle-iso8855",
                              pose_wrt_parent=list(pose_imu_wrt_vehicle_4x4.flatten()))

    #########################################
    # CREATE SENSORS coordinate system: CAM
    #########################################
    img_width_px = 1242
    img_height_px = 375  # these are rectified dimensions
    calib_matrices = {}
    for row in calib_reader:
        # filter out trailing spaces at the end of the row
        calib_matrices[row[0]] = [float(x) for x in row[1:] if len(x) > 0]
    # From KITTI readme.txt:
    # To project a point from Velodyne coordinates into the left color image,
    # you can use this formula: x = P2 * R0_rect * Tr_velo_to_cam * y
    # For the right color image: x = P3 * R0_rect * Tr_velo_to_cam * y
    left_camera_K3x4 = np.reshape(calib_matrices["P2:"], (3, 4))
    right_camera_K3x4 = np.reshape(calib_matrices["P3:"], (3, 4))
    camera_rectification_3x3 = np.reshape(calib_matrices["R_rect"], (3, 3))
    transform_velo_to_camleft_3x4 = np.reshape(calib_matrices["Tr_velo_cam"], (3, 4))  # wrt LEFT CAMERA ONLY
    camera_rectification_4x4 = np.vstack((np.hstack((camera_rectification_3x3, [[0], [0], [0]])),
                                          [0, 0, 0, 1]))
    transform_velo_to_camleft_4x4 = np.vstack((transform_velo_to_camleft_3x4, [0, 0, 0, 1]))
    # such that X_cam = transform_velo_to_cam_4x4 * X_velo
    transform_velo_to_camleft_4x4 = np.dot(camera_rectification_4x4, transform_velo_to_camleft_4x4)
    pose_camleft_wrt_velo_4x4 = utils.inv(transform_velo_to_camleft_4x4)
    vcd.add_stream(stream_name="CAM_LEFT", uri="", description="Virtual Left color camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_LEFT",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(left_camera_K3x4.flatten()),
                                  distortion_coeffs_1xN=None))
    vcd.add_coordinate_system("CAM_LEFT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="VELO_TOP",
                              pose_wrt_parent=list(pose_camleft_wrt_velo_4x4.flatten()))

    # Virtually, cam_left and cam_right are defined over the same coordinate system, so their
    # scs are the same. But their projection matrices (3x4) include a right-most non-zero column
    # which shifts 3D points when projected into the images; that is why projecting from velodyne
    # to left and right uses the same "extrinsics" and differs only in the "intrinsic" matrices
    # P2 and P3.
    # P2 and P3 might be decomposed so that P2 = K2*T2 and P3 = K3*T3, where T2 and T3 could host
    # the extrinsic information while K2 and K3 host the intrinsic information. This way, the
    # pose of cam_left would be T2*R_rect*Tr_velo. However, such a decomposition seems to be
    # non-trivial.
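    # Aside (an illustrative note, not from the original code): since CAM_LEFT and CAM_RIGHT
    # share extrinsics, projecting the same rectified camera-frame point X with P2 and P3
    # yields pixels that differ only by the horizontal disparity fx*baseline/Z induced by
    # the right-most column of P3:
    #   u_left  = dehomogenize(left_camera_K3x4 @ X)
    #   u_right = dehomogenize(right_camera_K3x4 @ X)   # shifted left by fx*baseline/Z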
    # x = P2 * R0_rect * Tr_velo_to_cam * y
    # x = P3 * R0_rect * Tr_velo_to_cam * y
    vcd.add_stream(stream_name="CAM_RIGHT", uri="", description="Virtual Right color camera",
                   stream_type=core.StreamType.camera)
    vcd.add_stream_properties(stream_name="CAM_RIGHT",
                              intrinsics=types.IntrinsicsPinhole(
                                  width_px=img_width_px,
                                  height_px=img_height_px,
                                  camera_matrix_3x4=list(right_camera_K3x4.flatten()),
                                  distortion_coeffs_1xN=None))
    vcd.add_coordinate_system("CAM_RIGHT", cs_type=types.CoordinateSystemType.sensor_cs,
                              parent_name="VELO_TOP",
                              pose_wrt_parent=list(pose_camleft_wrt_velo_4x4.flatten()))

    #########################################
    # LABELS
    #########################################
    for row in object_reader:
        frameNum = int(row[0])
        # trackID = int(row[1]) + 1  # VCD can't handle negative ids
        trackID = int(row[1])
        # if trackID == 0:
        #     continue  # Let's ignore DontCare labels
        semantic_class = row[2]
        truncated = utils.float_2dec(float(row[3]))
        occluded = int(row[4])
        alpha = utils.float_2dec(float(row[5]))  # this is the observation angle (see cs_overview.pdf)
        left = utils.float_2dec(float(row[6]))
        top = utils.float_2dec(float(row[7]))
        width = utils.float_2dec(float(row[8]) - left)
        height = utils.float_2dec(float(row[9]) - top)

        if trackID == -1:  # This is DontCare; there are multiple boxes
            count = vcd.get_element_data_count_per_type(core.ElementType.object, str(trackID),
                                                        types.ObjectDataType.bbox, frameNum)
            name_box = "box2D" + str(count)
        else:
            name_box = "box2D"
        bounding_box = types.bbox(name=name_box,
                                  val=(left + width / 2, top + height / 2, width, height),
                                  coordinate_system='CAM_LEFT')  # see cs_overview.pdf

        dimH = utils.float_2dec(float(row[10]))
        dimW = utils.float_2dec(float(row[11]))
        dimL = utils.float_2dec(float(row[12]))
        locX = utils.float_2dec(float(row[13]))
        locY = utils.float_2dec(float(row[14]))
        locZ = utils.float_2dec(float(row[15]))
        rotY = utils.float_2dec(float(row[16]))

        # Note KITTI uses (h, w, l, x, y, z, ry) for cuboids, in camera coordinates
        # (X-to-right, Y-to-bottom, Z-to-front), while VCD defines (x, y, z, rx, ry, rz,
        # sx, sy, sz) as a right-handed, centroid-based system.
        # NOTE: changing locY to locY - dimH/2, as VCD uses the centroid while KITTI uses
        # the bottom face.
        # NOTE: all values are in the camera coordinate system.
        # NOTE: x = length, y = height, z = width because of the convention in readme.txt:
        # The reference point for the 3D bounding box for each object is centered on the
        # bottom face of the box. The corners of the bounding box are computed as follows
        # with respect to the reference point and in the object coordinate system:
        # x_corners = [l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2]^T
        # y_corners = [0, 0, 0, 0, -h, -h, -h, -h]^T
        # z_corners = [w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2]^T
        # with l=length, h=height, and w=width.
        cuboid = types.cuboid(name="box3D",
                              val=(utils.float_2dec(locX),
                                   utils.float_2dec(locY - dimH / 2),
                                   utils.float_2dec(locZ),
                                   0, utils.float_2dec(rotY), 0,
                                   utils.float_2dec(dimL),
                                   utils.float_2dec(dimH),
                                   utils.float_2dec(dimW)),
                              coordinate_system="CAM_LEFT")

        if not vcd.has(core.ElementType.object, str(trackID)):  # first time
            if trackID >= 0:
                vcd.add_object(name=semantic_class + str(trackID), semantic_type=semantic_class,
                               uid=str(trackID), frame_value=frameNum)
            else:  # so this is a DontCare object
                vcd.add_object(name=semantic_class, semantic_type=semantic_class,
                               uid=str(trackID), frame_value=frameNum)
        vcd.add_object_data(str(trackID), bounding_box, frameNum)
        vcd.add_object_data(str(trackID), cuboid, frameNum)
        vcd.add_object_data(str(trackID), types.num(name="truncated", val=truncated), frameNum)
        vcd.add_object_data(str(trackID), types.num(name="occluded", val=occluded), frameNum)
        vcd.add_object_data(str(trackID), types.num(name="alpha", val=alpha), frameNum)

    #########################################
    # Ego-vehicle
    #########################################
    vcd.add_object(name="Egocar", semantic_type="Egocar", uid=str(-2))
    cuboid_ego = types.cuboid(name="box3D",
                              val=(1.35, 0.0, 0.736, 0.0, 0.0, 0.0, 4.765, 1.82, 1.47),
                              coordinate_system="vehicle-iso8855")
    vcd.add_object_data(str(-2), cuboid_ego)

    # Return
    return vcd
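# Usage sketch (hypothetical wiring, an illustrative addition): `converter` stands for an
# instance of the surrounding class, which holds the kitti_tracking_*_path attributes used
# above; the output file name is an example.
def demo_parse_sequence_direct(converter, seq_number=0):
    vcd_seq = converter.parse_sequence_direct(seq_number)
    vcd_seq.save("kitti_tracking_" + str(seq_number).zfill(4) + ".json", pretty=True)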