import math

import numpy as np  # used by the _example_* sketches below
import tensorflow.compat.v1 as tf
import tensorflow_addons as tfa

# NOTE: 'geometry' and 'math_utils' are this project's helper modules
# (equirectangular/cartesian grids, coordinate conversions and
# degrees_to_radians). The flat import paths below are assumptions; adjust
# them to the actual package layout.
import geometry
import math_utils


def rotate_image_on_pano(images, rotations, fov, output_shape):
  """Transform perspective images to equirectangular images after rotations.

  Return equirectangular panoramic images in which the input perspective
  images are embedded after applying the rotation R from the input images'
  frame to the target frame. The image with the field of view "fov", centered
  on the camera's look-at -Z axis, is projected onto the pano. The -Z axis
  corresponds to the spherical coordinates (pi/2, pi/2), which is
  (HEIGHT/2, WIDTH/4) on the pano.

  Args:
    images: [BATCH, HEIGHT, WIDTH, CHANNEL] perspective view images.
    rotations: [BATCH, 3, 3] rotation matrices.
    fov: (float) images' field of view in degrees.
    output_shape: a 2-D list of output dimension [height, width].

  Returns:
    equirectangular images [BATCH, height, width, CHANNELS].
  """
  with tf.name_scope(None, 'rotate_image_on_pano',
                     [images, rotations, fov, output_shape]):
    if len(images.shape) != 4:
      raise ValueError("'images' has the wrong dimensions.")
    if rotations.shape[-2:] != [3, 3]:
      raise ValueError("'rotations' must have 3x3 dimensions.")

    shape = images.shape.as_list()
    batch, height, width = shape[0], shape[1], shape[2]

    # Generate a mesh grid on a sphere.
    spherical = geometry.generate_equirectangular_grid(output_shape)
    cartesian = geometry.spherical_to_cartesian(spherical)
    cartesian = tf.tile(cartesian[tf.newaxis, :, :, :, tf.newaxis],
                        [batch, 1, 1, 1, 1])
    # Permute and flip axes to align the spherical grid with the camera frame.
    axis_convert = tf.constant([[0., -1., 0.], [0., 0., 1.], [1., 0., 0.]])
    cartesian = tf.matmul(axis_convert, cartesian)
    cartesian = tf.squeeze(
        tf.matmul(rotations[:, tf.newaxis, tf.newaxis], cartesian), -1)
    # Only keep the hemisphere in the camera's look-at direction (z < 0).
    hemisphere_mask = tf.cast(cartesian[:, :, :, -1:] < 0, tf.float32)
    # Perspective divide onto the image plane.
    image_coordinates = cartesian[:, :, :, :2] / cartesian[:, :, :, -1:]
    x, y = tf.split(image_coordinates, [1, 1], -1)
    # Map pixels on the equirectangular pano to perspective image coordinates.
    nx = -x * width / (2 * tf.tan(
        math_utils.degrees_to_radians(fov / 2))) + width / 2 - 0.5
    ny = y * height / (2 * tf.tan(
        math_utils.degrees_to_radians(fov / 2))) + height / 2 - 0.5
    transformed = hemisphere_mask * tfa.image.resampler(
        images, tf.concat([nx, ny], -1))
    return transformed
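

# A minimal usage sketch, not part of the original module: embeds one
# perspective view at the pano centre using the identity rotation. Assumes
# eager execution (TF2 running the compat.v1 API imported above).
def _example_rotate_image_on_pano():
  images = tf.constant(np.random.rand(1, 256, 256, 3).astype(np.float32))
  rotations = tf.eye(3, batch_shape=[1])  # identity: stay on the -Z look-at
  # A 90-degree view embedded in a 512x1024 pano; pixels outside the
  # projected hemisphere are masked to zero.
  return rotate_image_on_pano(images, rotations, 90.0, [512, 1024])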


def rotate_image_in_3d(images, input_rotations, input_fov, output_fov,
                       output_shape):
  """Return reprojected perspective view images given a rotated camera.

  This function applies a homography H = K_input * R^T * K_output^-1, where
  K_input and K_output are the input and output camera intrinsics and R is
  the rotation from the input images' frame to the target frame; each output
  pixel is mapped through H into the input image and resampled there.

  Args:
    images: [BATCH, HEIGHT, WIDTH, CHANNEL] perspective view images.
    input_rotations: [BATCH, 3, 3] rotation matrices from the current camera
      frame to the target camera frame.
    input_fov: [BATCH] a 1-D tensor (float32) of input field of view in
      degrees.
    output_fov: (float) output field of view in degrees.
    output_shape: a 2-D list of output dimension [height, width].

  Returns:
    reprojected images [BATCH, height, width, CHANNELS].
  """
  with tf.name_scope(
      None, 'rotate_image_in_3d',
      [images, input_rotations, input_fov, output_fov, output_shape]):
    if len(images.shape) != 4:
      raise ValueError("'images' has the wrong dimensions.")
    if input_rotations.shape[-2:] != [3, 3]:
      raise ValueError("'input_rotations' must have 3x3 dimensions.")

    shape = images.shape.as_list()
    batch, height, width = shape[0], shape[1], shape[2]

    # Unproject the output pixel centres (K_output^-1), rotate them into the
    # input camera frame (R^T), then project with the input intrinsics.
    cartesian = geometry.generate_cartesian_grid(output_shape, output_fov)
    cartesian = tf.tile(cartesian[tf.newaxis, :, :, :, tf.newaxis],
                        [batch, 1, 1, 1, 1])
    input_rotations = tf.tile(input_rotations[:, tf.newaxis, tf.newaxis, :],
                              [1] + output_shape + [1, 1])
    cartesian = tf.squeeze(
        tf.matmul(input_rotations, cartesian, transpose_a=True), -1)
    image_coordinates = -cartesian[:, :, :, :2] / cartesian[:, :, :, -1:]
    x, y = tf.split(image_coordinates, [1, 1], -1)
    # Both axes use the same input FoV when scaling to pixel coordinates.
    w = 2 * tf.tan(math_utils.degrees_to_radians(input_fov / 2))
    h = 2 * tf.tan(math_utils.degrees_to_radians(input_fov / 2))
    w = w[:, tf.newaxis, tf.newaxis, tf.newaxis]
    h = h[:, tf.newaxis, tf.newaxis, tf.newaxis]
    nx = x * width / w + width / 2 - 0.5
    ny = -y * height / h + height / 2 - 0.5
    return tfa.image.resampler(images, tf.concat([nx, ny], -1))
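

# A minimal usage sketch, not part of the original module: re-renders a batch
# as if the camera had been rotated 10 degrees about its y (up) axis. The
# rotation's sign convention here is an illustrative assumption.
def _example_rotate_image_in_3d():
  yaw = np.radians(10.0)
  rotation = np.array([[np.cos(yaw), 0., np.sin(yaw)],
                       [0., 1., 0.],
                       [-np.sin(yaw), 0., np.cos(yaw)]], np.float32)
  images = tf.constant(np.random.rand(2, 256, 256, 3).astype(np.float32))
  return rotate_image_in_3d(
      images,
      tf.constant(np.tile(rotation[np.newaxis], [2, 1, 1])),  # [BATCH, 3, 3]
      input_fov=tf.constant([90.0, 90.0]),  # one FoV per image, shape [BATCH]
      output_fov=90.0,
      output_shape=[256, 256])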


def generate_cartesian_grid(resolution, fov):
  """Get (x, y, z) coordinates of all pixel centres in the image.

  The image plane lies at z=-1 and the image center is (0, 0, -1).

  Args:
    resolution: a 2-D list containing the resolution (height, width) of the
      desired output.
    fov: (float) camera's horizontal field of view in degrees.

  Returns:
    3-D tensor of shape `[HEIGHT, WIDTH, 3]`.

  Raises:
    ValueError: 'resolution' is not valid.
  """
  with tf.name_scope(None, 'generate_cartesian_grid', [resolution, fov]):
    if not isinstance(resolution, list) or len(resolution) != 2:
      raise ValueError("'resolution' is not valid.")
    fov = fov / 180 * math.pi
    width = 2 * tf.tan(fov / 2)
    height = width * resolution[0] / resolution[1]
    pixel_size = width / resolution[1]
    x_range = width - pixel_size
    y_range = height - pixel_size
    # x increases from left to right while y increases from bottom to top.
    # Use the half-integer pixel centre convention and generate the
    # coordinates for the centres of the pixels. For example, a 2x3 grid
    # with pixel_size=1 (height=2, width=3) should have
    # [(-1.0,  0.5), (0.0,  0.5), (1.0,  0.5),
    #  (-1.0, -0.5), (0.0, -0.5), (1.0, -0.5)]
    xx, yy = tf.meshgrid(
        tf.lin_space(-x_range / 2, x_range / 2, resolution[1]),
        tf.lin_space(y_range / 2, -y_range / 2, resolution[0]))
    grid = tf.stack([xx, yy, -tf.ones_like(xx)], axis=-1)
    return grid
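

# A sanity check, not part of the original module: with fov = 2*atan(3/2)
# (about 112.62 degrees) the plane width 2*tan(fov/2) is exactly 3, so a
# [2, 3] resolution yields pixel_size = 1 and must reproduce the grid in the
# comment above. Assumes eager execution for .numpy().
def _example_generate_cartesian_grid():
  fov = math.degrees(2 * math.atan(1.5))
  grid = generate_cartesian_grid([2, 3], fov)
  np.testing.assert_allclose(
      grid.numpy(),
      [[[-1.0, 0.5, -1.0], [0.0, 0.5, -1.0], [1.0, 0.5, -1.0]],
       [[-1.0, -0.5, -1.0], [0.0, -0.5, -1.0], [1.0, -0.5, -1.0]]],
      atol=1e-5)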