import math

import tensorflow.compat.v1 as tf  # Assumed import: these snippets use the TF1-style API.


def get_position_signal(sequence_length, position_dim=8):
  """Returns a fixed position signal as sine waves.

  The sine-wave half-periods are spaced geometrically (linearly in log space)
  between 2 and the maximum sequence length, so the longest wave is monotonic
  over the whole sequence. The waves are also phase-shifted so that they do
  not all start at the same value.

  We don't use learned positional embeddings because these embeddings are
  projected linearly along with the original embeddings, and the projection
  is learned.

  Args:
    sequence_length: int, T, length of the sequence.
    position_dim: int, P, number of sine waves.

  Returns:
    A [T, P] tensor, position embeddings.
  """
  # Compute the frequencies from log-spaced half-periods.
  periods = tf.exp(
      tf.lin_space(tf.log(2.0), tf.log(tf.to_float(sequence_length)),
                   position_dim))
  frequencies = 1.0 / periods  # Shape [P].

  # Compute the sine waves.
  xs = frequencies[None, :] * tf.to_float(tf.range(sequence_length)[:, None])
  shifts = tf.lin_space(0.0, 2.0, position_dim)[None, :]  # [1, P]
  positions = tf.math.cos(math.pi * (xs + shifts))  # [T, P]
  positions.shape.assert_is_compatible_with([sequence_length, position_dim])
  return positions
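
# Usage sketch (an illustrative demo, not part of the original code; assumes
# TF1 graph mode with a session):
def _demo_get_position_signal():
  signal = get_position_signal(sequence_length=16, position_dim=8)
  with tf.Session() as sess:
    print(sess.run(signal).shape)  # (16, 8)
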
def add_coord_channels(image_tensor):
  """Adds channels containing pixel indices (x and y coordinates) to an image.

  Note: This has nothing to do with keypoint coordinates. It simply appends
  coordinate channels so that convolutional networks can learn
  non-translation-equivariant outputs. This is similar to the "CoordConv"
  layers: https://arxiv.org/abs/1807.03247.

  Args:
    image_tensor: [batch_size, H, W, C] tensor.

  Returns:
    [batch_size, H, W, C + 2] tensor with x and y coordinate channels.
  """
  batch_size = tf.shape(image_tensor)[0]
  x_size = tf.shape(image_tensor)[2]
  y_size = tf.shape(image_tensor)[1]
  x_grid = tf.lin_space(-1.0, 1.0, x_size)
  x_map = tf.tile(x_grid[tf.newaxis, tf.newaxis, :, tf.newaxis],
                  (batch_size, y_size, 1, 1))
  y_grid = tf.lin_space(1.0, -1.0, y_size)
  y_map = tf.tile(y_grid[tf.newaxis, :, tf.newaxis, tf.newaxis],
                  (batch_size, 1, x_size, 1))
  return tf.concat([image_tensor, x_map, y_map], axis=-1)
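
# Usage sketch (illustrative demo; not part of the original code):
def _demo_add_coord_channels():
  images = tf.zeros([2, 4, 6, 3])
  augmented = add_coord_channels(images)
  with tf.Session() as sess:
    print(sess.run(augmented).shape)  # (2, 4, 6, 5)
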
def contrast_normalize(self, I):
    """Subtracts a Gaussian-blurred local mean from the image and crops the border."""
    # Build a Gaussian kernel as the outer product of two 1-D Gaussian
    # profiles sampled on a [-W, W] grid.
    dist = tf.distributions.Normal(loc=0., scale=self.sigma)
    W = (self.kernel_size - 1) / 2.0
    box_x = tf.lin_space(-W, W, self.kernel_size)
    box_y = tf.lin_space(-W, W, self.kernel_size)
    prob_x = dist.prob(box_x)
    prob_y = dist.prob(box_y)
    gaussian_box = tf.matmul(tf.reshape(prob_x, [self.kernel_size, 1]),
                             tf.reshape(prob_y, [1, self.kernel_size]))
    gaussian_box = tf.reshape(gaussian_box,
                              [self.kernel_size, self.kernel_size, 1, 1])
    # Normalize the kernel so it sums to one.
    gaussian_box = tf.divide(gaussian_box, tf.reduce_sum(gaussian_box))
    # Subtract the Gaussian-blurred local mean from the image.
    avg_I = tf.nn.conv2d(I, gaussian_box, strides=[1, 1, 1, 1], padding='SAME')
    normalized_I = I - avg_I
    # Crop self.border pixels from each edge of the normalized image.
    img_H = tf.shape(I)[1]
    img_W = tf.shape(I)[2]
    normalized_I = tf.slice(
        normalized_I, [0, self.border, self.border, 0],
        [-1, img_H - 2 * self.border, img_W - 2 * self.border, -1])
    return [normalized_I]
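
# Usage sketch (illustrative; `_Params` is a hypothetical stand-in for the
# class this method belongs to, and the attribute values are assumptions):
class _Params(object):
    sigma = 1.0
    kernel_size = 5
    border = 2


def _demo_contrast_normalize():
    images = tf.ones([1, 32, 32, 1])
    normalized = contrast_normalize(_Params(), images)
    with tf.Session() as sess:
        print(sess.run(normalized)[0].shape)  # (1, 28, 28, 1)
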
def get_random_scale(min_scale_factor, max_scale_factor, step_size):
  """Gets a random scale value.

  Args:
    min_scale_factor: Minimum scale value.
    max_scale_factor: Maximum scale value.
    step_size: The step size from minimum to maximum value.

  Returns:
    A random scale value selected between minimum and maximum value.

  Raises:
    ValueError: min_scale_factor has unexpected value.
  """
  if min_scale_factor < 0 or min_scale_factor > max_scale_factor:
    raise ValueError('Unexpected value of min_scale_factor.')

  if min_scale_factor == max_scale_factor:
    return tf.to_float(min_scale_factor)

  # When step_size = 0, we sample the value uniformly from [min, max).
  if step_size == 0:
    return tf.random_uniform([1],
                             minval=min_scale_factor,
                             maxval=max_scale_factor)

  # When step_size != 0, we randomly select one discrete value from [min, max].
  num_steps = int((max_scale_factor - min_scale_factor) / step_size + 1)
  scale_factors = tf.lin_space(min_scale_factor, max_scale_factor, num_steps)
  shuffled_scale_factors = tf.random_shuffle(scale_factors)
  return shuffled_scale_factors[0]
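
# Usage sketch (illustrative demo; not part of the original code):
def _demo_get_random_scale():
  scale = get_random_scale(0.5, 2.0, 0.25)  # Picks from {0.5, 0.75, ..., 2.0}.
  with tf.Session() as sess:
    print(sess.run(scale))
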
def camera_to_world_projection(depth, intrinsics, camera_to_world):
  """Projects camera coordinates to world coordinates.

  Args:
    depth: [HEIGHT, WIDTH, 1] radial distance from the camera eye to the
      point corresponding to each pixel.
    intrinsics: [3, 3] camera intrinsic matrix.
    camera_to_world: [3, 4] camera-to-world pose matrix (rotation and
      translation).

  Returns:
    [HEIGHT, WIDTH, 3] points in the world coordinate frame.
  """
  shape = depth.shape.as_list()
  height, width = shape[0], shape[1]
  xx, yy = tf.meshgrid(tf.lin_space(0., width - 1., width),
                       tf.lin_space(0., height - 1., height))
  p_pixel = tf.stack([xx, yy], axis=-1)
  p_pixel_homogeneous = tf.concat([p_pixel, tf.ones([height, width, 1])], -1)
  camera_to_world = tf.tile(camera_to_world[tf.newaxis, tf.newaxis, :],
                            [height, width, 1, 1])
  intrinsics = tf.tile(intrinsics[tf.newaxis, tf.newaxis, :],
                       [height, width, 1, 1])
  # Convert pixel coordinates (u, v, 1) to camera coordinates (x_c, y_c, f)
  # on the image plane.
  p_image = tf.squeeze(
      tf.matmul(tf.matrix_inverse(intrinsics),
                tf.expand_dims(p_pixel_homogeneous, -1)), -1)
  lookat_axis = tf.tile(tf.constant([0., 0., 1.], shape=[1, 1, 3]),
                        [height, width, 1])
  # Recover z-depth by projecting the normalized ray direction onto the
  # look-at axis, since `depth` stores radial distances.
  z = depth * tf.reduce_sum(
      tf.math.l2_normalize(p_image, axis=-1) * lookat_axis,
      axis=-1,
      keepdims=True)
  p_camera = z * p_image
  # Convert from the OpenCV convention to OpenGL.
  p_camera = p_camera * tf.constant([1., 1., -1.], shape=[1, 1, 3])
  p_camera_homogeneous = tf.concat(
      [p_camera, tf.ones(shape=[height, width, 1])], -1)
  # Convert camera coordinates to world coordinates.
  p_world = tf.squeeze(
      tf.matmul(camera_to_world, tf.expand_dims(p_camera_homogeneous, -1)), -1)
  return p_world
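
# Usage sketch (illustrative demo with identity intrinsics and pose):
def _demo_camera_to_world_projection():
  depth = tf.ones([4, 6, 1])
  intrinsics = tf.eye(3)
  camera_to_world = tf.concat([tf.eye(3), tf.zeros([3, 1])], axis=-1)
  p_world = camera_to_world_projection(depth, intrinsics, camera_to_world)
  with tf.Session() as sess:
    print(sess.run(p_world).shape)  # (4, 6, 3)
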
def image_to_world_projection(depth, intrinsics, pose_c2w):
  """Projects points on the image to the world frame.

  Args:
    depth: [HEIGHT, WIDTH, 1] the depth map contains the radial distance from
      the camera eye to each point corresponding to each pixel.
    intrinsics: [3, 3] camera's intrinsic matrix.
    pose_c2w: [3, 4] camera pose matrix (camera to world).

  Returns:
    [HEIGHT, WIDTH, 3] points in the world's coordinate frame.
  """
  shape = depth.shape.as_list()
  height, width = shape[0], shape[1]
  xx, yy = tf.meshgrid(tf.lin_space(0., width - 1., width),
                       tf.lin_space(0., height - 1., height))
  p_pixel_homogeneous = tf.concat(
      [tf.stack([xx, yy], axis=-1), tf.ones([height, width, 1])], -1)
  p_image = tf.squeeze(
      tf.matmul(tf.matrix_inverse(intrinsics[tf.newaxis, tf.newaxis, :]),
                tf.expand_dims(p_pixel_homogeneous, -1)), -1)
  z = depth * tf.reduce_sum(
      tf.math.l2_normalize(p_image, axis=-1) * tf.constant([[[0., 0., 1.]]]),
      axis=-1,
      keepdims=True)
  p_camera = z * p_image
  # Convert to the OpenGL coordinate system.
  p_camera = p_camera * tf.constant([1., 1., -1.], shape=[1, 1, 3])
  p_camera_homogeneous = tf.concat(
      [p_camera, tf.ones(shape=[height, width, 1])], -1)
  # Convert camera coordinates to world coordinates.
  p_world = tf.squeeze(
      tf.matmul(pose_c2w[tf.newaxis, tf.newaxis, :],
                tf.expand_dims(p_camera_homogeneous, -1)), -1)
  return p_world
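
# Usage sketch (illustrative demo with identity intrinsics and pose):
def _demo_image_to_world_projection():
  depth = tf.ones([4, 6, 1])
  intrinsics = tf.eye(3)
  pose_c2w = tf.concat([tf.eye(3), tf.zeros([3, 1])], axis=-1)
  p_world = image_to_world_projection(depth, intrinsics, pose_c2w)
  with tf.Session() as sess:
    print(sess.run(p_world).shape)  # (4, 6, 3)
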
def generate_equirectangular_grid(shape):
  """Gets spherical coordinates of an equirectangular grid.

  Args:
    shape: a list representing the (height, width) of the output.

  Returns:
    3-D tensor of shape `[HEIGHT, WIDTH, 2]`.

  Raises:
    ValueError: 'shape' is not valid.
  """
  with tf.name_scope(None, 'generate_equirectangular_grid', [shape]):
    if not isinstance(shape, list) or len(shape) != 2:
      raise ValueError("'shape' is not valid.")

    height, width = shape[0], shape[1]
    pixel_w = 2 * math.pi / float(width)
    pixel_h = math.pi / float(height)
    azimuth, colatitude = tf.meshgrid(
        tf.lin_space(pixel_w / 2, 2 * math.pi - pixel_w / 2, width),
        tf.lin_space(pixel_h / 2, math.pi - pixel_h / 2, height))
    return tf.stack([colatitude, azimuth], axis=-1)
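
# Usage sketch (illustrative demo; not part of the original code):
def _demo_generate_equirectangular_grid():
  grid = generate_equirectangular_grid([4, 8])  # (colatitude, azimuth) pairs.
  with tf.Session() as sess:
    print(sess.run(grid).shape)  # (4, 8, 2)
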
def when_nonsingular():
  # Assign each datum to one of `bucket_count` equal-width buckets spanning
  # [min_, max_], clamping values at the top edge into the last bucket.
  bucket_width = range_ / tf.cast(bucket_count, tf.float64)
  offsets = data - min_
  bucket_indices = tf.cast(tf.floor(offsets / bucket_width), dtype=tf.int32)
  clamped_indices = tf.minimum(bucket_indices, bucket_count - 1)
  one_hots = tf.one_hot(clamped_indices, depth=bucket_count)
  bucket_counts = tf.cast(tf.reduce_sum(one_hots, axis=0), dtype=tf.float64)
  edges = tf.lin_space(min_, max_, bucket_count + 1)
  left_edges = edges[:-1]
  right_edges = edges[1:]
  # Return rows of (left_edge, right_edge, count).
  return tf.transpose(
      tf.stack([left_edges, right_edges, bucket_counts]))
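
# Standalone sketch of the same bucketing logic outside its enclosing
# histogram op (the names `data`, `min_`, `max_`, `range_`, and `bucket_count`
# mirror the closure above; the values are illustrative):
def _demo_histogram_buckets():
  data = tf.cast(tf.lin_space(0., 10., 100), tf.float64)
  bucket_count = 5
  min_ = tf.reduce_min(data)
  max_ = tf.reduce_max(data)
  range_ = max_ - min_
  bucket_width = range_ / tf.cast(bucket_count, tf.float64)
  indices = tf.minimum(
      tf.cast(tf.floor((data - min_) / bucket_width), tf.int32),
      bucket_count - 1)
  counts = tf.reduce_sum(tf.one_hot(indices, depth=bucket_count), axis=0)
  with tf.Session() as sess:
    print(sess.run(counts))  # [20. 20. 20. 20. 20.]
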
def generate_cartesian_grid(resolution, fov):
  """Get (x, y, z) coordinates of all pixel centres in the image.

  The image plane lies at z=-1 and the image center is (0, 0, -1).

  Args:
    resolution: a 2-D list containing the resolution (height, width) of the
      desired output.
    fov: (float) camera's horizontal field of view in degrees.

  Returns:
    3-D tensor of shape `[HEIGHT, WIDTH, 3]`.

  Raises:
    ValueError: 'resolution' is not valid.
  """
  with tf.name_scope(None, 'generate_cartesian_grid', [resolution, fov]):
    if not isinstance(resolution, list) or len(resolution) != 2:
      raise ValueError("'resolution' is not valid.")

    fov = fov / 180 * math.pi
    width = 2 * tf.tan(fov / 2)
    height = width * resolution[0] / resolution[1]
    pixel_size = width / resolution[1]
    x_range = width - pixel_size
    y_range = height - pixel_size
    # x increases from left to right while y increases from bottom to top.
    # Use the half-integer pixel centre convention, and generate the
    # coordinates for the centres of the pixels.
    # For example, a 2x3 grid with pixel_size=1 (height=2, width=3) should
    # have:
    #   [(-1.0, 0.5), (0.0, 0.5), (1.0, 0.5),
    #    (-1.0, -0.5), (0.0, -0.5), (1.0, -0.5)]
    xx, yy = tf.meshgrid(
        tf.lin_space(-x_range / 2, x_range / 2, resolution[1]),
        tf.lin_space(y_range / 2, -y_range / 2, resolution[0]))
    grid = tf.stack([xx, yy, -tf.ones_like(xx)], axis=-1)
    return grid
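
# Usage sketch (illustrative demo; 90-degree horizontal field of view):
def _demo_generate_cartesian_grid():
  grid = generate_cartesian_grid([2, 3], 90.0)
  with tf.Session() as sess:
    print(sess.run(grid).shape)  # (2, 3, 3)
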
def generateCoords(inputShape):
    crop_size = inputShape[-2]
    firstDim = inputShape[0]
    Xcoords = tf.expand_dims(tf.lin_space(-1.0, 1.0, crop_size), axis=0)
    Xcoords = tf.tile(Xcoords, [crop_size, 1])
    Ycoords = -1 * tf.transpose(Xcoords)  # Put -1 at the bottom of the table.
    Xcoords = tf.expand_dims(Xcoords, axis=-1)
    Ycoords = tf.expand_dims(Ycoords, axis=-1)
    coords = tf.concat([Xcoords, Ycoords], axis=-1)
    # Add a dimension to support batch size and nbRenderings; the shape
    # should now be [1, crop_size, crop_size, 2].
    coords = tf.expand_dims(coords, axis=0)
    # Tile to the proper batch dimension for the concat.
    coords = tf.tile(coords, [firstDim, 1, 1, 1])
    return coords
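
# Usage sketch (illustrative demo; not part of the original code):
def _demo_generateCoords():
    coords = generateCoords([2, 8, 8, 3])  # Batch of 2, 8x8 crops.
    with tf.Session() as sess:
        print(sess.run(coords).shape)  # (2, 8, 8, 2)
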
def equirectangular_area_weights(height):
  """Generates area weights for pixels in equirectangular images.

  This is to account for the area difference of pixels at different latitudes
  on equirectangular grids.

  Args:
    height: the height dimension of the equirectangular images.

  Returns:
    Area weights with shape [1, HEIGHT, 1, 1].
  """
  with tf.name_scope(None, 'equirectangular_area_weights', [height]):
    pixel_h = math.pi / tf.cast(height, tf.float32)
    # Use the half-integer pixel centre convention, and generate the spherical
    # coordinates for the centres of the pixels.
    colatitude = tf.lin_space(pixel_h / 2, math.pi - pixel_h / 2, height)
    colatitude = colatitude[tf.newaxis, :, tf.newaxis, tf.newaxis]
    return tf.sin(colatitude)
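
# Usage sketch (illustrative demo; not part of the original code):
def _demo_equirectangular_area_weights():
  weights = equirectangular_area_weights(64)
  with tf.Session() as sess:
    print(sess.run(weights).shape)  # (1, 64, 1, 1)
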
def generateSurfaceArray(crop_size, pixelsToAdd=0):
    totalSize = crop_size + (pixelsToAdd * 2)
    XsurfaceArray = tf.expand_dims(tf.lin_space(-1.0, 1.0, totalSize), axis=0)
    XsurfaceArray = tf.tile(XsurfaceArray, [totalSize, 1])
    # Put -1 at the bottom of the table.
    YsurfaceArray = -1 * tf.transpose(XsurfaceArray)
    XsurfaceArray = tf.expand_dims(XsurfaceArray, axis=-1)
    YsurfaceArray = tf.expand_dims(YsurfaceArray, axis=-1)
    surfaceArray = tf.concat([
        XsurfaceArray, YsurfaceArray,
        tf.zeros([totalSize, totalSize, 1], dtype=tf.float32)
    ], axis=-1)
    # Add dimensions to support batch size and nbRenderings.
    surfaceArray = tf.expand_dims(tf.expand_dims(surfaceArray, axis=0), axis=0)
    return surfaceArray
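
# Usage sketch (illustrative demo; not part of the original code):
def _demo_generateSurfaceArray():
    surface = generateSurfaceArray(8, pixelsToAdd=2)
    with tf.Session() as sess:
        print(sess.run(surface).shape)  # (1, 1, 12, 12, 3)
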
def extract_glimpses(self, images, locations):
  """Extracts fovea-like glimpses.

  Args:
    images: 4-D Tensor of shape [batch, height, width, channels].
    locations: 2-D Tensor of shape [batch, 2] with glimpse locations.
      Locations are in the interval of [-1, 1] where points:
      (-1, -1): upper left corner.
      (-1, 1): upper right corner.
      (1, 1): lower right corner.
      (1, -1): lower left corner.

  Returns:
    A list of num_resolutions 4-D tensors of shape
    [batch, glimpse_height, glimpse_width, channels], one per resolution,
    the first at full resolution.
  """
  # Get multi-resolution fields of view (the first is full resolution).
  image_shape = tf.cast(tf.shape(images)[1:3], dtype=tf.float32)
  start = tf.cast(self.glimpse_shape[0], dtype=tf.float32) / image_shape[0]
  fields_of_view = tf.cast(tf.lin_space(start, 1., self.num_resolutions),
                           dtype=tf.float32)
  receptive_fields = [self.glimpse_shape] + [
      tf.cast(fields_of_view[i] * image_shape, dtype=tf.int32)
      for i in range(1, self.num_resolutions)
  ]

  images_glimpses_list = []
  for field in receptive_fields:
    # Extract a glimpse with specific shape and scale.
    images_glimpse = utils.extract_glimpse(images, size=field,
                                           offsets=locations)
    # Bigger receptive fields have lower resolution.
    images_glimpse = tf.image.resize_images(images_glimpse,
                                            size=self.glimpse_shape)
    # Stop gradient.
    if self.apply_stop_gradient:
      images_glimpse = tf.stop_gradient(images_glimpse)
    images_glimpses_list.append(images_glimpse)
  return images_glimpses_list
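
# Usage sketch (commented out; the attribute values are illustrative
# assumptions, and `utils.extract_glimpse` must be provided by the
# surrounding module):
# extractor.glimpse_shape = [8, 8]
# extractor.num_resolutions = 3
# extractor.apply_stop_gradient = False
# images = tf.zeros([2, 64, 64, 3])
# locations = tf.zeros([2, 2])  # Glimpses centred on the images.
# glimpses = extractor.extract_glimpses(images, locations)  # List of 3 tensors.
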
    [(0, 0, 0), (1, 0, 0), (1, 1, 0), (2, 1, 0)]  # L
]

dataset = [np.array(points_) for points_ in tetris]
num_classes = len(dataset)

tf.disable_eager_execution()

# In[20]:

# Radial basis functions.
rbf_low = 0.0
rbf_high = 3.5
rbf_count = 4
rbf_spacing = (rbf_high - rbf_low) / rbf_count
centers = tf.cast(tf.lin_space(rbf_low, rbf_high, rbf_count), FLOAT_TYPE)

# In[23]:

# r : [N, 3] point positions of one tetris shape.
r = tf.placeholder(FLOAT_TYPE, shape=(4, 3))
# rij : [N, N, 3] pairwise differences.
rij = utils.difference_matrix(r)
# dij : [N, N] pairwise distances.
dij = utils.distance_matrix(r)
# rbf : [N, N, rbf_count] Gaussian radial basis expansion of the distances.
gamma = 1. / rbf_spacing
rbf = tf.exp(-gamma * tf.square(tf.expand_dims(dij, axis=-1) - centers))
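
# In[ ]:

# Evaluation sketch (illustrative, not from the original notebook; feeds the
# L-shape points into the placeholder defined above):
with tf.Session() as sess:
    print(sess.run(rbf, feed_dict={r: dataset[0]}).shape)  # (4, 4, 4)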