def homography_shift_mult(corner_shift1, w1, h1, corner_shift2, w2, h2, w, h): """Multiplies two homographies. Args: corner_shift1: a homography transformation parameterized as the displacement of four corner points. It is of data type float32 and of shape [8] w1: the width of the image where corner_shift1 is computed from h1: the height of the image where corner_shift1 is computed from corner_shift2: a homography transformation parameterized as the displacement of four corner points, with the same data type and shape as corner_shift1 w2: the width of the image where corner_shift2 is computed from h2: the height of the image where corner_shift2 is computed from w: the width of the image where the output corner_shift is computed from h: the height of the image where the output corner_shift is computed from Returns: the product of the two homographies of the same shape and data type as corner_shift1 """ hmg1 = shifts_to_homography(w1, h1, corner_shift1, is_forward=False, is_matrix=True) mat_scale1 = tf.reshape( tf.stack([ tf.to_float(w1) / tf.to_float(w), 0, 0, 0, tf.to_float(h1) / tf.to_float(h), 0, 0, 0, 1 ]), [3, 3]) mat1 = tf.matmul(tf.matrix_inverse(mat_scale1), tf.matmul(hmg1, mat_scale1)) hmg2 = shifts_to_homography(w2, h2, corner_shift2, is_forward=False, is_matrix=True) mat_scale2 = tf.reshape( tf.stack([ tf.to_float(w2) / tf.to_float(w), 0, 0, 0, tf.to_float(h2) / tf.to_float(h), 0, 0, 0, 1 ]), [3, 3]) mat2 = tf.matmul(tf.matrix_inverse(mat_scale2), tf.matmul(hmg2, mat_scale2)) hmg = tf.matrix_inverse(tf.matmul(mat1, mat2)) return homography_to_shifts(hmg, w, h, is_matrix=True)
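# Hedged usage sketch for homography_shift_mult (not from the source; sizes and
# values are illustrative, and it assumes shifts_to_homography /
# homography_to_shifts from this codebase are in scope): compose a corner-shift
# estimated on a 640x480 image with one estimated on a 320x240 image,
# expressing the result at 640x480.
shift_a = tf.constant([1., 0., 0., 1., -1., 0., 0., -1.])  # [8] corner shifts
shift_b = tf.zeros([8])
combined_shift = homography_shift_mult(shift_a, 640, 480, shift_b, 320, 240,
                                       640, 480)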
def unprocess(self, image): with tf.name_scope(None, 'unprocess'): image.shape.assert_is_compatible_with([None, None, 3]) # Randomly creates image metadata. rgb2cam = unprocess.random_ccm() rgb_gain, red_gain, blue_gain = unprocess.random_gains() rgb2cam = tf.cond( self.is_train, true_fn=lambda: rgb2cam, false_fn=lambda: tf.where(tf.math.is_nan(self._ccm), rgb2cam, self._ccm)) rgb_gain = tf.cond( self.is_train, true_fn=lambda: rgb_gain, false_fn=lambda: tf.where(tf.math.is_nan(self._rgb_gain), rgb_gain, self._rgb_gain)) red_gain = tf.cond( self.is_train, true_fn=lambda: red_gain, false_fn=lambda: tf.where(tf.math.is_nan(self._red_gain), red_gain, self._red_gain)) blue_gain = tf.cond( self.is_train, true_fn=lambda: blue_gain, false_fn=lambda: tf.where(tf.math.is_nan(self._blue_gain), blue_gain, self._blue_gain)) cam2rgb = tf.matrix_inverse(rgb2cam) if self.simple_unprocessing: # Inverts gamma compression. image = unprocess.gamma_expansion(image) # Inverts color correction. image = unprocess.apply_ccm(image, rgb2cam) else: # Approximately inverts global tone mapping. image = unprocess.inverse_smoothstep(image) # Inverts gamma compression. image = unprocess.gamma_expansion(image) # Inverts color correction. image = unprocess.apply_ccm(image, rgb2cam) # Approximately inverts white balance and brightening. image = unprocess.safe_invert_gains(image, rgb_gain, red_gain, blue_gain) # Clips saturated pixels. image = tf.clip_by_value(image, 0.0, 1.0) # Applies a Bayer mosaic. bayer_image = unprocess.mosaic(image) metadata = { 'cam2rgb': cam2rgb, 'rgb_gain': rgb_gain, 'red_gain': red_gain, 'blue_gain': blue_gain, } return image, bayer_image, metadata
def get_constraint(nll, params): hessian = [ tf.gradients(g, params) for g in tf.unstack(tf.gradients(nll, params)) ] inverse = tf.matrix_inverse(hessian) covariance_poi = inverse[0][0] constraint = tf.sqrt(covariance_poi) return constraint
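# Toy sketch (an assumption-laden illustration, TF1 graph mode): for
# nll = 0.5 * n * (mu - xbar)^2 / sigma^2 the Hessian w.r.t. mu is
# n / sigma^2, so the constraint should come out as sigma / sqrt(n).
mu = tf.Variable([0.0])
toy_nll = 0.5 * 100.0 * tf.square(mu[0] - 1.0) / 4.0  # n = 100, sigma^2 = 4
toy_constraint = get_constraint(toy_nll, mu)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(toy_constraint))  # ~[0.2] = 2 / sqrt(100)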
def get_metrics(links, idx): v_rot_idx = tf.gather(links.v_rot, idx) # all possible seg pairs V = v_rot_idx[newaxis] + v_rot_idx[:, newaxis] eps = 1e-9 eye_eps = -eps * tf.eye( links.points0.shape[-1], batch_shape=(1, 1), dtype=tf.float64) ginv = tf.to_double(tf.matmul(V, V, transpose_b=True)) + eye_eps gij = tf.to_float(tf.matrix_inverse(ginv)) return gij, ginv
def subpixel_homography(image, height, width, dy1, dx1, dy2, dx2, dy3, dx3, dy4, dx4): """Applies a homography to an image. Args: image: input image of shape [input_height, input_width, channels] and of data type uint8 or float32 height: the output image height width: the output image width dy1: the vertical shift of the top left corner dx1: the horizontal shift of the top left corner dy2: the vertical shift of the bottom left corner dx2: the horizontal shift of the bottom left corner dy3: the vertical shift of the top right corner dx3: the horizontal shift of the top right corner dy4: the vertical shift of the bottom right corner dx4: the horizontal shift of the bottom right corner Returns: the warping result of shape [height, width, channels] with the same data type as image """ rx1 = tf.cast(tf.stack([0, 0, 1, 0, 0, 0, 0, 0]), tf.float32) ry1 = tf.cast(tf.stack([0, 0, 0, 0, 0, 1, 0, 0]), tf.float32) rx2 = tf.cast( tf.stack([0, height - 1, 1, 0, 0, 0, 0, -(height - 1) * dx2]), tf.float32) ry2 = tf.cast( tf.stack([0, 0, 0, 0, height - 1, 1, 0, -(height - 1) * dy2]), tf.float32) rx3 = tf.cast(tf.stack([width - 1, 0, 1, 0, 0, 0, -(width - 1) * dx3, 0]), tf.float32) ry3 = tf.cast(tf.stack([0, 0, 0, width - 1, 0, 1, -(width - 1) * dy3, 0]), tf.float32) rx4 = tf.cast( tf.stack([ width - 1, height - 1, 1, 0, 0, 0, -(width - 1) * dx4, -(height - 1) * dx4 ]), tf.float32) ry4 = tf.cast( tf.stack([ 0, 0, 0, width - 1, height - 1, 1, -(width - 1) * dy4, -(height - 1) * dy4 ]), tf.float32) mat = tf.stack([rx1, ry1, rx2, ry2, rx3, ry3, rx4, ry4]) b = tf.reshape( tf.cast(tf.stack([dx1, dy1, dx2, dy2, dx3, dy3, dx4, dy4]), tf.float32), [8, 1]) inv_mat = tf.matrix_inverse(mat) transformation = tf.reshape(tf.matmul(inv_mat, b), [8]) warped = contrib_image.transform(image, transformation, 'bilinear') cropped = tf.image.crop_to_bounding_box(warped, 0, 0, height, width) return cropped
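# Hedged usage sketch (values illustrative; assumes the same imports as the
# source, including contrib_image): warp a 240x320 image by nudging only its
# bottom-right corner one pixel down and to the right.
image = tf.zeros([240, 320, 3], dtype=tf.float32)
warped = subpixel_homography(image, 240, 320,
                             0., 0.,  # dy1, dx1: top left
                             0., 0.,  # dy2, dx2: bottom left
                             0., 0.,  # dy3, dx3: top right
                             1., 1.)  # dy4, dx4: bottom right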
def image_overlap(depth1, pose1_c2w, depth2, pose2_c2w, intrinsics): """Determines the overlap of two images.""" pose1_w2c = tf.matrix_inverse( tf.concat([pose1_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3] pose2_w2c = tf.matrix_inverse( tf.concat([pose2_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3] p_world1 = camera_to_world_projection(depth1, intrinsics, pose1_c2w) p_image1_in_2, z1_c2 = world_to_camera_projection(p_world1, intrinsics, pose2_w2c) p_world2 = camera_to_world_projection(depth2, intrinsics, pose2_c2w) p_image2_in_1, z2_c1 = world_to_camera_projection(p_world2, intrinsics, pose1_w2c) shape = depth1.shape.as_list() height, width = shape[0], shape[1] height = tf.cast(height, tf.float32) width = tf.cast(width, tf.float32) mask_h2_in_1 = tf.logical_and( tf.less_equal(p_image2_in_1[:, :, 1], height), tf.greater_equal(p_image2_in_1[:, :, 1], 0.)) mask_w2_in_1 = tf.logical_and(tf.less_equal(p_image2_in_1[:, :, 0], width), tf.greater_equal(p_image2_in_1[:, :, 0], 0.)) mask2_in_1 = tf.logical_and(tf.logical_and(mask_h2_in_1, mask_w2_in_1), z2_c1 > 0) mask_h1_in_2 = tf.logical_and( tf.less_equal(p_image1_in_2[:, :, 1], height), tf.greater_equal(p_image1_in_2[:, :, 1], 0.)) mask_w1_in_2 = tf.logical_and(tf.less_equal(p_image1_in_2[:, :, 0], width), tf.greater_equal(p_image1_in_2[:, :, 0], 0.)) mask1_in_2 = tf.logical_and(tf.logical_and(mask_h1_in_2, mask_w1_in_2), z1_c2 > 0) return mask1_in_2, mask2_in_1
def orthonorm_op(x, epsilon=1e-7):
    '''
    Computes a matrix that orthogonalizes the input matrix x

    x:        an n x d input matrix
    epsilon:  small constant added to the diagonal of x'x to keep its diagonal
              entries away from zero, so the Cholesky factorization is well
              defined

    returns:  a d x d matrix, ortho_weights, which orthogonalizes x by
              right multiplication
    '''
    x_2 = K.dot(K.transpose(x), x)
    x_2 += K.eye(K.int_shape(x)[1]) * epsilon
    L = tf.cholesky(x_2)
    ortho_weights = tf.transpose(tf.matrix_inverse(L)) * tf.sqrt(
        tf.cast(tf.shape(x)[0], dtype=K.floatx()))
    return ortho_weights
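# Numpy check of the identity behind orthonorm_op (illustrative, not from the
# source): with W = inv(chol(X'X + eps*I))' * sqrt(n), the projected matrix
# X @ W satisfies (XW)'(XW) ~= n * I.
import numpy as np
X_np = np.random.randn(100, 5)
L_np = np.linalg.cholesky(X_np.T @ X_np + 1e-7 * np.eye(5))
W_np = np.linalg.inv(L_np).T * np.sqrt(100)
np.testing.assert_allclose((X_np @ W_np).T @ (X_np @ W_np),
                           100 * np.eye(5), atol=1e-4)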
def _forward(length, angle_x, angle_y, T):
    """ Given an articulation (bone length and two rotation angles), calculates the
        update to the coordinate matrix and the location of the end point in
        global coordinates. """
    # update current transformation from local -> new local
    T_this = tf.matmul(
        _get_trans_mat_hom(-length),
        tf.matmul(_get_rot_mat_x_hom(-angle_x), _get_rot_mat_y_hom(-angle_y)))

    # trafo from global -> new local
    T = tf.matmul(T_this, T)

    # calculate global location of this point
    # x0 = tf.constant([[0.0], [0.0], [0.0], [1.0]])
    s = length.get_shape().as_list()
    x0 = _to_hom(tf.zeros((s[0], 3, 1)))
    x = tf.matmul(tf.matrix_inverse(T), x0)
    return x, T
def format_network_input(self, ref_image, psv_src_images, ref_pose, psv_src_poses, planes, intrinsics): """Format the network input. Args: ref_image: reference source image [batch, height, width, 3] psv_src_images: stack of source images (excluding the ref image) [batch, height, width, 3*(num_source -1)] ref_pose: reference world-to-camera pose (where PSV is constructed) [batch, 4, 4] psv_src_poses: input poses (world to camera) [batch, num_source-1, 4, 4] planes: list of scalar depth values for each plane intrinsics: camera intrinsics [batch, 3, 3] Returns: net_input: [batch, height, width, #planes, num_source*3] """ _, num_psv_source, _, _ = psv_src_poses.get_shape().as_list() num_planes = tf.shape(planes)[0] net_input = [] for i in range(num_psv_source): curr_pose = tf.matmul(psv_src_poses[:, i], tf.matrix_inverse(ref_pose)) curr_image = psv_src_images[:, :, :, i * 3:(i + 1) * 3] curr_psv = pj.plane_sweep(curr_image, planes, curr_pose, intrinsics) net_input.append(curr_psv) net_input = tf.concat(net_input, axis=4) ref_img_stack = tf.tile(tf.expand_dims(ref_image, 3), [1, 1, 1, num_planes, 1]) net_input = tf.concat([net_input, ref_img_stack], axis=4) # Append normalized plane indices normalized_disp_inds = tf.reshape(tf.linspace(0.0, 1.0, num_planes), [1, 1, 1, num_planes, 1]) sh = tf.shape(net_input) normalized_disp_inds_stack = tf.tile(normalized_disp_inds, [1, sh[1], sh[2], 1, 1]) net_input = tf.concat([net_input, normalized_disp_inds_stack], axis=4) return net_input
def inv_homography_dmat(k_t, rot, t, n_hat, a): """Computes M where M*(u,v,1) = d_t. Args: k_t: intrinsics for target cameras, are [...] X 3 X 3 matrices rot: relative rotation, are [...] X 3 X 3 matrices t: [...] X 3 X 1, translations from source to target camera n_hat: [...] X 1 X 3, plane normal w.r.t source camera frame a: [...] X 1 X 1, plane equation displacement Returns: d_mat: [...] X 1 X 3 matrices """ with tf.name_scope('inv_homography'): rot_t = _transpose(rot) k_t_inv = tf.matrix_inverse(k_t, name='k_t_inv') denom = a - tf.matmul(tf.matmul(n_hat, rot_t), t) d_mat = divide_safe( -1 * tf.matmul(tf.matmul(n_hat, rot_t), k_t_inv), denom, name='dmat') return d_mat
def pixel2cam(depth, pixel_coords, intrinsics, is_homogeneous=True): """Transforms coordinates in the pixel frame to the camera frame. Args: depth: [batch, height, width] pixel_coords: homogeneous pixel coordinates [batch, 3, height, width] intrinsics: camera intrinsics [batch, 3, 3] is_homogeneous: return in homogeneous coordinates Returns: Coords in the camera frame [batch, 3 (4 if homogeneous), height, width] """ batch, height, width = depth.get_shape().as_list() depth = tf.reshape(depth, [batch, 1, -1]) pixel_coords = tf.reshape(pixel_coords, [batch, 3, -1]) cam_coords = tf.matmul(tf.matrix_inverse(intrinsics), pixel_coords) * depth if is_homogeneous: ones = tf.ones([batch, 1, height * width]) cam_coords = tf.concat([cam_coords, ones], axis=1) cam_coords = tf.reshape(cam_coords, [batch, -1, height, width]) return cam_coords
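# Minimal back-projection sketch (assumes TF1; intrinsics are illustrative):
# lift a constant 2 m depth map into homogeneous camera coordinates.
batch, height, width = 1, 4, 4
depth = 2.0 * tf.ones([batch, height, width])
xx, yy = tf.meshgrid(tf.range(width), tf.range(height))
grid = tf.cast(tf.stack([xx, yy, tf.ones_like(xx)], axis=0), tf.float32)
pixel_coords = tf.tile(grid[tf.newaxis], [batch, 1, 1, 1])  # [batch, 3, h, w]
intrinsics = tf.constant([[[100., 0., 2.], [0., 100., 2.], [0., 0., 1.]]])
cam_coords = pixel2cam(depth, pixel_coords, intrinsics)  # [batch, 4, h, w]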
def latent_loss(self, prior): """ Analytic expression for latent loss which can be used when posterior and prior are Gaussian https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback%E2%80%93Leibler_divergence :param prior: Vertexwise Prior instance which defines the ``mean`` and ``cov`` vertices attributes """ prior_cov_inv = tf.matrix_inverse(prior.cov) mean_diff = tf.subtract(self.mean, prior.mean) term1 = tf.trace(tf.matmul(prior_cov_inv, self.cov)) term2 = tf.matmul(tf.reshape(mean_diff, (self.nvertices, 1, -1)), prior_cov_inv) term3 = tf.reshape(tf.matmul(term2, tf.reshape(mean_diff, (self.nvertices, -1, 1))), [self.nvertices]) term4 = prior.log_det_cov() term5 = self.log_det_cov() return self.log_tf(tf.identity(0.5*(term1 + term3 - self.nparams + term4 - term5), name="%s_latent_loss" % self.name))
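# Illustrative numpy check of the closed form used above (not from the source):
# in one dimension the matrix expression reduces to the textbook
# KL(N(m1, s1^2) || N(m2, s2^2)).
import numpy as np
m1, s1, m2, s2 = 0.5, 1.2, -0.3, 0.8
Sq, Sp, d = np.array([[s1 ** 2]]), np.array([[s2 ** 2]]), np.array([m1 - m2])
kl_mat = 0.5 * (np.trace(np.linalg.inv(Sp) @ Sq)
                + d @ np.linalg.inv(Sp) @ d - 1
                + np.linalg.slogdet(Sp)[1] - np.linalg.slogdet(Sq)[1])
kl_scalar = np.log(s2 / s1) + (s1 ** 2 + (m1 - m2) ** 2) / (2 * s2 ** 2) - 0.5
assert np.isclose(kl_mat, kl_scalar)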
def calc_homography_from_points(src_points, dst_points, is_matrix=True): """Computes a homography from four pairs of corresponding points. Args: src_points: source points of shape [4, 2] and of data type float32 or int32 dst_points: target points of shape [4, 2] and of data type float32 or int32 is_matrix: whether represent the final homography using matrix or vector Returns: the output homography of data type float32. If is_matrix is True, it is of shape [3, 3]; otherwise [8] """ mat_elements = [] r_vec_elements = [] for i in range(0, 4): rx = tf.to_float( tf.stack([ src_points[i, 0], src_points[i, 1], 1, 0, 0, 0, -dst_points[i, 0] * src_points[i, 0], -dst_points[i, 0] * src_points[i, 1] ])) ry = tf.to_float( tf.stack([ 0, 0, 0, src_points[i, 0], src_points[i, 1], 1, -dst_points[i, 1] * src_points[i, 0], -dst_points[i, 1] * src_points[i, 1] ])) mat_elements.append(rx) mat_elements.append(ry) r_vec_elements.append(dst_points[i, 0]) r_vec_elements.append(dst_points[i, 1]) mat = tf.stack(mat_elements) r_vec = tf.reshape(tf.to_float(tf.stack(r_vec_elements)), [8, 1]) inv_mat = tf.matrix_inverse(mat) transform = tf.reshape(tf.matmul(inv_mat, r_vec), [8]) if is_matrix: hmg = tf.reshape(tf.concat([transform, [1.0]], 0), [3, 3]) else: hmg = transform return hmg
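# Sanity sketch (assumes a TF1 session): mapping the unit square onto itself
# should recover an identity homography.
square = tf.constant([[0., 0.], [1., 0.], [0., 1.], [1., 1.]])
hmg_id = calc_homography_from_points(square, square, is_matrix=True)
with tf.Session() as sess:
    print(sess.run(hmg_id))  # ~3x3 identity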
def camera_to_world_projection(depth, intrinsics, camera_to_world):
    """Project camera coordinates to world coordinates."""
    # depth: [HEIGHT, WIDTH, 1] per-pixel radial distance from the camera eye.
    # intrinsics: [3, 3] camera intrinsic matrix.
    # camera_to_world: [3, 4] camera-to-world rotation and translation.
    shape = depth.shape.as_list()
    height, width = shape[0], shape[1]
    xx, yy = tf.meshgrid(tf.lin_space(0., width - 1., width),
                         tf.lin_space(0., height - 1., height))
    p_pixel = tf.stack([xx, yy], axis=-1)
    p_pixel_homogeneous = tf.concat([p_pixel, tf.ones([height, width, 1])], -1)
    camera_to_world = tf.tile(camera_to_world[tf.newaxis, tf.newaxis, :],
                              [height, width, 1, 1])
    intrinsics = tf.tile(intrinsics[tf.newaxis, tf.newaxis, :],
                         [height, width, 1, 1])
    # Convert pixel coordinates (u, v, 1) to camera coordinates (x_c, y_c, f)
    # on the image plane.
    p_image = tf.squeeze(
        tf.matmul(tf.matrix_inverse(intrinsics),
                  tf.expand_dims(p_pixel_homogeneous, -1)), -1)
    lookat_axis = tf.tile(tf.constant([0., 0., 1.], shape=[1, 1, 3]),
                          [height, width, 1])
    z = depth * tf.reduce_sum(
        tf.math.l2_normalize(p_image, axis=-1) * lookat_axis,
        axis=-1,
        keepdims=True)
    p_camera = z * p_image
    # convert from OpenCV convention to OpenGL
    p_camera = p_camera * tf.constant([1., 1., -1.], shape=[1, 1, 3])
    p_camera_homogeneous = tf.concat(
        [p_camera, tf.ones(shape=[height, width, 1])], -1)
    # Convert camera coordinates to world coordinates.
    p_world = tf.squeeze(
        tf.matmul(camera_to_world, tf.expand_dims(p_camera_homogeneous, -1)),
        -1)
    return p_world
def image_to_world_projection(depth, intrinsics, pose_c2w): """Project points on the image to the world frame. Args: depth: [HEIGHT, WIDTH, 1] the depth map contains the radial distance from the camera eye to each point corresponding to each pixel. intrinsics: [3, 3] camera's intrinsic matrix. pose_c2w: [3, 4] camera pose matrix (camera to world). Returns: [HEIGHT, WIDTH, 3] points in the world's coordinate frame. """ shape = depth.shape.as_list() height, width = shape[0], shape[1] xx, yy = tf.meshgrid(tf.lin_space(0., width - 1., width), tf.lin_space(0., height - 1., height)) p_pixel_homogeneous = tf.concat( [tf.stack([xx, yy], axis=-1), tf.ones([height, width, 1])], -1) p_image = tf.squeeze( tf.matmul(tf.matrix_inverse(intrinsics[tf.newaxis, tf.newaxis, :]), tf.expand_dims(p_pixel_homogeneous, -1)), -1) z = depth * tf.reduce_sum( tf.math.l2_normalize(p_image, axis=-1) * tf.constant([[[0., 0., 1.]]]), axis=-1, keepdims=True) p_camera = z * p_image # convert to OpenGL coordinate system. p_camera = p_camera * tf.constant([1., 1., -1.], shape=[1, 1, 3]) p_camera_homogeneous = tf.concat( [p_camera, tf.ones(shape=[height, width, 1])], -1) # Convert camera coordinates to world coordinates. p_world = tf.squeeze( tf.matmul(pose_c2w[tf.newaxis, tf.newaxis, :], tf.expand_dims(p_camera_homogeneous, -1)), -1) return p_world
def inv_homography(k_s, k_t, rot, t, n_hat, a): """Computes inverse homography matrix. Args: k_s: intrinsics for source cameras, are [...] X 3 X 3 matrices k_t: intrinsics for target cameras, are [...] X 3 X 3 matrices rot: relative rotation, are [...] X 3 X 3 matrices t: [...] X 3 X 1, translations from source to target camera n_hat: [...] X 1 X 3, plane normal w.r.t source camera frame a: [...] X 1 X 1, plane equation displacement Returns: homography: [...] X 3 X 3 inverse homography matrices """ with tf.name_scope('inv_homography'): rot_t = _transpose(rot) k_t_inv = tf.matrix_inverse(k_t, name='k_t_inv') denom = a - tf.matmul(tf.matmul(n_hat, rot_t), t) numerator = tf.matmul(tf.matmul(tf.matmul(rot_t, t), n_hat), rot_t) inv_hom = tf.matmul( tf.matmul(k_s, rot_t + divide_safe(numerator, denom)), k_t_inv, name='inv_hom') return inv_hom
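# Sanity sketch (TF1; illustrative values, relies on the divide_safe helper
# from this codebase): with zero relative motion and identical source/target
# intrinsics, the inverse homography should be ~identity.
k_mat = tf.constant([[[100., 0., 64.], [0., 100., 64.], [0., 0., 1.]]])
rot_id = tf.eye(3, batch_shape=[1])
t_zero = tf.zeros([1, 3, 1])
plane_n = tf.constant([[[0., 0., 1.]]])
plane_a = tf.ones([1, 1, 1])
hom_id = inv_homography(k_mat, k_mat, rot_id, t_zero, plane_n, plane_a)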
def tf_rotation_resampling(voxel_array, transformation_matrix, params, Scale_matrix=None, size=64, new_size=128):
    """
    Batch transformation and resampling function
    :param voxel_array: batch of voxels. Shape = [batch_size, height, width, depth, features]
    :param transformation_matrix: homogeneous rotation matrix. Shape = [batch_size, 4, 4]
    :param params: batch of transformation parameters; columns 3-5 hold the x/y/z shifts
    :param Scale_matrix: batch of homogeneous scaling matrices. Shape = [batch_size, 4, 4]
    :param size: original size of the voxel array
    :param new_size: size of the resampled array
    :return: transformed voxel array
    """
    batch_size = tf.shape(voxel_array)[0]
    n_channels = voxel_array.get_shape()[4].value
    target = tf.zeros([batch_size, new_size, new_size, new_size])
    # Align the centroid of the object (voxel grid) with the origin for rotation,
    # then move the centroid back to its original position in the grid.
    T = tf.constant([[1, 0, 0, -size * 0.5],
                     [0, 1, 0, -size * 0.5],
                     [0, 0, 1, -size * 0.5],
                     [0, 0, 0, 1]])
    # add one more dimension to T and then tile
    T = tf.tile(tf.reshape(T, (1, 4, 4)), [batch_size, 1, 1])

    # However, since the rotated grid might be out of bounds for the original
    # grid size, move the rotated grid to a new, bigger grid.
    T_new_inv = tf.constant([[1, 0, 0, new_size * 0.5],
                             [0, 1, 0, new_size * 0.5],
                             [0, 0, 1, new_size * 0.5],
                             [0, 0, 0, 1]])
    T_new_inv = tf.tile(tf.reshape(T_new_inv, (1, 4, 4)), [batch_size, 1, 1])

    # Add the actual shift in the x, y and z dimensions according to the input params.
    x_shift = tf.reshape(params[:, 3], (batch_size, 1, 1))
    y_shift = tf.reshape(params[:, 4], (batch_size, 1, 1))
    z_shift = tf.reshape(params[:, 5], (batch_size, 1, 1))
    # ========================================================
    # Because TensorFlow does not allow tensor item replacement, a new matrix
    # is created from scratch by concatenating vectors into rows and stacking them up.
    ones = tf.ones_like(x_shift)
    zeros = tf.zeros_like(x_shift)
    T_translate = tf.concat([
        tf.concat([ones, zeros, zeros, x_shift], axis=2),
        tf.concat([zeros, ones, zeros, y_shift], axis=2),
        tf.concat([zeros, zeros, ones, z_shift], axis=2),
        tf.concat([zeros, zeros, zeros, ones], axis=2)], axis=1)
    total_M = tf.matmul(
        tf.matmul(tf.matmul(tf.matmul(T_new_inv, T_translate), Scale_matrix),
                  transformation_matrix), T)
    try:
        total_M = tf.matrix_inverse(total_M)
        # Ignore the homogeneous coordinate so the results are 3D vectors.
        # Shape: (batch, 3, 4)
        total_M = total_M[:, 0:3, :]
        grid = tf_voxel_meshgrid(new_size, new_size, new_size, homogeneous=True)
        # Note: this creates a new_size^3 grid, but the T matrix only translates
        # it by size * 0.5, which does not align the grid with the origin.
        # Shape: (4, new_size^3), where 4 is 3 + 1 homogeneous coordinate.
        grid = tf.tile(
            tf.reshape(grid, (1, tf.to_int32(grid.get_shape()[0]),
                              tf.to_int32(grid.get_shape()[1]))),
            [batch_size, 1, 1])
        # (batch, 3, 4) matmul (batch, 4, new_size^3)
        grid_transform = tf.matmul(total_M, grid)
        x_s_flat = tf.reshape(grid_transform[:, 0, :], [-1])
        y_s_flat = tf.reshape(grid_transform[:, 1, :], [-1])
        z_s_flat = tf.reshape(grid_transform[:, 2, :], [-1])
        input_transformed = tf_interpolate(
            voxel_array, x_s_flat, y_s_flat, z_s_flat,
            [batch_size, new_size, new_size, new_size, n_channels])
        target = tf.reshape(
            input_transformed,
            [batch_size, new_size, new_size, new_size, n_channels])
        return target, grid_transform
    except tf.errors.InvalidArgumentError:
        return None
def multihead_invertible_1x1_conv_np(name, x, x_mask, multihead_split, inverse, dtype): """Multi-head 1X1 convolution on x.""" batch_size, length, n_channels_all = common_layers.shape_list(x) assert n_channels_all % 32 == 0 n_channels = 32 n_1x1_heads = n_channels_all // n_channels def get_init_np(): """Initializer function for multihead 1x1 parameters using numpy.""" results = [] for _ in range(n_1x1_heads): random_matrix = np.random.rand(n_channels, n_channels) np_w = scipy.linalg.qr(random_matrix)[0].astype("float32") np_p, np_l, np_u = scipy.linalg.lu(np_w) np_s = np.diag(np_u) np_sign_s = np.sign(np_s)[np.newaxis, :] np_log_s = np.log(np.abs(np_s))[np.newaxis, :] np_u = np.triu(np_u, k=1) results.append( np.concatenate([np_p, np_l, np_u, np_sign_s, np_log_s], axis=0)) return tf.convert_to_tensor(np.stack(results, axis=0)) def get_mask_init(): ones = tf.ones([n_1x1_heads, n_channels, n_channels], dtype=dtype) l_mask = tf.matrix_band_part(ones, -1, 0) - tf.matrix_band_part( ones, 0, 0) u_mask = tf.matrix_band_part(ones, 0, -1) - tf.matrix_band_part( ones, 0, 0) return tf.stack([l_mask, u_mask], axis=0) with tf.variable_scope(name, reuse=tf.AUTO_REUSE): params = tf.get_variable("params", initializer=get_init_np, dtype=dtype) mask_params = tf.get_variable("mask_params", initializer=get_mask_init, dtype=dtype, trainable=False) p = tf.stop_gradient(params[:, :n_channels, :]) l = params[:, n_channels:2 * n_channels, :] u = params[:, 2 * n_channels:3 * n_channels, :] sign_s = tf.stop_gradient(params[:, 3 * n_channels, :]) log_s = params[:, 3 * n_channels + 1, :] l_mask = mask_params[0] u_mask = mask_params[1] l_diag = l * l_mask + (tf.eye( n_channels, n_channels, [n_1x1_heads], dtype=dtype)) u_diag = u * u_mask + (tf.matrix_diag(sign_s * tf.exp(log_s))) w = tf.matmul(p, tf.matmul(l_diag, u_diag)) if multihead_split == "a": x = tf.reshape(x, [batch_size, length, n_channels, n_1x1_heads]) x = tf.transpose(x, [3, 0, 1, 2]) elif multihead_split == "c": x = tf.reshape(x, [batch_size, length, n_1x1_heads, n_channels]) x = tf.transpose(x, [2, 0, 1, 3]) else: raise ValueError("Multihead split not supported.") # [n_1x1_heads, batch_size, length, n_channels] if not inverse: # [n_1x1_heads, 1, n_channels, n_channels] x = tf.matmul(x, w[:, tf.newaxis, :, :]) else: w_inv = tf.matrix_inverse(w) x = tf.matmul(x, w_inv[:, tf.newaxis, :, :]) if multihead_split == "a": x = tf.transpose(x, [1, 2, 3, 0]) x = tf.reshape(x, [batch_size, length, n_channels * n_1x1_heads]) elif multihead_split == "c": x = tf.transpose(x, [1, 2, 0, 3]) x = tf.reshape(x, [batch_size, length, n_1x1_heads * n_channels]) else: raise ValueError("Multihead split not supported.") x_length = tf.reduce_sum(x_mask, -1) logabsdet = x_length * tf.reduce_sum(log_s) if inverse: logabsdet *= -1 return x, logabsdet
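# Numpy sketch of the LU parameterization used above (illustrative): the
# kernel w is rebuilt from p, the strictly lower part of l, the strictly upper
# part of u and the signed log-scale diagonal, mirroring the l_diag / u_diag
# construction.
import numpy as np
import scipy.linalg
w_np = scipy.linalg.qr(np.random.rand(32, 32))[0]
p_np, l_np, u_np = scipy.linalg.lu(w_np)
s_np = np.diag(u_np)
recon = (p_np
         @ (np.tril(l_np, -1) + np.eye(32))
         @ (np.triu(u_np, 1) + np.diag(np.sign(s_np) * np.exp(np.log(np.abs(s_np))))))
np.testing.assert_allclose(recon, w_np, atol=1e-8)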
""" dataset_without_bias = pd.read_csv(FILE_PATH, header=None) num_examples = dataset_without_bias.shape[0] dataset = np.c_[np.ones((num_examples, 1)), dataset_without_bias] # The two dataset without different labels data_label_0, data_label_1 = split_data(dataset) # Define the formulas, proved that this can drastically drag the performance # m is the number of attributes m = dataset.shape[1] - 1 # don't count the label itself X = tf.placeholder(tf.float64, shape=(None, m), name='X') t = tf.placeholder(tf.float64, shape=(None, 1), name='t') n = tf.placeholder(tf.float64, name='n') XT = tf.transpose(X) w = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), t) y = tf.matmul(X, w) MSE = tf.div(tf.matmul(tf.transpose(y - t), y - t), n) w_star = tf.placeholder(tf.float64, shape=(m, 1), name="w_star") y_test = tf.matmul(X, w_star) y_test_predicted = tf.round(y_test) MSE_test = tf.abs(y_test_predicted - t) for sample_size in range(40, 201, 40): accuracy_rate = [] for exp in range(NUM_EXP): training_attr, training_label, test_attr, test_label = generate_training_and_test_dataset(
def get(self):
    """ Provides input data to the graph. """
    # calculate size of each record (this lists what is contained in the db
    # and how many bytes are occupied)
    record_bytes = 0

    encoding_bytes = 4
    kp_xyz_entries = 3 * self.num_kp
    record_bytes += encoding_bytes * kp_xyz_entries

    encoding_bytes = 4
    kp_uv_entries = 2 * self.num_kp
    record_bytes += encoding_bytes * kp_uv_entries

    kp_vis_entries = self.num_kp
    record_bytes += encoding_bytes * kp_vis_entries

    image_bytes = self.image_size[0] * self.image_size[1] * 3
    record_bytes += image_bytes

    """ READ DATA ITEMS """
    # Start reader
    reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes)
    _, value = reader.read(tf.train.string_input_producer([self.path_to_db]))

    # decode to floats
    bytes_read = 0
    data_dict = dict()
    record_bytes_float32 = tf.decode_raw(value, tf.float32)

    # 1. Read keypoint xyz
    keypoint_xyz21 = tf.reshape(
        tf.slice(record_bytes_float32, [bytes_read // 4], [kp_xyz_entries]),
        [self.num_kp, 3])
    bytes_read += encoding_bytes * kp_xyz_entries
    keypoint_xyz21 /= 1000.0  # scale to meters
    keypoint_xyz21 = self.convert_kp(keypoint_xyz21)

    # calculate wrist coord
    if self.use_wrist_coord:
        wrist_xyz = keypoint_xyz21[16, :] + 2.0 * (keypoint_xyz21[0, :] - keypoint_xyz21[16, :])
        keypoint_xyz21 = tf.concat([tf.expand_dims(wrist_xyz, 0), keypoint_xyz21[1:, :]], 0)

    data_dict['keypoint_xyz21'] = keypoint_xyz21

    # 2. Read keypoint uv AND VIS
    keypoint_uv_vis21 = tf.reshape(
        tf.slice(record_bytes_float32, [bytes_read // 4], [kp_uv_entries + kp_vis_entries]),
        [self.num_kp, 3])
    bytes_read += encoding_bytes * (kp_uv_entries + kp_vis_entries)
    keypoint_uv_vis21 = self.convert_kp(keypoint_uv_vis21)
    keypoint_uv21 = keypoint_uv_vis21[:, :2]
    keypoint_vis21 = tf.equal(keypoint_uv_vis21[:, 2], 1.0)

    # calculate wrist vis
    if self.use_wrist_coord:
        wrist_vis = tf.logical_or(keypoint_vis21[16], keypoint_vis21[0])
        keypoint_vis21 = tf.concat([tf.expand_dims(wrist_vis, 0), keypoint_vis21[1:]], 0)

        wrist_uv = keypoint_uv21[16, :] + 2.0 * (keypoint_uv21[0, :] - keypoint_uv21[16, :])
        keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0)

    data_dict['keypoint_vis21'] = keypoint_vis21

    if self.coord_uv_noise:
        noise = tf.truncated_normal([42, 2], mean=0.0, stddev=self.coord_uv_noise_sigma)
        keypoint_uv21 += noise

    data_dict['keypoint_uv21'] = keypoint_uv21

    # decode to uint8
    record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

    # 4. Read image
    image = tf.reshape(
        tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
        [self.image_size[0], self.image_size[1], 3])
    image = tf.cast(image, tf.float32)
    bytes_read += image_bytes

    # subtract mean
    image = image / 255.0 - 0.5
    if self.hue_aug:
        image = tf.image.random_hue(image, self.hue_aug_max)
    data_dict['image'] = image

    """ CONSTANTS """
    # Camera intrinsics
    sx = 822.79041
    sy = 822.79041
    tx = 318.47345
    ty = 250.31296
    data_dict['cam_mat'] = tf.constant([[sx, 0.0, tx],
                                        [0.0, sy, ty],
                                        [0.0, 0.0, 1.0]])

    # Hand side: this dataset only contains left hands
    data_dict['hand_side'] = tf.one_hot(tf.constant(0, dtype=tf.int32),
                                        depth=2, on_value=1.0, off_value=0.0,
                                        dtype=tf.float32)

    assert bytes_read == record_bytes, "Doesn't add up."

    """ DEPENDENT DATA ITEMS: XYZ representations. """
    # make coords relative to root joint
    kp_coord_xyz_root = keypoint_xyz21[0, :]  # this is the palm coord
    kp_coord_xyz21_rel = keypoint_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
    index_root_bone_length = tf.sqrt(
        tf.reduce_sum(tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
    data_dict['keypoint_scale'] = index_root_bone_length
    data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

    # calculate local coordinates
    kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
    data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

    # calculate viewpoint and coords in canonical coordinates
    kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
    data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
    data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

    """ DEPENDENT DATA ITEMS: HAND CROP """
    if self.hand_crop:
        crop_center = keypoint_uv21[12, ::-1]

        # catch problem, when no valid kp available (happens almost never)
        crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                              lambda: crop_center,
                              lambda: tf.constant([0.0, 0.0]))
        crop_center.set_shape([2, ])

        if self.crop_center_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
            crop_center += noise

        crop_scale_noise = tf.constant(1.0)
        if self.crop_scale_noise:
            crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=1.0, maxval=1.2))

        if not self.use_wrist_coord:
            wrist_uv = keypoint_uv21[16, :] + 2.0 * (keypoint_uv21[0, :] - keypoint_uv21[16, :])
            keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0)

        # select visible coords only
        kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
        kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
        kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

        # determine size of crop (measure spatial extent of hw coords first)
        min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
        max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

        # find out larger distance wrt the center of crop
        crop_size_best = 2 * tf.maximum(max_coord - crop_center, crop_center - min_coord)
        crop_size_best = tf.reduce_max(crop_size_best)
        crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

        # catch problem, when no valid kp available
        crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)),
                                 lambda: crop_size_best,
                                 lambda: tf.constant(200.0))
        crop_size_best.set_shape([])

        # calculate necessary scaling
        scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
        scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
        scale *= crop_scale_noise
        data_dict['crop_scale'] = scale

        if self.crop_offset_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
            crop_center += noise

        # Crop image
        img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale)
        data_dict['image_crop'] = tf.squeeze(img_crop)

        # Modify uv21 coordinates
        crop_center_float = tf.cast(crop_center, tf.float32)
        keypoint_uv21_u = (data_dict['keypoint_uv21'][:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
        keypoint_uv21_v = (data_dict['keypoint_uv21'][:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
        keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
        data_dict['keypoint_uv21'] = keypoint_uv21

        # Modify camera intrinsics
        scale = tf.reshape(scale, [1, ])
        scale_matrix = tf.dynamic_stitch([[0], [1], [2],
                                          [3], [4], [5],
                                          [6], [7], [8]],
                                         [scale, [0.0], [0.0],
                                          [0.0], scale, [0.0],
                                          [0.0], [0.0], [1.0]])
        scale_matrix = tf.reshape(scale_matrix, [3, 3])

        crop_center_float = tf.cast(crop_center, tf.float32)
        trans1 = crop_center_float[0] * scale - self.crop_size // 2
        trans2 = crop_center_float[1] * scale - self.crop_size // 2
        trans1 = tf.reshape(trans1, [1, ])
        trans2 = tf.reshape(trans2, [1, ])
        trans_matrix = tf.dynamic_stitch([[0], [1], [2],
                                          [3], [4], [5],
                                          [6], [7], [8]],
                                         [[1.0], [0.0], -trans2,
                                          [0.0], [1.0], -trans1,
                                          [0.0], [0.0], [1.0]])
        trans_matrix = tf.reshape(trans_matrix, [3, 3])
        data_dict['cam_mat'] = tf.matmul(trans_matrix,
                                         tf.matmul(scale_matrix, data_dict['cam_mat']))

    """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints """
    # create scoremaps from the subset of 2D annotation
    keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

    scoremap_size = self.image_size
    if self.hand_crop:
        scoremap_size = (self.crop_size, self.crop_size)

    scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                 scoremap_size,
                                                 self.sigma,
                                                 valid_vec=keypoint_vis21)

    if self.scoremap_dropout:
        scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                                 noise_shape=[1, 1, 21])
        scoremap *= self.scoremap_dropout_prob

    data_dict['scoremap'] = scoremap

    if self.random_crop_to_size:
        tensor_stack = tf.concat([data_dict['image'],
                                  tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
                                  tf.cast(data_dict['hand_mask'], tf.float32)], 2)
        s = tensor_stack.get_shape().as_list()
        tensor_stack_cropped = tf.random_crop(tensor_stack,
                                              [self.random_crop_size, self.random_crop_size, s[2]])
        data_dict = dict()  # delete everything else because the random cropping makes the data invalid anyway
        data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = \
            tensor_stack_cropped[:, :, :3], \
            tf.cast(tensor_stack_cropped[:, :, 3], tf.int32), \
            tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

    names, tensors = zip(*data_dict.items())

    if self.shuffle:
        tensors = tf.train.shuffle_batch_join([tensors],
                                              batch_size=self.batch_size,
                                              capacity=100,
                                              min_after_dequeue=50,
                                              enqueue_many=False)
    else:
        tensors = tf.train.batch_join([tensors],
                                      batch_size=self.batch_size,
                                      capacity=100,
                                      enqueue_many=False)

    return dict(zip(names, tensors))
def read_data(self): """Provides images and camera intrinsics.""" with tf.name_scope('data_loading'): with tf.name_scope('enqueue_paths'): seed = random.randint(0, 2**31 - 1) self.file_lists = self.compile_file_list(self.data_dir, self.input_file) image_paths_queue = tf.train.string_input_producer( self.file_lists['image_file_list'], seed=seed, shuffle=self.shuffle, num_epochs=(1 if not self.shuffle else None) ) seg_paths_queue = tf.train.string_input_producer( self.file_lists['segment_file_list'], seed=seed, shuffle=self.shuffle, num_epochs=(1 if not self.shuffle else None)) cam_paths_queue = tf.train.string_input_producer( self.file_lists['cam_file_list'], seed=seed, shuffle=self.shuffle, num_epochs=(1 if not self.shuffle else None)) img_reader = tf.WholeFileReader() _, image_contents = img_reader.read(image_paths_queue) seg_reader = tf.WholeFileReader() _, seg_contents = seg_reader.read(seg_paths_queue) if self.file_extension == 'jpg': image_seq = tf.image.decode_jpeg(image_contents) seg_seq = tf.image.decode_jpeg(seg_contents, channels=3) elif self.file_extension == 'png': image_seq = tf.image.decode_png(image_contents, channels=3) seg_seq = tf.image.decode_png(seg_contents, channels=3) with tf.name_scope('load_intrinsics'): cam_reader = tf.TextLineReader() _, raw_cam_contents = cam_reader.read(cam_paths_queue) rec_def = [] for _ in range(9): rec_def.append([1.0]) raw_cam_vec = tf.decode_csv(raw_cam_contents, record_defaults=rec_def) raw_cam_vec = tf.stack(raw_cam_vec) intrinsics = tf.reshape(raw_cam_vec, [3, 3]) with tf.name_scope('convert_image'): image_seq = self.preprocess_image(image_seq) # Converts to float. if self.random_color: with tf.name_scope('image_augmentation'): image_seq = self.augment_image_colorspace(image_seq) image_stack = self.unpack_images(image_seq) seg_stack = self.unpack_images(seg_seq) if self.flipping_mode != FLIP_NONE: random_flipping = (self.flipping_mode == FLIP_RANDOM) with tf.name_scope('image_augmentation_flip'): image_stack, seg_stack, intrinsics = self.augment_images_flip( image_stack, seg_stack, intrinsics, randomized=random_flipping) if self.random_scale_crop: with tf.name_scope('image_augmentation_scale_crop'): image_stack, seg_stack, intrinsics = self.augment_images_scale_crop( image_stack, seg_stack, intrinsics, self.img_height, self.img_width) with tf.name_scope('multi_scale_intrinsics'): intrinsic_mat = self.get_multi_scale_intrinsics(intrinsics, self.num_scales) intrinsic_mat.set_shape([self.num_scales, 3, 3]) intrinsic_mat_inv = tf.matrix_inverse(intrinsic_mat) intrinsic_mat_inv.set_shape([self.num_scales, 3, 3]) if self.imagenet_norm: im_mean = tf.tile( tf.constant(IMAGENET_MEAN), multiples=[self.seq_length]) im_sd = tf.tile( tf.constant(IMAGENET_SD), multiples=[self.seq_length]) image_stack_norm = (image_stack - im_mean) / im_sd else: image_stack_norm = image_stack with tf.name_scope('batching'): if self.shuffle: (image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv) = tf.train.shuffle_batch( [image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv], batch_size=self.batch_size, num_threads=self.threads, capacity=self.queue_size + QUEUE_BUFFER * self.batch_size, min_after_dequeue=self.queue_size) else: (image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv) = tf.train.batch( [image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv], batch_size=self.batch_size, num_threads=1, capacity=self.queue_size + QUEUE_BUFFER * self.batch_size) return (image_stack, 
image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv)
def render_envmap(self, cubes, cube_centers, cube_side_lengths, cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res, phi_res, r_res): """Render environment map from volumetric lights. Args: cubes: input list of cubes in multiscale volume cube_centers: position of cube centers cube_side_lengths: side lengths of cubes cube_rel_shapes: size of "footprint" of each cube within next coarser cube cube_nest_inds: indices for cube "footprints" ref_pose: c2w pose of ref camera env_pose: c2w pose of environment map camera theta_res: resolution of theta (width) for environment map phi_res: resolution of phi (height) for environment map r_res: number of spherical shells to sample for environment map rendering Returns: An environment map at the input pose """ num_scales = len(cubes) env_c2w = env_pose env2ref = tf.matmul(tf.matrix_inverse(ref_pose), env_c2w) # cube-->sphere resampling all_shells_list = [] all_rad_list = [] for i in range(num_scales): if i == num_scales - 1: # "finest" resolution cube, don't zero out cube_removed = cubes[i] else: # zero out areas covered by finer resolution cubes cube_shape = cubes[i].get_shape().as_list()[1] zm_y, zm_x, zm_z = tf.meshgrid( tf.range(cube_nest_inds[i][0], cube_nest_inds[i][0] + cube_rel_shapes[i]), tf.range(cube_nest_inds[i][1], cube_nest_inds[i][1] + cube_rel_shapes[i]), tf.range(cube_nest_inds[i][2], cube_nest_inds[i][2] + cube_rel_shapes[i]), indexing='ij') inds = tf.stack([zm_y, zm_x, zm_z], axis=-1) updates = tf.to_float(tf.ones_like(zm_x)) zero_mask = 1.0 - tf.scatter_nd( inds, updates, shape=[cube_shape, cube_shape, cube_shape]) cube_removed = zero_mask[tf.newaxis, :, :, :, tf.newaxis] * cubes[i] spheres_i, rad_i = pj.spherical_cubevol_resample( cube_removed, env2ref, cube_centers[i], cube_side_lengths[i], phi_res, theta_res, r_res) all_shells_list.append(spheres_i) all_rad_list.append(rad_i) all_shells = tf.concat(all_shells_list, axis=3) all_rad = tf.concat(all_rad_list, axis=0) all_shells = pj.interleave_shells(all_shells, all_rad) all_shells_envmap = pj.over_composite(all_shells) return all_shells_envmap, all_shells_list
def augment_seqs_ava(raw_frames, num_frame, max_shift, batch_size=2,
                     queue_size=60, num_threads=3, train_height=128,
                     train_width=128, pixel_noise=0.0, mix=True, screen=False,
                     mode='train', to_gray=True):
  """Prepares training sequence batches from the AVA dataset.

  Args:
    raw_frames: input video frames from the AVA dataset
    num_frame: the number of frames in a sequence
    max_shift: the range each image corner point can move
    batch_size: the size of training or testing batches
    queue_size: the queue size of the shuffle buffer
    num_threads: the number of threads of the shuffle buffer
    train_height: the height of the training/testing images
    train_width: the width of the training/testing images
    pixel_noise: the magnitude of additive noise
    mix: whether to mix the magnitude of corner point shifts
    screen: whether to remove highly distorted homographies
    mode: 'train' or 'eval', specifying whether images are prepared for
      training or testing
    to_gray: whether to prepare grayscale or color training images
  Returns:
    a batch of training images and the corresponding ground-truth homographies
  """
  if to_gray:
    output_frames = tf.image.rgb_to_grayscale(raw_frames)
    num_channel = 1
  else:
    output_frames = raw_frames
    num_channel = 3
  frame_height = tf.to_float(tf.shape(output_frames)[1])
  frame_width = tf.to_float(tf.shape(output_frames)[2])

  if mix:
    p = tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32)
    scale = (tf.to_float(tf.greater(p, 0.1)) + tf.to_float(tf.greater(p, 0.2))
             + tf.to_float(tf.greater(p, 0.3))) / 3
  else:
    scale = 1.0
  new_max_shift = max_shift * scale
  rand_shift_base = tf.random_uniform([num_frame, 8],
                                      minval=-new_max_shift,
                                      maxval=new_max_shift,
                                      dtype=tf.float32)
  crop_width = frame_width - 2 * new_max_shift - 1
  crop_height = frame_height - 2 * new_max_shift - 1
  ref_window = tf.to_float(
      tf.stack([0, 0, 0, crop_height - 1, crop_width - 1, 0,
                crop_width - 1, crop_height - 1]))
  if screen:
    new_shift_list = []
    flag_list = []
    hmg_list = []
    src_points = tf.reshape(ref_window, [4, 2])
    for i in range(num_frame):
      dst_points = tf.reshape(rand_shift_base[i] + ref_window + new_max_shift,
                              [4, 2])
      hmg = calc_homography_from_points(src_points, dst_points)
      hmg_list.append(hmg)
    for i in range(num_frame - 1):
      hmg = tf.matmul(tf.matrix_inverse(hmg_list[i + 1]), hmg_list[i])
      shift = homography_to_shifts(hmg, crop_width, crop_height)
      angles = calc_homography_distortion(crop_width, crop_height, shift)
      max_angle = tf.reduce_min(angles)
      flag = tf.to_float(max_angle >= -0.707)
      flag_list.append(flag)
      if i > 0:
        new_shift = rand_shift_base[i] * flag * flag_list[i - 1]
      else:
        new_shift = rand_shift_base[i] * flag
      new_shift_list.append(new_shift)
    new_shift_list.append(rand_shift_base[num_frame - 1]
                          * flag_list[num_frame - 2])
    rand_shift = tf.stack(new_shift_list)
  else:
    rand_shift = rand_shift_base

  mat_scale = tf.diag(tf.stack([crop_width / train_width,
                                crop_height / train_height, 1.0]))
  inv_mat_scale = tf.matrix_inverse(mat_scale)
  hmg_list = []
  frame_list = []
  for i in range(num_frame):
    src_points = tf.reshape(ref_window, [4, 2])
    dst_points = tf.reshape(rand_shift[i] + ref_window + new_max_shift, [4, 2])
    hmg = calc_homography_from_points(src_points, dst_points)
    hmg_list.append(hmg)
    transform = tf.reshape(hmg, [9]) / hmg[2, 2]
    warped = contrib_image.transform(output_frames[i], transform[:8],
                                     'bilinear')
    crop_window = tf.expand_dims(
        tf.stack([0, 0, (crop_height - 1) / (frame_height - 1),
                  (crop_width - 1) / (frame_width - 1)]), 0)
    resized_base = tf.image.crop_and_resize(
        tf.expand_dims(warped, 0), crop_window, [0],
        [train_height, train_width])
    resized = tf.squeeze(resized_base, [0])

    noise_im = tf.truncated_normal(shape=tf.shape(resized), mean=0.0,
                                   stddev=pixel_noise, dtype=tf.float32)
    noise_frame = normalize_image(tf.to_float(resized) + noise_im)
    frame_list.append(noise_frame)
  noise_frames = tf.reshape(
      tf.stack(frame_list, 2),
      (train_height, train_width, num_frame * num_channel))

  label_list = []
  for i in range(num_frame - 1):
    hmg_combine = tf.matmul(tf.matrix_inverse(hmg_list[i + 1]), hmg_list[i])
    hmg_final = tf.matmul(inv_mat_scale, tf.matmul(hmg_combine, mat_scale))
    label = homography_to_shifts(hmg_final, train_width, train_height)
    label_list.append(label)
  labels = tf.reshape(tf.stack(label_list, 0), [(num_frame - 1) * 8])

  if mode == 'train':
    min_after_dequeue = int(queue_size / 3)
  else:
    min_after_dequeue = batch_size * 3
  batch_frames, batch_labels = tf.train.shuffle_batch(
      [noise_frames, labels], batch_size=batch_size, num_threads=num_threads,
      capacity=queue_size, min_after_dequeue=min_after_dequeue,
      enqueue_many=False)

  return tf.cast(batch_frames, tf.float32), tf.cast(batch_labels, tf.float32)
def read_data(self): """Provides images and camera intrinsics.""" with tf.name_scope("data_loading"): with tf.name_scope("enqueue_paths"): seed = random.randint(0, 2**31 - 1) self.file_lists = self.compile_file_list( self.data_dir, self.input_file) image_paths_queue = tf.train.string_input_producer( self.file_lists["image_file_list"], seed=seed, shuffle=self.shuffle, num_epochs=(1 if not self.shuffle else None), ) seg_paths_queue = tf.train.string_input_producer( self.file_lists["segment_file_list"], seed=seed, shuffle=self.shuffle, num_epochs=(1 if not self.shuffle else None), ) img_reader = tf.WholeFileReader() _, image_contents = img_reader.read(image_paths_queue) seg_reader = tf.WholeFileReader() _, seg_contents = seg_reader.read(seg_paths_queue) if self.file_extension == "jpg": image_seq = tf.image.decode_jpeg(image_contents) seg_seq = tf.image.decode_jpeg(seg_contents, channels=3) elif self.file_extension == "png": image_seq = tf.image.decode_png(image_contents, channels=3) seg_seq = tf.image.decode_png(seg_contents, channels=3) with tf.name_scope("load_intrinsics"): intrinsics = tf.random.uniform(shape=(3, 3)) with tf.name_scope("convert_image"): image_seq = self.preprocess_image( image_seq) # Converts to float. if self.random_color: with tf.name_scope("image_augmentation"): image_seq = self.augment_image_colorspace(image_seq) image_stack = self.unpack_images(image_seq) seg_stack = self.unpack_images(seg_seq) if self.flipping_mode != FLIP_NONE: random_flipping = self.flipping_mode == FLIP_RANDOM with tf.name_scope("image_augmentation_flip"): image_stack, seg_stack, intrinsics = self.augment_images_flip( image_stack, seg_stack, intrinsics, randomized=random_flipping) if self.random_scale_crop: with tf.name_scope("image_augmentation_scale_crop"): image_stack, seg_stack, intrinsics = self.augment_images_scale_crop( image_stack, seg_stack, intrinsics, self.img_height, self.img_width, ) with tf.name_scope("multi_scale_intrinsics"): intrinsic_mat = self.get_multi_scale_intrinsics( intrinsics, self.num_scales) intrinsic_mat.set_shape([self.num_scales, 3, 3]) intrinsic_mat_inv = tf.matrix_inverse(intrinsic_mat) intrinsic_mat_inv.set_shape([self.num_scales, 3, 3]) if self.imagenet_norm: im_mean = tf.tile(tf.constant(IMAGENET_MEAN), multiples=[self.seq_length]) im_sd = tf.tile(tf.constant(IMAGENET_SD), multiples=[self.seq_length]) image_stack_norm = (image_stack - im_mean) / im_sd else: image_stack_norm = image_stack with tf.name_scope("batching"): if self.shuffle: ( image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv, ) = tf.train.shuffle_batch( [ image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv, ], batch_size=self.batch_size, num_threads=self.threads, capacity=self.queue_size + QUEUE_BUFFER * self.batch_size, min_after_dequeue=self.queue_size, ) else: ( image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv, ) = tf.train.batch( [ image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv, ], batch_size=self.batch_size, num_threads=1, capacity=self.queue_size + QUEUE_BUFFER * self.batch_size, ) return ( image_stack, image_stack_norm, seg_stack, intrinsic_mat, intrinsic_mat_inv, )
def get(self): """ Provides input data to the graph. """ # calculate size of each record (this lists what is contained in the db and how many bytes are occupied) record_bytes = 2 encoding_bytes = 4 kp_xyz_entries = 3 * self.num_kp record_bytes += encoding_bytes * kp_xyz_entries encoding_bytes = 4 kp_uv_entries = 2 * self.num_kp record_bytes += encoding_bytes * kp_uv_entries cam_matrix_entries = 9 record_bytes += encoding_bytes * cam_matrix_entries image_bytes = self.image_size[0] * self.image_size[1] * 3 record_bytes += image_bytes hand_parts_bytes = self.image_size[0] * self.image_size[1] record_bytes += hand_parts_bytes kp_vis_bytes = self.num_kp record_bytes += kp_vis_bytes """ READ DATA ITEMS""" # Start reader reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes) _, value = reader.read( tf.train.string_input_producer([self.path_to_db])) # decode to floats bytes_read = 0 data_dict = dict() record_bytes_float32 = tf.decode_raw(value, tf.float32) # 1. Read keypoint xyz keypoint_xyz = tf.reshape( tf.slice(record_bytes_float32, [bytes_read // 4], [kp_xyz_entries]), [self.num_kp, 3]) bytes_read += encoding_bytes * kp_xyz_entries # calculate palm coord if not self.use_wrist_coord: palm_coord_l = tf.expand_dims( 0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0) palm_coord_r = tf.expand_dims( 0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0) keypoint_xyz = tf.concat([ palm_coord_l, keypoint_xyz[1:21, :], palm_coord_r, keypoint_xyz[-20:, :] ], 0) data_dict['keypoint_xyz'] = keypoint_xyz # 2. Read keypoint uv keypoint_uv = tf.cast( tf.reshape( tf.slice(record_bytes_float32, [bytes_read // 4], [kp_uv_entries]), [self.num_kp, 2]), tf.int32) bytes_read += encoding_bytes * kp_uv_entries keypoint_uv = tf.cast(keypoint_uv, tf.float32) # calculate palm coord if not self.use_wrist_coord: palm_coord_uv_l = tf.expand_dims( 0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0) palm_coord_uv_r = tf.expand_dims( 0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0) keypoint_uv = tf.concat([ palm_coord_uv_l, keypoint_uv[1:21, :], palm_coord_uv_r, keypoint_uv[-20:, :] ], 0) if self.coord_uv_noise: noise = tf.truncated_normal([42, 2], mean=0.0, stddev=self.coord_uv_noise_sigma) keypoint_uv += noise data_dict['keypoint_uv'] = keypoint_uv # 3. Camera intrinsics cam_mat = tf.reshape( tf.slice(record_bytes_float32, [bytes_read // 4], [cam_matrix_entries]), [3, 3]) bytes_read += encoding_bytes * cam_matrix_entries data_dict['cam_mat'] = cam_mat # decode to uint8 bytes_read += 2 record_bytes_uint8 = tf.decode_raw(value, tf.uint8) # 4. Read image image = tf.reshape( tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]), [self.image_size[0], self.image_size[1], 3]) image = tf.cast(image, tf.float32) bytes_read += image_bytes # subtract mean image = image / 255.0 - 0.5 if self.hue_aug: image = tf.image.random_hue(image, self.hue_aug_max) data_dict['image'] = image # 5. Read mask hand_parts_mask = tf.reshape( tf.slice(record_bytes_uint8, [bytes_read], [hand_parts_bytes]), [self.image_size[0], self.image_size[1]]) hand_parts_mask = tf.cast(hand_parts_mask, tf.int32) bytes_read += hand_parts_bytes data_dict['hand_parts'] = hand_parts_mask hand_mask = tf.greater(hand_parts_mask, 1) bg_mask = tf.logical_not(hand_mask) data_dict['hand_mask'] = tf.cast(tf.stack([bg_mask, hand_mask], 2), tf.int32) # 6. 
# Read visibility
keypoint_vis = tf.reshape(
    tf.slice(record_bytes_uint8, [bytes_read], [kp_vis_bytes]),
    [self.num_kp])
keypoint_vis = tf.cast(keypoint_vis, tf.bool)
bytes_read += kp_vis_bytes

# calculate palm visibility
if not self.use_wrist_coord:
    palm_vis_l = tf.expand_dims(
        tf.logical_or(keypoint_vis[0], keypoint_vis[12]), 0)
    palm_vis_r = tf.expand_dims(
        tf.logical_or(keypoint_vis[21], keypoint_vis[33]), 0)
    keypoint_vis = tf.concat(
        [palm_vis_l, keypoint_vis[1:21], palm_vis_r, keypoint_vis[-20:]], 0)
data_dict['keypoint_vis'] = keypoint_vis

assert bytes_read == record_bytes, "Doesn't add up."

""" DEPENDENT DATA ITEMS: SUBSET of 21 keypoints"""
# figure out dominant hand by analysis of the segmentation mask
one_map, zero_map = tf.ones_like(hand_parts_mask), tf.zeros_like(hand_parts_mask)
cond_l = tf.logical_and(tf.greater(hand_parts_mask, one_map),
                        tf.less(hand_parts_mask, one_map * 18))
cond_r = tf.greater(hand_parts_mask, one_map * 17)
hand_map_l = tf.where(cond_l, one_map, zero_map)
hand_map_r = tf.where(cond_r, one_map, zero_map)
num_px_left_hand = tf.reduce_sum(hand_map_l)
num_px_right_hand = tf.reduce_sum(hand_map_r)

# PRODUCE the 21 subset using the segmentation masks
# We only deal with the more prominent hand for each frame and discard the
# second set of keypoints
kp_coord_xyz_left = keypoint_xyz[:21, :]
kp_coord_xyz_right = keypoint_xyz[-21:, :]
cond_left = tf.logical_and(
    tf.cast(tf.ones_like(kp_coord_xyz_left), tf.bool),
    tf.greater(num_px_left_hand, num_px_right_hand))
kp_coord_xyz21 = tf.where(cond_left, kp_coord_xyz_left, kp_coord_xyz_right)

hand_side = tf.where(
    tf.greater(num_px_left_hand, num_px_right_hand),
    tf.constant(0, dtype=tf.int32),
    tf.constant(1, dtype=tf.int32))  # left hand = 0; right hand = 1
data_dict['hand_side'] = tf.one_hot(
    hand_side, depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32)

data_dict['keypoint_xyz21'] = kp_coord_xyz21

# make coords relative to root joint
kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
index_root_bone_length = tf.sqrt(
    tf.reduce_sum(
        tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
data_dict['keypoint_scale'] = index_root_bone_length
data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

# calculate local coordinates
kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

# calculate viewpoint and coords in canonical coordinates
kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                         tf.logical_not(cond_left))
data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

# Set of 21 for visibility
keypoint_vis_left = keypoint_vis[:21]
keypoint_vis_right = keypoint_vis[-21:]
keypoint_vis21 = tf.where(cond_left[:, 0], keypoint_vis_left, keypoint_vis_right)
data_dict['keypoint_vis21'] = keypoint_vis21

# Set of 21 for UV coordinates
keypoint_uv_left = keypoint_uv[:21, :]
keypoint_uv_right = keypoint_uv[-21:, :]
keypoint_uv21 = tf.where(cond_left[:, :2], keypoint_uv_left, keypoint_uv_right)
data_dict['keypoint_uv21'] = keypoint_uv21

""" DEPENDENT DATA ITEMS: HAND CROP """
if self.hand_crop:
    crop_center = keypoint_uv21[12, ::-1]

    # catch problem, when no valid kp available (happens almost never)
    crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                          lambda: crop_center,
                          lambda: tf.constant([0.0, 0.0]))
    crop_center.set_shape([2, ])

    if self.crop_center_noise:
        noise = tf.truncated_normal([2], mean=0.0,
                                    stddev=self.crop_center_noise_sigma)
        crop_center += noise

    crop_scale_noise = tf.constant(1.0)
    if self.crop_scale_noise:
        crop_scale_noise = tf.squeeze(
            tf.random_uniform([1], minval=1.0, maxval=1.2))

    # select visible coords only
    kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
    kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
    kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

    # determine size of crop (measure spatial extent of hw coords first)
    min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
    max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

    # find out larger distance w.r.t. the center of crop
    crop_size_best = 2 * tf.maximum(max_coord - crop_center,
                                    crop_center - min_coord)
    crop_size_best = tf.reduce_max(crop_size_best)
    crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

    # catch problem, when no valid kp available
    crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)),
                             lambda: crop_size_best,
                             lambda: tf.constant(200.0))
    crop_size_best.set_shape([])

    # calculate necessary scaling
    scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
    scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
    scale *= crop_scale_noise
    data_dict['crop_scale'] = scale

    if self.crop_offset_noise:
        noise = tf.truncated_normal([2], mean=0.0,
                                    stddev=self.crop_offset_noise_sigma)
        crop_center += noise

    # Crop image
    img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center,
                                  self.crop_size, scale)
    data_dict['image_crop'] = tf.squeeze(img_crop)

    # Modify uv21 coordinates
    crop_center_float = tf.cast(crop_center, tf.float32)
    keypoint_uv21_u = (keypoint_uv21[:, 0] -
                       crop_center_float[1]) * scale + self.crop_size // 2
    keypoint_uv21_v = (keypoint_uv21[:, 1] -
                       crop_center_float[0]) * scale + self.crop_size // 2
    keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
    data_dict['keypoint_uv21'] = keypoint_uv21

    # Modify camera intrinsics
    scale = tf.reshape(scale, [1, ])
    scale_matrix = tf.dynamic_stitch(
        [[0], [1], [2], [3], [4], [5], [6], [7], [8]],
        [scale, [0.0], [0.0], [0.0], scale, [0.0], [0.0], [0.0], [1.0]])
    scale_matrix = tf.reshape(scale_matrix, [3, 3])

    crop_center_float = tf.cast(crop_center, tf.float32)
    trans1 = crop_center_float[0] * scale - self.crop_size // 2
    trans2 = crop_center_float[1] * scale - self.crop_size // 2
    trans1 = tf.reshape(trans1, [1, ])
    trans2 = tf.reshape(trans2, [1, ])
    trans_matrix = tf.dynamic_stitch(
        [[0], [1], [2], [3], [4], [5], [6], [7], [8]],
        [[1.0], [0.0], -trans2, [0.0], [1.0], -trans1, [0.0], [0.0], [1.0]])
    trans_matrix = tf.reshape(trans_matrix, [3, 3])
    data_dict['cam_mat'] = tf.matmul(trans_matrix,
                                     tf.matmul(scale_matrix, cam_mat))

""" DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints"""
# create scoremaps from the subset of 2D annotation
keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

scoremap_size = self.image_size
if self.hand_crop:
    scoremap_size = (self.crop_size, self.crop_size)

scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                             scoremap_size,
                                             self.sigma,
                                             valid_vec=keypoint_vis21)

if self.scoremap_dropout:
    scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                             noise_shape=[1, 1, 21])
    scoremap *= self.scoremap_dropout_prob

data_dict['scoremap'] = scoremap

if self.scale_to_size:
    image, keypoint_uv21, keypoint_vis21 = data_dict['image'], \
        data_dict['keypoint_uv21'], data_dict['keypoint_vis21']
    s = image.get_shape().as_list()
    image = tf.image.resize_images(image, self.scale_target_size)
    scale = (self.scale_target_size[0] / float(s[0]),
             self.scale_target_size[1] / float(s[1]))
    keypoint_uv21 = tf.stack(
        [keypoint_uv21[:, 0] * scale[1], keypoint_uv21[:, 1] * scale[0]], 1)

    # delete everything else because the scaling makes the data invalid anyway
    data_dict = dict()
    data_dict['image'] = image
    data_dict['keypoint_uv21'] = keypoint_uv21
    data_dict['keypoint_vis21'] = keypoint_vis21

elif self.random_crop_to_size:
    tensor_stack = tf.concat(
        [data_dict['image'],
         tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
         tf.cast(data_dict['hand_mask'], tf.float32)], 2)
    s = tensor_stack.get_shape().as_list()
    tensor_stack_cropped = tf.random_crop(
        tensor_stack, [self.random_crop_size, self.random_crop_size, s[2]])

    # delete everything else because the random cropping makes the data
    # invalid anyway
    data_dict = dict()
    data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = \
        tensor_stack_cropped[:, :, :3], \
        tf.cast(tensor_stack_cropped[:, :, 3], tf.int32), \
        tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

names, tensors = zip(*data_dict.items())

if self.shuffle:
    tensors = tf.train.shuffle_batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          min_after_dequeue=50,
                                          enqueue_many=False)
else:
    tensors = tf.train.batch_join([tensors],
                                  batch_size=self.batch_size,
                                  capacity=100,
                                  enqueue_many=False)

return dict(zip(names, tensors))
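# A hedged sketch (not from the original reader): tf.train.shuffle_batch_join /
# tf.train.batch_join belong to the legacy queue-runner input pipeline. The
# same shuffle-and-batch step under TF 1.x tf.data would look roughly like
# this; "dataset_of_dicts" is a hypothetical tf.data.Dataset yielding elements
# shaped like data_dict, and the buffer size mirrors capacity=100 /
# min_after_dequeue=50 above.
def batch_with_tf_data(dataset_of_dicts, batch_size, shuffle=True):
    if shuffle:
        dataset_of_dicts = dataset_of_dicts.shuffle(buffer_size=100)
    dataset_of_dicts = dataset_of_dicts.batch(batch_size)
    # Returns a dict of batched tensors, analogous to dict(zip(names, tensors)).
    return dataset_of_dicts.make_one_shot_iterator().get_next()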
# create a list of correct answers to allow for iteration when checking predictions
t_test_list = X_test['Outcome']
del X_test['Outcome']   # remove the label column
del X_train['Outcome']  # remove the label column

n_train, m = X_train.shape
n_test, m = X_test.shape

# define the tensors
X = tf.placeholder(tf.float64, shape=(None, m), name='X')  # input feature vectors
t = tf.placeholder(tf.float64, shape=(None, 1), name='t')  # target values
n = tf.placeholder(tf.float64, name='n')                   # number of samples

XT = tf.transpose(X)
w = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), t)  # w = inv(X'*X)*X'*t

# predicted value
y = tf.matmul(X, w)

# mean squared error of the prediction on the training set
MSE = tf.div(tf.matmul(tf.transpose(y - t), y - t), n)

w_star = tf.placeholder(tf.float64, shape=(m, 1), name='w_star')
y_test = tf.matmul(X, w_star)

with tf.Session() as sess:
    # run the TensorFlow session
    MSE_train_val, w_val = sess.run(
        [MSE, w], feed_dict={X: X_train, t: t_train, n: n_train})
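# A hedged numerical note (ours, not part of the snippet above): forming the
# explicit inverse of X'X is fragile when X'X is ill-conditioned. A linear
# solve, or a dedicated least-squares op, computes the same w more stably:
w_solve = tf.matrix_solve(tf.matmul(XT, X), tf.matmul(XT, t))  # solves (X'X) w = X't
w_lstsq = tf.linalg.lstsq(X, t, l2_regularizer=0.0)            # direct least squares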
def posdef_inv_matrix_inverse(tensor, identity, damping):
  """Computes inverse(tensor + damping * identity) directly."""
  return tf.matrix_inverse(tensor + damping * identity)
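# A hedged alternative sketch: when "tensor" is symmetric positive definite,
# a Cholesky solve is generally cheaper and better conditioned than a
# general-purpose inverse. The helper name below is illustrative.
def posdef_inv_cholesky(tensor, identity, damping):
  """Computes inverse(tensor + damping * identity) via a Cholesky solve."""
  chol = tf.cholesky(tensor + damping * identity)
  return tf.cholesky_solve(chol, identity)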
def build_train_graph(self,
                      inputs,
                      min_depth,
                      max_depth,
                      num_mpi_planes,
                      learning_rate=0.0002,
                      beta1=0.9,
                      vgg_model_file=None,
                      global_step=0):
  """Construct the training computation graph.

  Args:
    inputs: dictionary of tensors (see 'input_data' below) needed for training
    min_depth: minimum depth for the PSV and MPI planes
    max_depth: maximum depth for the PSV and MPI planes
    num_mpi_planes: number of MPI planes to infer
    learning_rate: learning rate
    beta1: hyperparameter for Adam
    vgg_model_file: path to vgg weights (needed when vgg loss is used)
    global_step: current optimization step

  Returns:
    A train_op to be used for training.
  """
  print("starting to build graph")

  with tf.name_scope("input_size_randomization"):
    dim_choices = tf.constant([[1, 16], [2, 32], [4, 32], [4, 64], [4, 128],
                               [8, 32], [8, 64], [8, 128]],
                              dtype=tf.int32)
    rand_dim = tf.random_shuffle(dim_choices)[0, :]
    height_div = rand_dim[0]
    width_div = rand_dim[0]
    num_mpi_planes = rand_dim[1]
    tf.summary.scalar("num_mpi_planes", num_mpi_planes)

  with tf.name_scope("setup"):
    mpi_planes = self.inv_depths(min_depth, max_depth, num_mpi_planes)

  with tf.name_scope("input_data"):
    raw_tgt_image = inputs["tgt_image"]
    raw_ref_image = inputs["ref_image"]
    raw_src_images = inputs["src_images"]

    _, img_height, img_width, _ = raw_src_images.get_shape().as_list()
    img_height = img_height // height_div
    img_width = img_width // width_div

    raw_tgt_image = tf.image.convert_image_dtype(raw_tgt_image, dtype=tf.float32)
    raw_ref_image = tf.image.convert_image_dtype(raw_ref_image, dtype=tf.float32)
    raw_src_images = tf.image.convert_image_dtype(raw_src_images, dtype=tf.float32)
    raw_tgt_image = tf.image.resize_area(raw_tgt_image, [img_height, img_width])
    raw_ref_image = tf.image.resize_area(raw_ref_image, [img_height, img_width])
    raw_src_images = tf.image.resize_area(raw_src_images, [img_height, img_width])

    tgt_pose = inputs["tgt_pose"]
    ref_pose = inputs["ref_pose"]
    src_poses = inputs["src_poses"]
    intrinsics = inputs["intrinsics"]

    # Scale intrinsics based on size randomization
    intrinsics = tf.concat([
        intrinsics[:, 0:1, :] / tf.to_float(width_div),
        intrinsics[:, 1:2, :] / tf.to_float(height_div),
        intrinsics[:, 2:3, :]
    ], axis=1)
    inputs["intrinsics"] = intrinsics

    _, num_source, _, _ = src_poses.get_shape().as_list()

  with tf.name_scope("inference"):
    print("setting up MPI inference")
    num_mpi_planes = tf.shape(mpi_planes)[0]
    pred = self.infer_mpi(raw_src_images, raw_ref_image, ref_pose, src_poses,
                          intrinsics, num_mpi_planes, mpi_planes)
    rgba_layers = pred["rgba_layers"]
    rgba_layers_refine = pred["rgba_layers_refine"]
    stuff_behind = pred["stuff_behind"]
    refine_input_mpi = pred["refine_input_mpi"]
    psv = pred["psv"]

  with tf.name_scope("synthesis"):
    print("setting up rendering")
    rel_pose = tf.matmul(tgt_pose, tf.matrix_inverse(ref_pose))
    output_image, output_layers = self.mpi_render_view(
        rgba_layers, rel_pose, mpi_planes, intrinsics)
    output_alpha = output_layers[Ellipsis, -1]
    output_image_refine, _ = self.mpi_render_view(
        rgba_layers_refine, rel_pose, mpi_planes, intrinsics)

  with tf.name_scope("loss"):
    print("computing losses")
    # Mask loss for pixels outside reference frustum
    loss_mask = tf.where(
        tf.equal(
            tf.reduce_min(
                tf.abs(tf.reduce_sum(output_layers, axis=-1)),
                axis=3,
                keep_dims=True), 0.0),
        tf.zeros_like(output_alpha[:, :, :, 0:1]),
        tf.ones_like(output_alpha[:, :, :, 0:1]))
    loss_mask = tf.stop_gradient(loss_mask)
    tf.summary.image("loss_mask", loss_mask)

    # Helper functions for loss
    def compute_error(real, fake, mask):
      return tf.reduce_mean(mask * tf.abs(fake - real))

    # Normalized VGG loss (from
    # https://github.com/CQFIO/PhotographicImageSynthesis)
    downsample = lambda tensor, ds: tf.nn.avg_pool(tensor, [1, ds, ds, 1],
                                                   [1, ds, ds, 1], "SAME")

    def vgg_loss(raw_tgt_image, output_image, loss_mask):
      """Compute VGG loss."""
      vgg_real = build_vgg19(raw_tgt_image * 255.0, vgg_model_file)
      rescaled_output_image = (output_image + 1.) / 2. * 255.0
      vgg_fake = build_vgg19(rescaled_output_image, vgg_model_file, reuse=True)
      p0 = compute_error(vgg_real["input"], vgg_fake["input"], loss_mask)
      p1 = compute_error(vgg_real["conv1_2"], vgg_fake["conv1_2"],
                         loss_mask) / 2.6
      p2 = compute_error(vgg_real["conv2_2"], vgg_fake["conv2_2"],
                         downsample(loss_mask, 2)) / 4.8
      p3 = compute_error(vgg_real["conv3_2"], vgg_fake["conv3_2"],
                         downsample(loss_mask, 4)) / 3.7
      p4 = compute_error(vgg_real["conv4_2"], vgg_fake["conv4_2"],
                         downsample(loss_mask, 8)) / 5.6
      p5 = compute_error(vgg_real["conv5_2"], vgg_fake["conv5_2"],
                         downsample(loss_mask, 16)) * 10 / 1.5
      total_loss = p0 + p1 + p2 + p3 + p4 + p5
      return total_loss, vgg_real, vgg_fake

    vgg_loss_initial, _, _ = vgg_loss(raw_tgt_image, output_image, loss_mask)
    tf.summary.scalar("vgg_loss_initial", vgg_loss_initial)
    total_loss = vgg_loss_initial

    vgg_loss_refine, _, _ = vgg_loss(raw_tgt_image, output_image_refine,
                                     loss_mask)
    tf.summary.scalar("vgg_loss_refine", vgg_loss_refine)
    total_loss += vgg_loss_refine

  with tf.name_scope("train_op"):
    print("setting up train op")
    train_vars = [var for var in tf.trainable_variables()]
    optim = tf.train.AdamOptimizer(learning_rate, beta1)
    grads_and_vars = optim.compute_gradients(total_loss, var_list=train_vars)
    train_op = [optim.apply_gradients(grads_and_vars)]

  # Summaries
  tf.summary.scalar("total_loss", total_loss)
  # Source images
  for i in range(num_source):
    src_image = raw_src_images[:, :, :, i * 3:(i + 1) * 3]
    tf.summary.image("src_image_%d" % i, src_image)
  # Output image
  tf.summary.image("output_image", self.deprocess_image(output_image))
  # Refined output image
  tf.summary.image("output_image_refine",
                   self.deprocess_image(output_image_refine))
  # Target image
  tf.summary.image("tgt_image", raw_tgt_image)
  # Ref image
  tf.summary.image("ref_image", raw_ref_image)
  # Predicted color and alpha layers, and PSV
  num_summ = 16  # Number of plane summaries to show in tensorboard
  for i in range(num_summ):
    ind = tf.to_int32(i * num_mpi_planes / num_summ)
    rgb = rgba_layers[:, :, :, ind, :3]
    alpha = rgba_layers[:, :, :, ind, -1:]
    ref_plane = psv[:, :, :, ind, 3:6]
    source_plane = psv[:, :, :, ind, :3]
    output_rgb = output_layers[:, :, :, ind, :3]
    tf.summary.image("rgb_layer_%d" % i, self.deprocess_image(rgb))
    tf.summary.image("alpha_layer_%d" % i, alpha)
    tf.summary.image("rgba_layer_%d" % i, self.deprocess_image(rgb * alpha))
    tf.summary.image("psv_avg_%d" % i,
                     self.deprocess_image(0.5 * ref_plane + 0.5 * source_plane))
    tf.summary.image("output_rgb_%d" % i, self.deprocess_image(output_rgb))
    tf.summary.image("psv_ref_%d" % i, self.deprocess_image(ref_plane))
    tf.summary.image("psv_source_%d" % i, self.deprocess_image(source_plane))
  # Cumulative rendered images and refined MPI
  for i in range(num_summ):
    ind = tf.to_int32(i * num_mpi_planes / num_summ)
    rgb = rgba_layers_refine[:, :, :, ind, :3]
    alpha = rgba_layers_refine[:, :, :, ind, 3:]
    render = stuff_behind[:, :, :, ind, :3]
    input_colors = refine_input_mpi[:, :, :, ind, :3]
    tf.summary.image("rgb_layer_refine_%d" % i, self.deprocess_image(rgb))
    tf.summary.image("alpha_layer_refine_%d" % i, alpha)
    tf.summary.image("rgba_layer_refine_%d" % i,
                     self.deprocess_image(rgb * alpha))
    tf.summary.image("cumulative_render_%d" % i, self.deprocess_image(render))
    tf.summary.image("input_colors_refine_%d" % i,
                     self.deprocess_image(input_colors))

  return train_op
def overlap_mask(depth1, pose1_c2w, depth2, pose2_c2w, intrinsics):
  """Compute the overlap masks of two views using triangulation.

  The masks have the same shape as the input images. A pixel value is true if
  it can be seen by both cameras.

  Args:
    depth1: [HEIGHT, WIDTH, 1] the depth map of the first view.
    pose1_c2w: [3, 4] camera pose matrix (camera to world) of the first view.
      pose1_c2w[:, :3] is the rotation and pose1_c2w[:, -1] is the translation.
    depth2: [HEIGHT, WIDTH, 1] the depth map of the second view.
    pose2_c2w: [3, 4] camera pose matrix (camera to world) of the second view.
      pose2_c2w[:, :3] is the rotation and pose2_c2w[:, -1] is the translation.
    intrinsics: [3, 3] camera's intrinsic matrix.

  Returns:
    [HEIGHT, WIDTH] two overlap masks of the two inputs respectively.
  """
  pose1_w2c = tf.matrix_inverse(
      tf.concat([pose1_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3]
  pose2_w2c = tf.matrix_inverse(
      tf.concat([pose2_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3]

  p_world1 = image_to_world_projection(depth1, intrinsics, pose1_c2w)
  p_image1_in_2, z1_c2 = world_to_image_projection(p_world1, intrinsics,
                                                   pose2_w2c)

  p_world2 = image_to_world_projection(depth2, intrinsics, pose2_c2w)
  p_image2_in_1, z2_c1 = world_to_image_projection(p_world2, intrinsics,
                                                   pose1_w2c)

  shape = depth1.shape.as_list()
  height, width = shape[0], shape[1]
  height = tf.cast(height, tf.float32)
  width = tf.cast(width, tf.float32)
  # Error tolerance.
  eps = 1e-4
  # Check that the object seen by camera 2 also projects onto camera 1's
  # image plane and lies in front of camera 1.
  mask_h2_in_1 = tf.logical_and(
      tf.less_equal(p_image2_in_1[:, :, 1], height + eps),
      tf.greater_equal(p_image2_in_1[:, :, 1], 0. - eps))
  mask_w2_in_1 = tf.logical_and(
      tf.less_equal(p_image2_in_1[:, :, 0], width + eps),
      tf.greater_equal(p_image2_in_1[:, :, 0], 0. - eps))
  # Check that the projected points are within the image boundaries and in
  # front of the camera.
  mask2_in_1 = tf.logical_and(
      tf.logical_and(mask_h2_in_1, mask_w2_in_1),
      tf.squeeze(z2_c1, -1) > 0)

  # Check that the object seen by camera 1 also projects onto camera 2's
  # image plane and lies in front of camera 2.
  mask_h1_in_2 = tf.logical_and(
      tf.less_equal(p_image1_in_2[:, :, 1], height + eps),
      tf.greater_equal(p_image1_in_2[:, :, 1], 0. - eps))
  mask_w1_in_2 = tf.logical_and(
      tf.less_equal(p_image1_in_2[:, :, 0], width + eps),
      tf.greater_equal(p_image1_in_2[:, :, 0], 0. - eps))
  # Check that the projected points are within the image boundaries and in
  # front of the camera.
  mask1_in_2 = tf.logical_and(
      tf.logical_and(mask_h1_in_2, mask_w1_in_2),
      tf.squeeze(z1_c2, -1) > 0)

  return mask1_in_2, mask2_in_1
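# A hypothetical usage sketch (shapes only; image_to_world_projection and
# world_to_image_projection are assumed to be in scope). Two cameras share
# the intrinsics K, with the second camera translated 0.1 along x:
depth_a = tf.ones([240, 320, 1])
depth_b = tf.ones([240, 320, 1])
pose_a = tf.constant([[1., 0., 0., 0.],
                      [0., 1., 0., 0.],
                      [0., 0., 1., 0.]])  # camera-to-world, [3, 4]
pose_b = tf.constant([[1., 0., 0., 0.1],
                      [0., 1., 0., 0.],
                      [0., 0., 1., 0.]])
K = tf.constant([[160., 0., 160.],
                 [0., 160., 120.],
                 [0., 0., 1.]])
mask_a, mask_b = overlap_mask(depth_a, pose_a, depth_b, pose_b, K)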
import tensorflow.compat.v1 as tf
import numpy as np
from tensorflow.python.framework.ops import disable_eager_execution

disable_eager_execution()

A = tf.placeholder(dtype=tf.float64, shape=[2, 2])
b = tf.placeholder(dtype=tf.float64, shape=[2])

# Using matrix functions
A_pow = tf.sin(A)
A_relu = tf.nn.relu(A)
A_inverse = tf.matrix_inverse(A)
A_T = tf.transpose(A)
b_diag = tf.diag(b)
I = tf.eye(6)

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

print('\n-------------A_pow-----------------------')
print(sess.run(A_pow, feed_dict={A: [[1, 2], [-1, 1]], b: [1, 1]}))
print(sess.run(tf.sin(tf.constant([[1, 2], [-1, 1]], dtype=tf.float64))))
print('\n------------------------------------')
print(sess.run(A_relu, feed_dict={A: [[1, 2], [-1, 1]],