def _build_motion_graph(self): """ Motion graph updates poses using depth as input """ self.motion_net = MotionNetwork( self.cfg.MOTION, mode='global', # use global optimization mode is_training=False) images = self.images_placeholder[tf.newaxis] depths = self.depths_placeholder[tf.newaxis] poses = self.poses_placeholder[tf.newaxis] intrinsics = self.intrinsics_placeholder[tf.newaxis] edge_inds = tf.unstack(self.edges_placeholder, num=2, axis=-1) # convert pose matricies into SE3 object Ts = VideoSE3Transformation(matrix=poses) batch, num = Ts.shape() Ts, intrinsics = self.motion_net.forward( Ts, images, depths, intrinsics, inds=edge_inds, num_fixed=self.fixed_placeholder) # convert SE3 object back to matrix representation self.outputs['poses'] = tf.squeeze(Ts.matrix(), 0) self.outputs['intrinsics'] = intrinsics
class DeepV2D:
    def __init__(self, cfg, ckpt, is_calibrated=True, use_fcrn=False,
                 use_regressor=True, image_dims=None, mode='keyframe'):
        self.cfg = cfg
        self.ckpt = ckpt
        self.mode = mode

        self.use_fcrn = use_fcrn
        self.use_regressor = use_regressor
        self.is_calibrated = is_calibrated

        if image_dims is not None:
            self.image_dims = image_dims
        else:
            if cfg.STRUCTURE.MODE == 'concat':
                self.image_dims = [cfg.INPUT.FRAMES, cfg.INPUT.HEIGHT, cfg.INPUT.WIDTH]
            else:
                self.image_dims = [None, cfg.INPUT.HEIGHT, cfg.INPUT.WIDTH]

        self.outputs = {}
        self._create_placeholders()
        self._build_motion_graph()
        self._build_depth_graph()
        self._build_reprojection_graph()
        self._build_visibility_graph()
        self._build_point_cloud_graph()

        self.depths = []
        self.poses = []

        if self.use_fcrn:
            self._build_fcrn_graph()

        self.saver = tf.train.Saver(tf.model_variables())

    def set_session(self, sess):
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        self.saver.restore(self.sess, self.ckpt)

        if self.use_fcrn:
            fcrn_vars = {}
            for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="FCRN"):
                fcrn_vars[var.name.replace('FCRN/', '').replace(':0', '')] = var

            fcrn_saver = tf.train.Saver(fcrn_vars)
            fcrn_saver.restore(sess, 'models/NYU_FCRN.ckpt')

    def _create_placeholders(self):
        frames, ht, wd = self.image_dims
        self.images_placeholder = tf.placeholder(tf.float32, [frames, ht, wd, 3])

        if self.mode == 'keyframe':
            self.depths_placeholder = tf.placeholder(tf.float32, [1, ht, wd])
        else:
            self.depths_placeholder = tf.placeholder(tf.float32, [frames, ht, wd])

        self.poses_placeholder = tf.placeholder(tf.float32, [frames, 4, 4])
        self.intrinsics_placeholder = tf.placeholder(tf.float32, [4])
        self.init_placeholder = tf.placeholder(tf.bool, [])

        # placeholders for storing graph adj_list and edges
        self.edges_placeholder = tf.placeholder(tf.int32, [None, 2])
        self.adj_placeholder = tf.placeholder(tf.int32, [None, None])

    def _build_motion_graph(self):
        self.motion_net = MotionNetwork(self.cfg.MOTION, mode=self.mode,
                                        use_regressor=self.use_regressor,
                                        is_calibrated=self.is_calibrated,
                                        is_training=False)

        images = self.images_placeholder[tf.newaxis]
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        do_init = self.init_placeholder

        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        edge_inds = tf.unstack(self.edges_placeholder, num=2, axis=-1)

        # convert pose matrices into an SE3 object
        Ts = VideoSE3Transformation(matrix=poses)

        Ts, intrinsics = self.motion_net.forward(Ts, images, depths, intrinsics,
                                                 edge_inds, init=do_init)

        self.outputs['poses'] = tf.squeeze(Ts.matrix(), 0)
        self.outputs['intrinsics'] = intrinsics[0]
        self.outputs['weights'] = self.motion_net.weights_history[-1]

    def _build_depth_graph(self):
        self.depth_net = DepthNetwork(self.cfg.STRUCTURE, is_training=False)
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        # convert pose matrices into an SE3 object
        Ts = VideoSE3Transformation(matrix=poses)

        adj_list = None
        if self.mode == 'global':
            adj_list = self.adj_placeholder

        depths = self.depth_net.forward(Ts, images, intrinsics, adj_list)
        self.outputs['depths'] = depths

    def _build_point_cloud_graph(self):
        """ Use poses and depth maps to create point cloud """
        depths = self.depths_placeholder[tf.newaxis]
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths_pad = tf.pad(depths, [[0, 0], [0, 0], [0, 1], [0, 1]], "CONSTANT")

        depths_grad = \
            (depths_pad[:, :, 1:, :-1] - depths_pad[:, :, :-1, :-1])**2 + \
            (depths_pad[:, :, :-1, 1:] - depths_pad[:, :, :-1, :-1])**2

        # don't use large depths for point cloud and ignore boundary regions
        valid = (depths < 5.0) & (depths_grad < 0.01)

        # depths, intrinsics = rescale_depths_and_intrinsics(depths, intrinsics, downscale=4)
        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)

        ii, jj = tf.meshgrid(tf.range(1), tf.range(0, num))
        ii = tf.reshape(ii, [-1])
        jj = tf.reshape(jj, [-1])

        Ts = VideoSE3Transformation(matrix=poses)
        X0 = projective_ops.backproject(depths, intrinsics)

        # transform point cloud into coordinate system defined by first frame
        X1 = (Ts.gather(ii) * Ts.gather(jj).inv())(X0)

        crop_h = 12
        crop_w = 32

        X1 = X1[:, :, crop_h:-crop_h, crop_w:-crop_w]
        valid = valid[:, :, crop_h:-crop_h, crop_w:-crop_w]
        images = images[:, :, crop_h:-crop_h, crop_w:-crop_w, ::-1]

        X1 = tf.reshape(X1, [-1, 3])
        colors = tf.reshape(images, [-1, 3])

        valid_inds = tf.where(tf.reshape(valid, [-1]))
        valid_inds = tf.reshape(valid_inds, [-1])

        X1 = tf.gather(X1, valid_inds, axis=0)
        colors = tf.gather(colors, valid_inds, axis=0)

        self.outputs['point_cloud'] = (X1, colors)

    def _build_reprojection_graph(self):
        """ Used to project depth from keyframes onto new frame """
        EPS = 1e-8
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)
        Ts = VideoSE3Transformation(matrix=poses)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        ii, jj = tf.meshgrid(tf.range(0, num), tf.range(num, num + 1))
        ii = tf.reshape(ii, [-1])
        jj = tf.reshape(jj, [-1])

        Tij = Ts.gather(jj) * Ts.gather(ii).inv()
        X0 = projective_ops.backproject(depths, intrinsics)
        X1 = Tij(X0)

        coords = projective_ops.project(X1, intrinsics)
        depths = X1[..., 2]

        indices = tf.cast(coords[..., ::-1] + .5, tf.int32)
        indices = tf.reshape(indices, [-1, 2])
        depths = tf.reshape(depths, [-1])

        depth = tf.scatter_nd(indices, depths, [ht, wd])
        count = tf.scatter_nd(indices, tf.ones_like(depths), [ht, wd])

        depth = depth / (count + EPS)
        self.outputs['depth_reprojection'] = depth

    def _build_visibility_graph(self):
        """ Find induced optical flow between pairs of frames """
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        Ts = VideoSE3Transformation(matrix=poses)
        ii, jj = tf.unstack(self.edges_placeholder, num=2, axis=-1)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths, intrinsics = rescale_depths_and_intrinsics(depths, intrinsics, downscale=4)
        ht = tf.cast(tf.shape(depths)[2], tf.float32)
        wd = tf.cast(tf.shape(depths)[3], tf.float32)

        depths = tf.gather(depths, ii, axis=1)
        Tij = Ts.gather(jj) * Ts.gather(ii).inv()

        flow = Tij.induced_flow(depths, intrinsics)
        coords = Tij.transform(depths, intrinsics)

        flo_graph = tf.sqrt(tf.reduce_sum(flow**2, axis=-1))
        flo_graph = tf.reduce_mean(flo_graph, [-1, -2])

        contained = tf.to_float(
            (coords[..., 0] > 0.0) & (coords[..., 0] < wd) &
            (coords[..., 1] > 0.0) & (coords[..., 1] < ht))

        vis_graph = tf.reduce_mean(contained, [-1, -2])
        self.outputs['visibility'] = (flo_graph[0], vis_graph[0], flow)

    def _build_fcrn_graph(self):
        """ Build single image initialization graph """
        images = self.images_placeholder
        batch, ht, wd, _ = tf.unstack(tf.shape(images), num=4)

        with tf.variable_scope("FCRN") as scope:
            # crop out border and flip color channels
            fcrn_input = tf.image.resize_area(images[:, 4:-4, 6:-6, ::-1], [228, 304])

            net = fcrn.ResNet50UpProj({'data': fcrn_input}, batch, 1, False)
            fcrn_output = tf.stop_gradient(net.get_output())
            fcrn_output = tf.image.resize_bilinear(fcrn_output, [ht, wd])

        self.outputs['fcrn'] = tf.squeeze(fcrn_output, -1)

    def compute_visibility_matrix(self):
        """ Computes matrices of optical flow and visibility between all pairs of frames
        Ex. flo_matrix[i, j] is the mean optical flow between camera i and camera j
        Ex. vis_matrix[i, j] is the portion of points in camera i visible in camera j
        """
        num = len(self.images)
        ii, jj = np.meshgrid(np.arange(num), np.arange(num))
        ii = np.reshape(ii, [-1])
        jj = np.reshape(jj, [-1])
        edges = np.stack([jj, ii], axis=-1)

        feed_dict = {
            self.depths_placeholder: self.depths,
            self.poses_placeholder: self.poses,
            self.edges_placeholder: edges,
            self.intrinsics_placeholder: self.intrinsics
        }

        flo_graph, vis_graph, flow = self.sess.run(self.outputs['visibility'],
                                                   feed_dict=feed_dict)
        flo_matrix = flo_graph.reshape(num, num)
        vis_matrix = vis_graph.reshape(num, num)
        return flo_matrix, vis_matrix, flow

    def reproject_depth(self, query_pose):
        """ Use depth estimates and poses to estimate depth map at a new camera location """
        poses = np.concatenate([self.poses, query_pose[np.newaxis]], axis=0)

        feed_dict = {
            self.depths_placeholder: self.depths,
            self.poses_placeholder: poses,
            self.intrinsics_placeholder: self.intrinsics
        }

        depth = self.sess.run(self.outputs['depth_reprojection'], feed_dict=feed_dict)
        return fill_depth(depth)

    def deepv2d_init(self):
        if self.use_fcrn:
            if self.mode == 'keyframe':
                feed_dict = {self.images_placeholder: self.images[[0]]}
            else:
                feed_dict = {self.images_placeholder: self.images}

            self.depths = self.sess.run(self.outputs['fcrn'], feed_dict=feed_dict)

        else:
            if self.mode == 'keyframe':
                images = np.stack([self.images[0]] * self.images.shape[0], axis=0)
                poses = np.stack([np.eye(4)] * self.images.shape[0], axis=0)

                feed_dict = {
                    self.images_placeholder: images,
                    self.poses_placeholder: poses,
                    self.intrinsics_placeholder: self.intrinsics
                }

            else:
                ii = np.arange(self.images.shape[0])
                adj = np.stack([ii, ii], axis=-1)

                feed_dict = {
                    self.images_placeholder: self.images,
                    self.poses_placeholder: self.poses,
                    self.adj_placeholder: adj,
                    self.intrinsics_placeholder: self.intrinsics
                }

            self.depths = self.sess.run(self.outputs['depths'], feed_dict=feed_dict)

    def update_poses(self, itr=0):
        n = self.images.shape[0]

        if self.mode == 'keyframe':
            ii, jj = np.meshgrid(np.arange(1), np.arange(1, n))
        else:
            ii, jj = np.meshgrid(np.arange(n), np.arange(n))

        ii = ii.reshape(-1)
        jj = jj.reshape(-1)

        v = ~np.equal(ii, jj)  # don't use pairs with self loop
        edges = np.stack([ii[v], jj[v]], axis=-1)

        feed_dict = {
            self.images_placeholder: self.images,
            self.depths_placeholder: self.depths,
            self.poses_placeholder: self.poses,
            self.edges_placeholder: edges,
            self.init_placeholder: (itr == 0),
            self.intrinsics_placeholder: self.intrinsics
        }

        # execute pose subgraph
        outputs = [self.outputs['poses'], self.outputs['intrinsics'], self.outputs['weights']]
        self.poses, self.intrinsics, self.weights = self.sess.run(outputs, feed_dict=feed_dict)

        if not self.cfg.MOTION.IS_CALIBRATED:
            print("intrinsics (fx, fy, cx, cy): ", self.intrinsics)

    def update_depths(self, itr=0):
        n = self.images.shape[0]
        inds_list = []

        if self.mode == 'keyframe':
            feed_dict = {
                self.images_placeholder: self.images,
                self.poses_placeholder: self.poses,
                self.intrinsics_placeholder: self.intrinsics
            }

            self.depths = self.sess.run(self.outputs['depths'], feed_dict=feed_dict)

        else:
            for i in range(n):
                inds = np.arange(n).tolist()
                inds.remove(i)
                inds = [i] + inds
                inds_list.append(inds)

            adj_list = np.array(inds_list, dtype=np.int32)

            if n <= 4:
                feed_dict = {
                    self.images_placeholder: self.images,
                    self.poses_placeholder: self.poses,
                    self.adj_placeholder: adj_list,
                    self.intrinsics_placeholder: self.intrinsics
                }

                self.depths = self.sess.run(self.outputs['depths'], feed_dict=feed_dict)

            else:
                # we need to split up inference to fit in memory
                s = 2
                for i in range(0, n, s):
                    feed_dict = {
                        self.images_placeholder: self.images,
                        self.poses_placeholder: self.poses,
                        self.adj_placeholder: adj_list[i:i + s],
                        self.intrinsics_placeholder: self.intrinsics
                    }

                    self.depths[i:i + s] = self.sess.run(self.outputs['depths'],
                                                         feed_dict=feed_dict)

    def vizualize_output(self, inds=[0]):
        feed_dict = {
            self.images_placeholder: self.images,
            self.depths_placeholder: self.depths,
            self.poses_placeholder: self.poses,
            self.intrinsics_placeholder: self.intrinsics
        }

        keyframe_image = self.images[0]
        keyframe_depth = self.depths[0]

        image_depth = vis.create_image_depth_figure(keyframe_image, keyframe_depth)
        cv2.imwrite('depth.png', image_depth[:, image_depth.shape[1] // 2:])

        cv2.imshow('image_depth', image_depth / 255.0)
        print("Press any key to continue")
        cv2.waitKey()

        # use depth map to create point cloud
        point_cloud, point_colors = self.sess.run(self.outputs['point_cloud'],
                                                  feed_dict=feed_dict)

        print("Press q to exit")
        vis.visualize_prediction(point_cloud, point_colors, self.poses)

    def __call__(self, images, intrinsics=None, iters=5, viz=False):
        n_frames = len(images)
        self.images = np.stack(images, axis=0)

        if intrinsics is None:
            # initialize intrinsics from the stacked image dimensions
            fx = self.images.shape[2] * 1.2
            fy = self.images.shape[2] * 1.2
            cx = self.images.shape[2] / 2.0
            cy = self.images.shape[1] / 2.0
            intrinsics = np.stack([fx, fy, cx, cy])  # (fx, fy, cx, cy)

        self.intrinsics = intrinsics

        poses = np.eye(4).reshape(1, 4, 4)
        poses = np.tile(poses, [n_frames, 1, 1])
        self.poses = poses

        # initialize reconstruction
        self.deepv2d_init()

        for i in range(iters):
            self.update_poses(i)
            self.update_depths()

        if viz:
            self.vizualize_output()

        return self.depths, self.poses
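# --- Illustrative usage (not from the original source) -----------------------
# A minimal sketch of driving DeepV2D end-to-end on a short clip. The function
# name, the `image_files` argument, and the cfg/ckpt values are assumptions
# for illustration; only the DeepV2D class itself comes from this file.
def run_deepv2d_example(image_files, cfg, ckpt):
    # load frames with OpenCV (BGR uint8, all the same size)
    images = [cv2.imread(f) for f in image_files]

    deepv2d = DeepV2D(cfg, ckpt, use_fcrn=True, mode='keyframe')
    with tf.Session() as sess:
        deepv2d.set_session(sess)
        # intrinsics=None falls back to the heuristic (fx, fy, cx, cy) init
        depths, poses = deepv2d(images, iters=5)

    return depths, poses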
class DeepV2DSLAM:
    def __init__(self, cfg, ckpt, n_keyframes=1, rate=2, use_fcrn=True,
                 viz=True, mode='global', image_dims=[None, 480, 640]):
        self.cfg = cfg
        self.ckpt = ckpt
        self.viz = viz
        self.mode = mode
        self.use_fcrn = use_fcrn
        self.image_dims = image_dims

        self.index = 0
        self.keyframe_inds = []
        self.images = []
        self.depths = []
        self.poses = []

        # tracking config parameters
        self.n_keyframes = n_keyframes  # number of keyframes to use
        self.rate = rate                # how often to sample new frames
        self.window = 3                 # add edges if frames are within distance

        # build tensorflow graphs
        self.outputs = {}
        self._create_placeholders()
        self._build_motion_graph()
        self._build_depth_graph()
        self._build_reprojection_graph()
        self._build_visibility_graph()
        self._build_point_cloud_graph()

        if self.use_fcrn:
            self._build_fcrn_graph()

        self.saver = tf.train.Saver(tf.model_variables())

    def set_session(self, sess):
        self.sess = sess
        self.saver.restore(self.sess, self.ckpt)

        if self.use_fcrn:
            fcrn_vars = {}
            for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="FCRN"):
                fcrn_vars[var.name.replace('FCRN/', '').replace(':0', '')] = var

            fcrn_saver = tf.train.Saver(fcrn_vars)
            fcrn_saver.restore(sess, 'models/NYU_FCRN.ckpt')

    def start_visualization(self, cinematic=False, render_path=None, clear_points=False):
        """ Start interactive slam visualization in separate process """
        # new points and poses get added to the queue
        self.queue = Queue()
        self.vis_counter = 0

        self.viz = vis.InteractiveViz(self.queue, cinematic, render_path, clear_points)
        self.viz.start()

    def _create_placeholders(self):
        frames, ht, wd = self.image_dims
        self.images_placeholder = tf.placeholder(tf.float32, [frames, ht, wd, 3])

        if self.mode == 'keyframe':
            self.depths_placeholder = tf.placeholder(tf.float32, [1, ht, wd])
        else:
            self.depths_placeholder = tf.placeholder(tf.float32, [frames, ht, wd])

        self.poses_placeholder = tf.placeholder(tf.float32, [frames, 4, 4])
        self.intrinsics_placeholder = tf.placeholder(tf.float32, [4])

        # placeholders for storing graph adj_list and edges
        self.edges_placeholder = tf.placeholder(tf.int32, [None, 2])
        self.adj_placeholder = tf.placeholder(tf.int32, [None, None])

        self.fixed_placeholder = tf.placeholder(tf.int32, [])
        self.init_placeholder = tf.placeholder(tf.bool, [])

    def _build_motion_graph(self):
        """ Motion graph updates poses using depth as input """
        self.motion_net = MotionNetwork(self.cfg.MOTION,
                                        mode='global',  # use global optimization mode
                                        is_training=False)

        images = self.images_placeholder[tf.newaxis]
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        edge_inds = tf.unstack(self.edges_placeholder, num=2, axis=-1)

        # convert pose matrices into SE3 object
        Ts = VideoSE3Transformation(matrix=poses)
        batch, num = Ts.shape()

        Ts, intrinsics = self.motion_net.forward(Ts, images, depths, intrinsics,
                                                 inds=edge_inds,
                                                 num_fixed=self.fixed_placeholder)

        # convert SE3 object back to matrix representation
        self.outputs['poses'] = tf.squeeze(Ts.matrix(), 0)
        self.outputs['intrinsics'] = intrinsics

    def _build_depth_graph(self):
        """ Depth graph updates depth using poses as input """
        self.depth_net = DepthNetwork(self.cfg.STRUCTURE, is_training=False)
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        Ts = VideoSE3Transformation(matrix=poses)

        adj_list = None
        if self.mode == 'global':
            adj_list = self.adj_placeholder

        depths = self.depth_net.forward(Ts, images, intrinsics, adj_list)
        self.outputs['depths'] = depths

    def _build_visibility_graph(self):
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        Ts = VideoSE3Transformation(matrix=poses)
        ii, jj = tf.unstack(self.edges_placeholder, num=2, axis=-1)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths, intrinsics = rescale_depths_and_intrinsics(depths, intrinsics, downscale=4)
        ht = tf.cast(tf.shape(depths)[2], tf.float32)
        wd = tf.cast(tf.shape(depths)[3], tf.float32)

        depths = tf.gather(depths, ii, axis=1)
        Tij = Ts.gather(jj) * Ts.gather(ii).inv()

        flow = Tij.induced_flow(depths, intrinsics)
        coords = Tij.transform(depths, intrinsics)

        # translation only (note: rotation_mask is defined for this purpose but
        # is not currently passed to induced_flow, so flow_translation == flow)
        rotation_mask = [1.0, 1.0, 1.0, 0.0, 0.0, 0.0]
        flow_translation = Tij.induced_flow(depths, intrinsics)

        flo_graph = tf.sqrt(tf.reduce_sum(flow**2, axis=-1))
        flo_graph = tf.reduce_mean(flo_graph, [-1, -2])

        pos_graph = tf.sqrt(tf.reduce_sum(flow_translation**2, axis=-1))
        pos_graph = tf.reduce_mean(pos_graph, [-1, -2])

        contained = tf.to_float(
            (coords[..., 0] > 0.0) & (coords[..., 0] < wd) &
            (coords[..., 1] > 0.0) & (coords[..., 1] < ht))

        vis_graph = tf.reduce_mean(contained, [-1, -2])
        self.outputs['visibility'] = (flo_graph[0], vis_graph[0])

    def _build_fcrn_graph(self):
        """ Build single image initialization graph """
        images = self.images_placeholder
        batch, ht, wd, _ = tf.unstack(tf.shape(images), num=4)

        with tf.variable_scope("FCRN") as scope:
            # crop out border and flip color channels
            fcrn_input = tf.image.resize_area(images[:, 4:-4, 6:-6, ::-1], [228, 304])

            net = fcrn.ResNet50UpProj({'data': fcrn_input}, batch, 1, False)
            fcrn_output = tf.stop_gradient(net.get_output())
            fcrn_output = tf.image.resize_bilinear(fcrn_output, [ht, wd])

        self.outputs['fcrn'] = tf.squeeze(fcrn_output, -1)

    def compute_visibility_graph(self, edges=None):
        """ Computes matrices of optical flow and visibility between all pairs of frames
        Ex. flo_matrix[i, j] is the mean optical flow between camera i and camera j
        Ex. vis_matrix[i, j] is the portion of points in camera i visible in camera j
        """
        vis_matrix = False
        if edges is None:
            num = len(self.keyframe_images)
            vis_matrix = True
            ii, jj = np.meshgrid(np.arange(num), np.arange(num))
            ii = np.reshape(ii, [-1])
            jj = np.reshape(jj, [-1])
            edges = np.stack([jj, ii], axis=-1)

        feed_dict = {
            self.depths_placeholder: np.stack(self.keyframe_depths, axis=0),
            self.poses_placeholder: np.stack(self.keyframe_poses, axis=0),
            self.edges_placeholder: edges,
            self.intrinsics_placeholder: self.intrinsics
        }

        flo_graph, pos_graph = self.sess.run(self.outputs['visibility'],
                                             feed_dict=feed_dict)

        if vis_matrix:
            flo_matrix = flo_graph.reshape(num, num)
            pos_matrix = pos_graph.reshape(num, num)
            return flo_matrix, pos_matrix

        return flo_graph, pos_graph

    def _build_point_cloud_graph(self):
        """ Use poses and depth maps to create point cloud """
        depths = self.depths_placeholder[tf.newaxis]
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths_pad = tf.pad(depths, [[0, 0], [0, 0], [0, 1], [0, 1]], "CONSTANT")

        depths_grad = \
            (depths_pad[:, :, 1:, :-1] - depths_pad[:, :, :-1, :-1])**2 + \
            (depths_pad[:, :, :-1, 1:] - depths_pad[:, :, :-1, :-1])**2

        # don't use large depths for point cloud and ignore boundary regions
        valid = (depths < 6.0) & (depths_grad < 0.05)

        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)

        Ts = VideoSE3Transformation(matrix=poses)
        X0 = projective_ops.backproject(depths, intrinsics)

        # transform point cloud into world coordinates
        X1 = Ts.inv()(X0)

        crop_h0 = 20
        crop_h1 = 12
        crop_w = 32

        X1 = X1[:, :, crop_h0:-crop_h1, crop_w:-crop_w]
        valid = valid[:, :, crop_h0:-crop_h1, crop_w:-crop_w]
        images = images[:, :, crop_h0:-crop_h1, crop_w:-crop_w, ::-1]

        X1 = tf.reshape(X1, [-1, 3])
        colors = tf.reshape(images, [-1, 3])

        valid_inds = tf.where(tf.reshape(valid, [-1]))
        valid_inds = tf.reshape(valid_inds, [-1])

        X1 = tf.gather(X1, valid_inds, axis=0)
        colors = tf.gather(colors, valid_inds, axis=0)

        self.outputs['point_cloud'] = (X1, colors)

    def _build_reprojection_graph(self):
        """ Used to project depth from keyframes onto new frame """
        EPS = 1e-8
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)
        Ts = VideoSE3Transformation(matrix=poses)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        ii, jj = tf.meshgrid(tf.range(0, num), tf.range(num, num + 1))
        ii = tf.reshape(ii, [-1])
        jj = tf.reshape(jj, [-1])

        Tij = Ts.gather(jj) * Ts.gather(ii).inv()
        X0 = projective_ops.backproject(depths, intrinsics)
        X1 = Tij(X0)

        coords = projective_ops.project(X1, intrinsics)
        depths = X1[..., 2]

        indices = tf.cast(coords[..., ::-1] + .5, tf.int32)
        indices = tf.reshape(indices, [-1, 2])
        depths = tf.reshape(depths, [-1])

        depth = tf.scatter_nd(indices, depths, [ht, wd])
        count = tf.scatter_nd(indices, tf.ones_like(depths), [ht, wd])

        depth = depth / (count + EPS)
        self.outputs['depth_reprojection'] = depth

    def reproject_depth(self, query_pose, margin=2):
        """ Use depth estimates and poses to estimate depth map at a new camera location """
        keyframe_pose = self.poses[self.keyframe_inds[-1]]
        poses = np.stack([keyframe_pose, query_pose], axis=0)

        keyframe_depth = self.depths[self.keyframe_inds[-1]]
        depths = keyframe_depth[np.newaxis]

        feed_dict = {
            self.depths_placeholder: depths,
            self.poses_placeholder: poses,
            self.intrinsics_placeholder: self.intrinsics
        }

        depth = self.sess.run(self.outputs['depth_reprojection'], feed_dict=feed_dict)
        return fill_depth(depth)

    def deepv2d_init(self):
        if self.use_fcrn:
            feed_dict = {
                self.images_placeholder: np.stack(self.images, axis=0)
            }
            depths_init = self.sess.run(self.outputs['fcrn'], feed_dict=feed_dict)

        else:
            ii = np.arange(len(self.images))
            adj = np.stack([ii, ii], axis=-1)

            feed_dict = {
                self.images_placeholder: np.stack(self.images, axis=0),
                self.poses_placeholder: np.stack(self.poses, axis=0),
                self.adj_placeholder: adj,
                self.intrinsics_placeholder: self.intrinsics
            }
            depths_init = self.sess.run(self.outputs['depths'], feed_dict=feed_dict)

        self.depths = [depth for depth in depths_init]

    def update_poses(self, fixed=1, margin=3):
        """ Update the poses by executing the motion graph, fix first keyframe """
        n_images = len(self.images)
        start_idx = max(self.keyframe_inds[0] - margin, 0)

        edges = []
        for i in self.keyframe_inds:
            for j in range(start_idx, n_images):
                if (i != j) and (abs(i - j) <= self.window):
                    edges.append((i, j))

        edges = np.stack(edges, axis=0) - start_idx

        images = np.stack(self.images[start_idx:], axis=0)
        depths = np.stack(self.depths[start_idx:], axis=0)
        poses = np.stack(self.poses[start_idx:], axis=0)

        if not fixed:
            fixed = 0

        feed_dict = {
            self.images_placeholder: images,
            self.depths_placeholder: depths,
            self.poses_placeholder: poses,
            self.edges_placeholder: edges,
            self.fixed_placeholder: np.int32(fixed),
            self.init_placeholder: False,
            self.intrinsics_placeholder: self.intrinsics
        }

        # execute pose subgraph
        poses = self.sess.run(self.outputs['poses'], feed_dict=feed_dict)

        # update the poses
        for j in range(poses.shape[0]):
            self.poses[start_idx + j] = poses[j]

        self.pose_cur = self.poses[-1]

    def update_depths(self, fixed=1, margin=3):
        """ Update the depths by executing the depth graph """
        n_images = len(self.images)
        start_idx = max(self.keyframe_inds[0] - margin, 0)

        # faster if we batch multiple depth updates together
        inds = self.keyframe_inds
        if fixed and len(self.keyframe_inds) > 1:
            inds = inds[fixed:]  # fix depth for first keyframe

        adj_list = []
        for i in inds:
            adj_inds = []
            for j in range(start_idx, n_images):
                if (i != j) and (abs(i - j) <= self.window):
                    adj_inds.append(j)

            # make sure all adj lists are the same size
            if len(adj_inds) < 2 * self.window:
                adj_inds = np.random.choice(adj_inds, 2 * self.window,
                                            replace=True).tolist()

            adj_inds = [i] + adj_inds
            adj_list.append(np.array(adj_inds, dtype=np.int32))

        adj_list = np.stack(adj_list, axis=0) - start_idx

        images = np.stack(self.images[start_idx:], axis=0)
        poses = np.stack(self.poses[start_idx:], axis=0)

        feed_dict = {
            self.images_placeholder: images,
            self.poses_placeholder: poses,
            self.adj_placeholder: adj_list,
            self.intrinsics_placeholder: self.intrinsics,
        }

        depths = self.sess.run(self.outputs['depths'], feed_dict=feed_dict)

        # update the keyframe depths
        for i, keyframe_index in enumerate(inds):
            self.depths[keyframe_index] = depths[i]

    def visualize_output(self, keyframe_index):
        """ Backproject a point cloud then add point cloud to visualization """
        self.vis_counter += 1

        keyframe_image = self.images[keyframe_index]
        keyframe_depth = self.depths[keyframe_index]
        keyframe_pose = self.poses[keyframe_index]

        feed_dict = {
            self.images_placeholder: keyframe_image[np.newaxis],
            self.depths_placeholder: keyframe_depth[np.newaxis],
            self.poses_placeholder: keyframe_pose[np.newaxis],
            self.intrinsics_placeholder: self.intrinsics
        }

        keyframe_point_cloud, keyframe_point_colors = \
            self.sess.run(self.outputs['point_cloud'], feed_dict=feed_dict)

        pointcloud = (keyframe_point_cloud, keyframe_point_colors)

        # only add the point cloud once every 4 frames
        if self.vis_counter % 4 == 0:
            self.queue.put((pointcloud, keyframe_pose))
        else:
            self.queue.put((None, keyframe_pose))

    def display_keyframes(self):
        """ display image / depth keyframe pairs """
        if len(self.keyframe_inds) > 0:
            image_stack = []
            for keyframe_index in self.keyframe_inds:
                keyframe_image = self.images[keyframe_index]
                keyframe_depth = self.depths[keyframe_index]

                image_and_depth = vis.create_image_depth_figure(keyframe_image,
                                                                keyframe_depth)
                image_stack.append(image_and_depth)

            image_stack = np.concatenate(image_stack, axis=0)
            if len(self.keyframe_inds) > 1:
                image_stack = cv2.resize(image_stack, None, fx=0.5, fy=0.5)

            cv2.imshow('keyframes', image_stack / 255.0)
            cv2.waitKey(10)

    def track(self, image):
        """ track the new frame """
        keyframe_image = self.images[self.keyframe_inds[-1]]
        images = np.stack([keyframe_image, image], axis=0)

        keyframe_pose = self.poses[self.keyframe_inds[-1]]
        poses = np.stack([keyframe_pose, self.pose_cur], axis=0)

        keyframe_depth = self.depths[self.keyframe_inds[-1]]
        depths = keyframe_depth[np.newaxis]

        edges = np.array([[0, 1]], dtype=np.int32)
        fixed = np.int32(0)

        feed_dict = {
            self.images_placeholder: images,
            self.depths_placeholder: depths,
            self.poses_placeholder: poses,
            self.edges_placeholder: edges,
            self.fixed_placeholder: fixed,
            self.init_placeholder: False,
            self.intrinsics_placeholder: self.intrinsics
        }

        updated_poses = self.sess.run(self.outputs['poses'], feed_dict=feed_dict)

        # relative pose between keyframe and new pose
        dP = np.matmul(updated_poses[1], np.linalg.inv(updated_poses[0]))

        # tracking probably lost, attempt recovery; sometimes caused by gaps between frames
        if pose_distance(dP) > 0.8:
            feed_dict = {
                self.images_placeholder: images,
                self.depths_placeholder: depths,
                self.poses_placeholder: poses,
                self.edges_placeholder: edges,
                self.fixed_placeholder: fixed,
                self.init_placeholder: True,
                self.intrinsics_placeholder: self.intrinsics
            }

            updated_poses = self.sess.run(self.outputs['poses'], feed_dict=feed_dict)
            dP = np.matmul(updated_poses[1], np.linalg.inv(updated_poses[0]))

        self.pose_cur = np.matmul(dP, keyframe_pose)
        return pose_distance(dP)

    def __call__(self, image, intrinsics=None):
        if intrinsics is not None:
            self.intrinsics = intrinsics

        ht, wd, _ = image.shape  # get image dimensions
        did_make_new_keyframe = False

        if len(self.images) < 4:  # tracking has not yet begun
            if self.index % self.rate == 0:
                self.images.append(image)
                self.depths.append(np.ones((ht, wd)))
                self.poses.append(np.eye(4))

            # initialize the tracker!
            if len(self.images) == 4:
                self.deepv2d_init()

                # set the keyframes
                self.keyframe_inds = np.random.randint(0, 4, self.n_keyframes)
                self.keyframe_inds = sorted(self.keyframe_inds.tolist())

                for i in range(3):
                    self.update_poses(fixed=False)
                    self.update_depths(fixed=False)

        else:
            dist = self.track(image)

            if dist > 0.8:
                new_keyframe_index = len(self.images) - 1
                query_pose = self.poses[new_keyframe_index]

                depth_new = self.reproject_depth(query_pose)
                self.depths[new_keyframe_index] = depth_new
                self.keyframe_inds.append(new_keyframe_index)

                if len(self.keyframe_inds) > self.n_keyframes:
                    old_keyframe_index = self.keyframe_inds.pop(0)
                    self.visualize_output(old_keyframe_index)

                self.update_poses(fixed=2)
                self.update_depths()

            if self.index % self.rate == 0 and (dist > 0.1):
                self.images.append(image)
                self.depths.append(np.ones((ht, wd)))
                self.poses.append(self.pose_cur)

                self.update_poses(fixed=2)
                self.update_depths()

                # make a new keyframe
                if len(self.images) - self.keyframe_inds[-1] >= self.window:
                    new_keyframe_index = self.keyframe_inds[-1] + 2
                    query_pose = self.poses[new_keyframe_index]

                    depth_new = self.reproject_depth(query_pose)
                    self.depths[new_keyframe_index] = depth_new
                    self.keyframe_inds.append(new_keyframe_index)

                    if len(self.keyframe_inds) > self.n_keyframes:
                        old_keyframe_index = self.keyframe_inds.pop(0)
                        self.visualize_output(old_keyframe_index)

                    self.update_poses(fixed=2)
                    self.update_depths()

        self.display_keyframes()
        self.index += 1
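# --- Illustrative usage (not from the original source) -----------------------
# A minimal sketch of feeding a frame stream into the SLAM front end. The
# `frame_stream` iterable and the intrinsics vector (fx, fy, cx, cy) are
# assumptions for illustration; only DeepV2DSLAM itself comes from this file.
def run_slam_example(frame_stream, intrinsics, cfg, ckpt):
    slam = DeepV2DSLAM(cfg, ckpt, n_keyframes=2)

    with tf.Session() as sess:
        slam.set_session(sess)
        slam.start_visualization()

        # each call tracks the frame, updates keyframes, and refreshes the display
        for image in frame_stream:
            slam(image, intrinsics)

    return slam.poses, slam.depths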
def build_train_graph_stage2(self, cfg, num_gpus=1):
    with tf.name_scope("training_schedule"):
        global_step = tf.Variable(0, name='global_step', trainable=False)
        gs = tf.to_float(global_step)

        if cfg.TRAIN.RENORM:
            rmax = tf.clip_by_value(5.0 * (gs / 2.5e4) + 1.0, 1.0, 5.0)  # rmax schedule
            dmax = tf.clip_by_value(8.0 * (gs / 2.5e4), 0.0, 8.0)        # dmax schedule
            rmin = 1.0 / rmax
            schedule = {'rmax': rmax, 'rmin': rmin, 'dmax': dmax}
        else:
            schedule = None

        LR_DECAY = int(0.8 * self.training_steps)
        lr = tf.train.exponential_decay(cfg.TRAIN.LR, global_step, LR_DECAY,
                                        0.2, staircase=True)

        stereo_optim = tf.train.RMSPropOptimizer(lr)
        motion_optim = tf.train.RMSPropOptimizer(MOTION_LR_FRACTION * lr)

    id_batch, images_batch, poses_batch, gt_batch, filled_batch, pred_batch, \
        intrinsics_batch = self.dl.next()

    images_batch = tf.split(images_batch, num_gpus)
    poses_batch = tf.split(poses_batch, num_gpus)
    gt_batch = tf.split(gt_batch, num_gpus)
    filled_batch = tf.split(filled_batch, num_gpus)
    pred_batch = tf.split(pred_batch, num_gpus)
    intrinsics_batch = tf.split(intrinsics_batch, num_gpus)

    tower_motion_grads = []
    tower_stereo_grads = []
    tower_predictions = []
    tower_losses = []
    write_ops = []

    for gpu_id in range(num_gpus):
        motion_net = MotionNetwork(cfg.MOTION, reuse=gpu_id > 0)
        depth_net = DepthNetwork(cfg.STRUCTURE, schedule=schedule, reuse=gpu_id > 0)

        images = images_batch[gpu_id]
        poses = poses_batch[gpu_id]
        depth_gt = gt_batch[gpu_id]
        depth_filled = filled_batch[gpu_id]
        depth_pred = pred_batch[gpu_id]
        intrinsics = intrinsics_batch[gpu_id]

        Gs = VideoSE3Transformation(matrix=poses)
        batch, frames, height, width, _ = images.get_shape().as_list()

        with tf.name_scope("depth_input"):
            input_prob = tf.train.exponential_decay(2.0, global_step, LR_DECAY,
                                                    0.02, staircase=False)
            rnd = tf.random_uniform([], 0, 1)
            depth_input = tf.cond(rnd < input_prob,
                                  lambda: depth_filled,
                                  lambda: depth_pred)

        with tf.device('/gpu:%d' % gpu_id):
            # motion inference
            Ts, kvec = motion_net.forward(None, images,
                                          depth_input[:, tf.newaxis], intrinsics)

            stop_cond = global_step < cfg.TRAIN.GT_POSE_ITERS
            Ts = cond_transform(stop_cond, Ts.copy(stop_gradients=True), Ts)
            kvec = tf.cond(stop_cond, lambda: tf.stop_gradient(kvec), lambda: kvec)

            # depth inference
            depth_pr = depth_net.forward(Ts, images, kvec)

            depth_loss = depth_net.compute_loss(depth_gt, log_error=(gpu_id == 0))
            motion_loss = motion_net.compute_loss(Gs, depth_filled[:, tf.newaxis],
                                                  intrinsics, log_error=(gpu_id == 0))

            if 1:
                # compute all gradients jointly
                total_loss = cfg.TRAIN.DEPTH_WEIGHT * depth_loss + motion_loss
                var_list = tf.trainable_variables()
                grads = gradients(total_loss, var_list)

            else:
                # split backward pass
                motion_vars = tf.get_collection(tf.GraphKeys.MODEL_VARIABLES,
                                                scope="motion")
                stereo_vars = tf.get_collection(tf.GraphKeys.MODEL_VARIABLES,
                                                scope="stereo")

                so3, translation = Ts.so3, Ts.translation
                stereo_grads = gradients(depth_loss, [so3, translation] + stereo_vars)

                diff_so3, diff_translation, stereo_grads = \
                    stereo_grads[0], stereo_grads[1], stereo_grads[2:]

                motion_grads = tf.gradients(
                    [motion_loss, so3, translation], motion_vars,
                    grad_ys=[tf.ones_like(motion_loss), diff_so3, diff_translation])

                grads = stereo_grads + motion_grads
                var_list = stereo_vars + motion_vars

            motion_gvs = []
            stereo_gvs = []

            for (g, v) in zip(grads, var_list):
                if 'stereo' in v.name and (g is not None):
                    if cfg.TRAIN.CLIP_GRADS:
                        g = tf.clip_by_value(g, -1.0, 1.0)
                    stereo_gvs.append((g, v))

                if 'motion' in v.name and (g is not None):
                    if cfg.TRAIN.CLIP_GRADS:
                        g = tf.clip_by_value(g, -1.0, 1.0)
                    motion_gvs.append((g, v))

            tower_motion_grads.append(motion_gvs)
            tower_stereo_grads.append(stereo_gvs)
            tower_predictions.append(depth_pr)
            tower_losses.append(depth_loss)

            if gpu_id == 0:
                self.total_loss = depth_loss

    # use last gpu to compute batch norm statistics
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    tower_motion_gvs = average_gradients(tower_motion_grads)
    tower_stereo_gvs = average_gradients(tower_stereo_grads)

    with tf.name_scope("train_op"):
        with tf.control_dependencies(update_ops):
            self.train_op = tf.group(
                stereo_optim.apply_gradients(tower_stereo_gvs),
                motion_optim.apply_gradients(tower_motion_gvs),
                tf.assign(global_step, global_step + 1))

    self.write_op = self.dl.write(id_batch, tf.concat(tower_predictions, axis=0))
    self.total_loss = tf.reduce_mean(tf.stack(tower_losses, axis=0))

    tf.summary.scalar("total_loss", self.total_loss)
    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("input_prob", input_prob)
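# --- Illustrative helper sketch (not from the original source) ---------------
# `average_gradients` above is imported from elsewhere in the repo; this is a
# minimal re-implementation of the standard multi-tower averaging idiom, shown
# only to document the expected input format: one list of (gradient, variable)
# pairs per GPU, with variables appearing in the same order across towers.
def average_gradients_sketch(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # stack the per-tower gradients for one variable and take the mean
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), 0)
        # all towers share the variable, so keep the first tower's reference
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads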
def build_train_graph_stage1(self, cfg, num_gpus=1):
    id_batch, images_batch, poses_batch, gt_batch, filled_batch, pred_batch, \
        intrinsics_batch = self.dl.next()

    images_batch = tf.split(images_batch, num_gpus)
    poses_batch = tf.split(poses_batch, num_gpus)
    gt_batch = tf.split(gt_batch, num_gpus)
    filled_batch = tf.split(filled_batch, num_gpus)
    pred_batch = tf.split(pred_batch, num_gpus)
    intrinsics_batch = tf.split(intrinsics_batch, num_gpus)

    with tf.name_scope("training_schedule"):
        global_step = tf.Variable(0, name='global_step', trainable=False)
        lr = tf.train.exponential_decay(cfg.TRAIN.LR, global_step, 5000,
                                        0.5, staircase=True)
        optim = tf.train.RMSPropOptimizer(MOTION_LR_FRACTION * lr)

    tower_grads = []
    tower_losses = []

    for gpu_id in range(num_gpus):
        images = images_batch[gpu_id]
        poses = poses_batch[gpu_id]
        depth_gt = gt_batch[gpu_id]
        depth_filled = filled_batch[gpu_id]
        depth_pred = pred_batch[gpu_id]
        intrinsics = intrinsics_batch[gpu_id]

        Gs = VideoSE3Transformation(matrix=poses)
        motion_net = MotionNetwork(cfg.MOTION, bn_is_training=True, reuse=gpu_id > 0)

        with tf.device('/gpu:%d' % gpu_id):
            depth_input = tf.expand_dims(depth_filled, 1)
            Ts, kvec = motion_net.forward(None, images, depth_input, intrinsics)

            total_loss = motion_net.compute_loss(Gs, depth_input, intrinsics,
                                                 log_error=(gpu_id == 0))
            tower_losses.append(total_loss)

            var_list = tf.trainable_variables()
            grads = gradients(total_loss, var_list)

            gvs = []
            for (g, v) in zip(grads, var_list):
                if g is not None:
                    if cfg.TRAIN.CLIP_GRADS:
                        g = tf.clip_by_value(g, -1.0, 1.0)
                    gvs.append((g, v))

            tower_grads.append(gvs)

    # use last gpu to compute batch norm statistics
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.name_scope("train_op"):
        gvs = average_gradients(tower_grads)
        total_loss = tf.reduce_mean(tf.stack(tower_losses, axis=0))

        with tf.control_dependencies(update_ops):
            self.train_op = optim.apply_gradients(gvs, global_step)

        self.write_op = None
        self.total_loss = total_loss

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("total_loss", total_loss)
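# --- Illustrative training-loop sketch (not from the original source) --------
# A minimal outline of how the stage-1 graph above might be driven; the
# `trainer` object (an instance exposing the methods above and a dataloader),
# the step count, and the logging interval are assumptions for illustration.
def run_stage1_example(trainer, cfg, num_gpus=1, steps=10000):
    trainer.build_train_graph_stage1(cfg, num_gpus=num_gpus)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(steps):
            _, loss = sess.run([trainer.train_op, trainer.total_loss])
            if step % 100 == 0:
                print("step %d, loss %f" % (step, loss))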