Example #1
    def _build_motion_graph(self):
        self.motion_net = MotionNetwork(self.cfg.MOTION,
                                        mode=self.mode,
                                        use_regressor=self.use_regressor,
                                        is_calibrated=self.is_calibrated,
                                        is_training=False)

        images = self.images_placeholder[tf.newaxis]
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]

        do_init = self.init_placeholder
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        edge_inds = tf.unstack(self.edges_placeholder, num=2, axis=-1)

        # convert pose matrix into SE3 object
        Ts = VideoSE3Transformation(matrix=poses)

        Ts, intrinsics = self.motion_net.forward(Ts,
                                                 images,
                                                 depths,
                                                 intrinsics,
                                                 edge_inds,
                                                 init=do_init)

        self.outputs['poses'] = tf.squeeze(Ts.matrix(), 0)
        self.outputs['intrinsics'] = intrinsics[0]
        self.outputs['weights'] = self.motion_net.weights_history[-1]
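
A minimal sketch of how this pose subgraph might be evaluated once it is built. The helper below mirrors the feed_dict used by update_poses() in Example #3; the function name, the instance argument, and the array shapes in the comments are assumptions, not part of the original code.

# Assumed: `net` is a DeepV2D instance (Example #3) whose graph has been built and
# whose session has been attached via net.set_session(sess).
def run_pose_update(net, images, depths, poses, intrinsics, edges, first_iter=True):
    """Evaluate the motion subgraph once; returns updated poses, intrinsics, weights."""
    feed_dict = {
        net.images_placeholder: images,          # [frames, ht, wd, 3] float32
        net.depths_placeholder: depths,          # [1, ht, wd] in 'keyframe' mode
        net.poses_placeholder: poses,            # [frames, 4, 4]
        net.edges_placeholder: edges,            # [num_edges, 2] int32 (i, j) pairs
        net.init_placeholder: first_iter,        # True only on the first iteration
        net.intrinsics_placeholder: intrinsics,  # (fx, fy, cx, cy)
    }
    return net.sess.run(
        [net.outputs['poses'], net.outputs['intrinsics'], net.outputs['weights']],
        feed_dict=feed_dict)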
Example #2
    def _build_motion_graph(self):
        """ Motion graph updates poses using depth as input """

        self.motion_net = MotionNetwork(
            self.cfg.MOTION,
            mode='global',  # use global optimization mode
            is_training=False)

        images = self.images_placeholder[tf.newaxis]
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        edge_inds = tf.unstack(self.edges_placeholder, num=2, axis=-1)

        # convert pose matrices into SE3 object
        Ts = VideoSE3Transformation(matrix=poses)
        batch, num = Ts.shape()

        Ts, intrinsics = self.motion_net.forward(
            Ts,
            images,
            depths,
            intrinsics,
            inds=edge_inds,
            num_fixed=self.fixed_placeholder)

        # convert SE3 object back to matrix representation
        self.outputs['poses'] = tf.squeeze(Ts.matrix(), 0)
        self.outputs['intrinsics'] = intrinsics
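
This variant takes an explicit edge list and a num_fixed count instead of an init flag. As a quick illustration of the expected edge format (and of what tf.unstack produces from it), here is a hedged sketch of the window-based edge construction used by update_poses() in Example #4; the concrete index values are made up for illustration.

import numpy as np

# Build (keyframe, neighbor) index pairs within a temporal window, as in Example #4.
keyframe_inds, n_images, window = [2, 4], 6, 3   # assumed values for illustration
edges = [(i, j) for i in keyframe_inds
         for j in range(n_images)
         if i != j and abs(i - j) <= window]
edges = np.array(edges, dtype=np.int32)          # shape [num_edges, 2], feeds edges_placeholder
ii, jj = edges[:, 0], edges[:, 1]                # what tf.unstack(..., num=2, axis=-1) yields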
Example #3
class DeepV2D:
    def __init__(self,
                 cfg,
                 ckpt,
                 is_calibrated=True,
                 use_fcrn=False,
                 use_regressor=True,
                 image_dims=None,
                 mode='keyframe'):

        self.cfg = cfg
        self.ckpt = ckpt
        self.mode = mode

        self.use_fcrn = use_fcrn
        self.use_regressor = use_regressor
        self.is_calibrated = is_calibrated

        if image_dims is not None:
            self.image_dims = image_dims
        else:
            if cfg.STRUCTURE.MODE == 'concat':
                self.image_dims = [
                    cfg.INPUT.FRAMES, cfg.INPUT.HEIGHT, cfg.INPUT.WIDTH
                ]
            else:
                self.image_dims = [None, cfg.INPUT.HEIGHT, cfg.INPUT.WIDTH]

        self.outputs = {}
        self._create_placeholders()
        self._build_motion_graph()
        self._build_depth_graph()
        self._build_reprojection_graph()
        self._build_visibility_graph()
        self._build_point_cloud_graph()

        self.depths = []
        self.poses = []

        if self.use_fcrn:
            self._build_fcrn_graph()

        self.saver = tf.train.Saver(tf.model_variables())

    def set_session(self, sess):
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        self.saver.restore(self.sess, self.ckpt)

        if self.use_fcrn:
            fcrn_vars = {}
            for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="FCRN"):
                name = var.name.replace('FCRN/', '').replace(':0', '')
                fcrn_vars[name] = var

            fcrn_saver = tf.train.Saver(fcrn_vars)
            fcrn_saver.restore(sess, 'models/NYU_FCRN.ckpt')

    def _create_placeholders(self):
        frames, ht, wd = self.image_dims
        self.images_placeholder = tf.placeholder(tf.float32,
                                                 [frames, ht, wd, 3])
        if self.mode == 'keyframe':
            self.depths_placeholder = tf.placeholder(tf.float32, [1, ht, wd])
        else:
            self.depths_placeholder = tf.placeholder(tf.float32,
                                                     [frames, ht, wd])

        self.poses_placeholder = tf.placeholder(tf.float32, [frames, 4, 4])
        self.intrinsics_placeholder = tf.placeholder(tf.float32, [4])
        self.init_placeholder = tf.placeholder(tf.bool, [])

        # placeholders for storing graph adj_list and edges
        self.edges_placeholder = tf.placeholder(tf.int32, [None, 2])
        self.adj_placeholder = tf.placeholder(tf.int32, [None, None])

    def _build_motion_graph(self):
        self.motion_net = MotionNetwork(self.cfg.MOTION,
                                        mode=self.mode,
                                        use_regressor=self.use_regressor,
                                        is_calibrated=self.is_calibrated,
                                        is_training=False)

        images = self.images_placeholder[tf.newaxis]
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]

        do_init = self.init_placeholder
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        edge_inds = tf.unstack(self.edges_placeholder, num=2, axis=-1)

        # convert pose matrix into SE3 object
        Ts = VideoSE3Transformation(matrix=poses)

        Ts, intrinsics = self.motion_net.forward(Ts,
                                                 images,
                                                 depths,
                                                 intrinsics,
                                                 edge_inds,
                                                 init=do_init)

        self.outputs['poses'] = tf.squeeze(Ts.matrix(), 0)
        self.outputs['intrinsics'] = intrinsics[0]
        self.outputs['weights'] = self.motion_net.weights_history[-1]

    def _build_depth_graph(self):
        self.depth_net = DepthNetwork(self.cfg.STRUCTURE, is_training=False)
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        # convert pose matrix into SE3 object
        Ts = VideoSE3Transformation(matrix=poses)

        adj_list = None
        if self.mode == 'global':
            adj_list = self.adj_placeholder

        depths = self.depth_net.forward(Ts, images, intrinsics, adj_list)
        self.outputs['depths'] = depths

    def _build_point_cloud_graph(self):
        """Use poses and depth maps to create point cloud"""
        depths = self.depths_placeholder[tf.newaxis]
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths_pad = tf.pad(depths, [[0, 0], [0, 0], [0, 1], [0, 1]],
                            "CONSTANT")

        depths_grad = \
            (depths_pad[:, :, 1:, :-1] - depths_pad[:, :, :-1, :-1])**2 + \
            (depths_pad[:, :, :-1, 1:] - depths_pad[:, :, :-1, :-1])**2

        # don't use large depths for point cloud and ignore boundary regions
        valid = (depths < 5.0) & (depths_grad < 0.01)

        # depths, intrinsics = rescale_depths_and_intrinsics(depths, intrinsics, downscale=4)
        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)

        ii, jj = tf.meshgrid(tf.range(1), tf.range(0, num))
        ii = tf.reshape(ii, [-1])
        jj = tf.reshape(jj, [-1])

        Ts = VideoSE3Transformation(matrix=poses)
        X0 = projective_ops.backproject(depths, intrinsics)

        # transform point cloud into coordinate system defined by first frame
        X1 = (Ts.gather(ii) * Ts.gather(jj).inv())(X0)

        crop_h = 12
        crop_w = 32

        X1 = X1[:, :, crop_h:-crop_h, crop_w:-crop_w]
        valid = valid[:, :, crop_h:-crop_h, crop_w:-crop_w]
        images = images[:, :, crop_h:-crop_h, crop_w:-crop_w, ::-1]

        X1 = tf.reshape(X1, [-1, 3])
        colors = tf.reshape(images, [-1, 3])

        valid_inds = tf.where(tf.reshape(valid, [-1]))
        valid_inds = tf.reshape(valid_inds, [-1])

        X1 = tf.gather(X1, valid_inds, axis=0)
        colors = tf.gather(colors, valid_inds, axis=0)

        self.outputs['point_cloud'] = (X1, colors)

    def _build_reprojection_graph(self):
        """ Used to project depth from keyframes onto new frame """

        EPS = 1e-8
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)
        Ts = VideoSE3Transformation(matrix=poses)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        ii, jj = tf.meshgrid(tf.range(0, num), tf.range(num, num + 1))
        ii = tf.reshape(ii, [-1])
        jj = tf.reshape(jj, [-1])

        Tij = Ts.gather(jj) * Ts.gather(ii).inv()
        X0 = projective_ops.backproject(depths, intrinsics)
        X1 = Tij(X0)

        coords = projective_ops.project(X1, intrinsics)
        depths = X1[..., 2]

        indicies = tf.cast(coords[..., ::-1] + .5, tf.int32)
        indicies = tf.reshape(indicies, [-1, 2])
        depths = tf.reshape(depths, [-1])

        depth = tf.scatter_nd(indicies, depths, [ht, wd])
        count = tf.scatter_nd(indicies, tf.ones_like(depths), [ht, wd])

        depth = depth / (count + EPS)
        self.outputs['depth_reprojection'] = depth

    def _build_visibility_graph(self):
        """ Find induced optical flow between pairs of frames """

        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        Ts = VideoSE3Transformation(matrix=poses)
        ii, jj = tf.unstack(self.edges_placeholder, num=2, axis=-1)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths, intrinsics = rescale_depths_and_intrinsics(depths,
                                                           intrinsics,
                                                           downscale=4)
        ht = tf.cast(tf.shape(depths)[2], tf.float32)
        wd = tf.cast(tf.shape(depths)[3], tf.float32)

        depths = tf.gather(depths, ii, axis=1)
        Tij = Ts.gather(jj) * Ts.gather(ii).inv()

        flow = Tij.induced_flow(depths, intrinsics)
        coords = Tij.transform(depths, intrinsics)

        flo_graph = tf.sqrt(tf.reduce_sum(flow**2, axis=-1))
        flo_graph = tf.reduce_mean(flo_graph, [-1, -2])

        contained = tf.to_float((coords[..., 0] > 0.0) & (coords[..., 0] < wd)
                                & (coords[..., 1] > 0.0)
                                & (coords[..., 1] < ht))

        vis_graph = tf.reduce_mean(contained, [-1, -2])
        self.outputs['visibility'] = (flo_graph[0], vis_graph[0], flow)

    def _build_fcrn_graph(self):
        """ Build single image initializion graph"""
        images = self.images_placeholder
        batch, ht, wd, _ = tf.unstack(tf.shape(images), num=4)

        with tf.variable_scope("FCRN") as scope:
            # crop out border and flip color channels
            fcrn_input = tf.image.resize_area(images[:, 4:-4, 6:-6, ::-1],
                                              [228, 304])
            net = fcrn.ResNet50UpProj({'data': fcrn_input}, batch, 1, False)
            fcrn_output = tf.stop_gradient(net.get_output())
            fcrn_output = tf.image.resize_bilinear(fcrn_output, [ht, wd])

        self.outputs['fcrn'] = tf.squeeze(fcrn_output, -1)

    def compute_visibility_matrix(self):
        """ Computes a matrix of optical flow and visibility between all pairs of frames 
        Ex. flo_matrix[i,j] is the mean optical flow between camera i and camera j
        Ex. vis_matrix[i,j] is the portion of points in camera i visible in camera j """

        num = len(self.images)
        ii, jj = np.meshgrid(np.arange(num), np.arange(num))

        ii = np.reshape(ii, [-1])
        jj = np.reshape(jj, [-1])
        edges = np.stack([jj, ii], axis=-1)

        feed_dict = {
            self.depths_placeholder: self.depths,
            self.poses_placeholder: self.poses,
            self.edges_placeholder: edges,
            self.intrinsics_placeholder: self.intrinsics
        }

        flo_graph, vis_graph, flow = self.sess.run(self.outputs['visibility'],
                                                   feed_dict=feed_dict)
        flo_matrix = flo_graph.reshape(num, num)
        vis_matrix = vis_graph.reshape(num, num)
        return flo_matrix, vis_matrix, flow

    def reproject_depth(self, query_pose):
        """ Use depth estimates and poses to estimate depth map at a new camera location """
        poses = np.concatenate([self.poses, query_pose[np.newaxis]], axis=0)
        feed_dict = {
            self.depths_placeholder: self.depths,
            self.poses_placeholder: poses,
            self.intrinsics_placeholder: self.intrinsics
        }

        depth = self.sess.run(self.outputs['depth_reprojection'],
                              feed_dict=feed_dict)
        return fill_depth(depth)

    def deepv2d_init(self):
        if self.use_fcrn:
            if self.mode == 'keyframe':
                feed_dict = {self.images_placeholder: self.images[[0]]}
            else:
                feed_dict = {self.images_placeholder: self.images}

            self.depths = self.sess.run(self.outputs['fcrn'],
                                        feed_dict=feed_dict)

        else:
            if self.mode == 'keyframe':
                images = np.stack([self.images[0]] * self.images.shape[0],
                                  axis=0)
                poses = np.stack([np.eye(4)] * self.images.shape[0], axis=0)

                feed_dict = {
                    self.images_placeholder: images,
                    self.poses_placeholder: poses,
                    self.intrinsics_placeholder: self.intrinsics
                }

            else:
                ii = np.arange(self.images.shape[0])
                adj = np.stack([ii, ii], axis=-1)

                feed_dict = {
                    self.images_placeholder: self.images,
                    self.poses_placeholder: self.poses,
                    self.adj_placeholder: adj,
                    self.intrinsics_placeholder: self.intrinsics
                }

            self.depths = self.sess.run(self.outputs['depths'],
                                        feed_dict=feed_dict)

    def update_poses(self, itr=0):
        n = self.images.shape[0]

        if self.mode == 'keyframe':
            ii, jj = np.meshgrid(np.arange(1), np.arange(1, n))
        else:
            ii, jj = np.meshgrid(np.arange(n), np.arange(n))

        ii = ii.reshape(-1)
        jj = jj.reshape(-1)
        v = ~np.equal(ii, jj)

        # don't use pairs with self loop
        edges = np.stack([ii[v], jj[v]], axis=-1)

        feed_dict = {
            self.images_placeholder: self.images,
            self.depths_placeholder: self.depths,
            self.poses_placeholder: self.poses,
            self.edges_placeholder: edges,
            self.init_placeholder: (itr == 0),
            self.intrinsics_placeholder: self.intrinsics
        }

        # execute pose subgraph
        outputs = [
            self.outputs['poses'], self.outputs['intrinsics'],
            self.outputs['weights']
        ]
        self.poses, self.intrinsics, self.weights = self.sess.run(
            outputs, feed_dict=feed_dict)

        if not self.cfg.MOTION.IS_CALIBRATED:
            print("intrinsics (fx, fy, cx, cy): ", self.intrinsics)

    def update_depths(self, itr=0):
        n = self.images.shape[0]
        inds_list = []

        if self.mode == 'keyframe':
            feed_dict = {
                self.images_placeholder: self.images,
                self.poses_placeholder: self.poses,
                self.intrinsics_placeholder: self.intrinsics
            }

            self.depths = self.sess.run(self.outputs['depths'],
                                        feed_dict=feed_dict)

        else:
            for i in range(n):
                inds = np.arange(n).tolist()
                inds.remove(i)
                inds = [i] + inds
                inds_list.append(inds)

            adj_list = np.array(inds_list, dtype=np.int32)

            if n <= 4:
                feed_dict = {
                    self.images_placeholder: self.images,
                    self.poses_placeholder: self.poses,
                    self.adj_placeholder: adj_list,
                    self.intrinsics_placeholder: self.intrinsics
                }

                self.depths = self.sess.run(self.outputs['depths'],
                                            feed_dict=feed_dict)

            else:  # we need to split up inference to fit in memory
                s = 2
                for i in range(0, n, s):
                    feed_dict = {
                        self.images_placeholder: self.images,
                        self.poses_placeholder: self.poses,
                        self.adj_placeholder: adj_list[i:i + s],
                        self.intrinsics_placeholder: self.intrinsics
                    }

                    self.depths[i:i + s] = self.sess.run(
                        self.outputs['depths'], feed_dict=feed_dict)

    def vizualize_output(self, inds=[0]):
        feed_dict = {
            self.images_placeholder: self.images,
            self.depths_placeholder: self.depths,
            self.poses_placeholder: self.poses,
            self.intrinsics_placeholder: self.intrinsics
        }

        keyframe_image = self.images[0]
        keyframe_depth = self.depths[0]

        image_depth = vis.create_image_depth_figure(keyframe_image,
                                                    keyframe_depth)
        cv2.imwrite('depth.png', image_depth[:, image_depth.shape[1] // 2:])
        cv2.imshow('image_depth', image_depth / 255.0)

        print("Press any key to cotinue")
        cv2.waitKey()

        # use depth map to create point cloud
        point_cloud, point_colors = self.sess.run(self.outputs['point_cloud'],
                                                  feed_dict=feed_dict)

        print("Press q to exit")
        vis.visualize_prediction(point_cloud, point_colors, self.poses)

    def __call__(self, images, intrinsics=None, iters=5, viz=False):
        n_frames = len(images)
        self.images = np.stack(images, axis=0)

        if intrinsics is None:
            # initialize intrinsics from the image dimensions
            fx = self.images.shape[2] * 1.2
            fy = self.images.shape[2] * 1.2
            cx = self.images.shape[2] / 2.0
            cy = self.images.shape[1] / 2.0
            intrinsics = np.stack([fx, fy, cx, cy])

        # (fx, fy, cx, cy)
        self.intrinsics = intrinsics

        poses = np.eye(4).reshape(1, 4, 4)
        poses = np.tile(poses, [n_frames, 1, 1])
        self.poses = poses

        # initialize reconstruction
        self.deepv2d_init()

        for i in range(iters):
            self.update_poses(i)
            self.update_depths()

        if viz:
            self.vizualize_output()

        return self.depths, self.poses
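
A hypothetical end-to-end driver for the class above, showing the intended calling convention: build the graph, attach a session, then call the object on a list of frames. The config loader, the checkpoint path, and the zero-filled frames are assumptions used only for illustration.

import numpy as np
import tensorflow as tf

cfg = load_config('cfgs/nyu.yaml')   # assumed config helper; the real cfg supplies INPUT/MOTION/STRUCTURE
deepv2d = DeepV2D(cfg, 'models/nyu.ckpt', is_calibrated=True, mode='keyframe')

frames, ht, wd = 5, cfg.INPUT.HEIGHT, cfg.INPUT.WIDTH
images = [np.zeros((ht, wd, 3), dtype=np.float32) for _ in range(frames)]
intrinsics = np.array([500.0, 500.0, wd / 2.0, ht / 2.0], dtype=np.float32)  # fx, fy, cx, cy

with tf.Session() as sess:
    deepv2d.set_session(sess)                    # restores weights from the checkpoint
    depths, poses = deepv2d(images, intrinsics)  # alternating pose / depth updates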
Example #4
class DeepV2DSLAM:
    def __init__(self,
                 cfg,
                 ckpt,
                 n_keyframes=1,
                 rate=2,
                 use_fcrn=True,
                 viz=True,
                 mode='global',
                 image_dims=[None, 480, 640]):

        self.cfg = cfg
        self.ckpt = ckpt

        self.viz = viz
        self.mode = mode
        self.use_fcrn = use_fcrn
        self.image_dims = image_dims

        self.index = 0
        self.keyframe_inds = []

        self.images = []
        self.depths = []
        self.poses = []

        # tracking config parameters
        self.n_keyframes = n_keyframes  # number of keyframes to use
        self.rate = rate  # how often to sample new frames
        self.window = 3  # add edges if frames are within distance

        # build tensorflow graphs
        self.outputs = {}
        self._create_placeholders()
        self._build_motion_graph()
        self._build_depth_graph()
        self._build_reprojection_graph()
        self._build_visibility_graph()
        self._build_point_cloud_graph()

        if self.use_fcrn:
            self._build_fcrn_graph()

        self.saver = tf.train.Saver(tf.model_variables())

    def set_session(self, sess):
        self.sess = sess
        self.saver.restore(self.sess, self.ckpt)

        if self.use_fcrn:
            fcrn_vars = {}
            for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="FCRN"):
                name = var.name.replace('FCRN/', '').replace(':0', '')
                fcrn_vars[name] = var

            fcrn_saver = tf.train.Saver(fcrn_vars)
            fcrn_saver.restore(sess, 'models/NYU_FCRN.ckpt')

    def start_visualization(self,
                            cinematic=False,
                            render_path=None,
                            clear_points=False):
        """ Start interactive slam visualization in seperate process """

        # new points and poses get added to the queue
        self.queue = Queue()
        self.vis_counter = 0

        self.viz = vis.InteractiveViz(self.queue, cinematic, render_path,
                                      clear_points)
        self.viz.start()

    def _create_placeholders(self):
        frames, ht, wd = self.image_dims
        self.images_placeholder = tf.placeholder(tf.float32,
                                                 [frames, ht, wd, 3])
        if self.mode == 'keyframe':
            self.depths_placeholder = tf.placeholder(tf.float32, [1, ht, wd])
        else:
            self.depths_placeholder = tf.placeholder(tf.float32,
                                                     [frames, ht, wd])

        self.poses_placeholder = tf.placeholder(tf.float32, [frames, 4, 4])
        self.intrinsics_placeholder = tf.placeholder(tf.float32, [4])

        # placeholders for storing graph adj_list and edges
        self.edges_placeholder = tf.placeholder(tf.int32, [None, 2])
        self.adj_placeholder = tf.placeholder(tf.int32, [None, None])
        self.fixed_placeholder = tf.placeholder(tf.int32, [])
        self.init_placeholder = tf.placeholder(tf.bool, [])

    def _build_motion_graph(self):
        """ Motion graph updates poses using depth as input """

        self.motion_net = MotionNetwork(
            self.cfg.MOTION,
            mode='global',  # use global optimization mode
            is_training=False)

        images = self.images_placeholder[tf.newaxis]
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        edge_inds = tf.unstack(self.edges_placeholder, num=2, axis=-1)

        # convert pose matrices into SE3 object
        Ts = VideoSE3Transformation(matrix=poses)
        batch, num = Ts.shape()

        Ts, intrinsics = self.motion_net.forward(
            Ts,
            images,
            depths,
            intrinsics,
            inds=edge_inds,
            num_fixed=self.fixed_placeholder)

        # convert SE3 object back to matrix representation
        self.outputs['poses'] = tf.squeeze(Ts.matrix(), 0)
        self.outputs['intrinsics'] = intrinsics

    def _build_depth_graph(self):
        """ Depth graph updates depth using poses as input """
        self.depth_net = DepthNetwork(self.cfg.STRUCTURE, is_training=False)
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        Ts = VideoSE3Transformation(matrix=poses)

        adj_list = None
        if self.mode == 'global':
            adj_list = self.adj_placeholder

        depths = self.depth_net.forward(Ts, images, intrinsics, adj_list)
        self.outputs['depths'] = depths

    def _build_visibility_graph(self):
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        Ts = VideoSE3Transformation(matrix=poses)
        ii, jj = tf.unstack(self.edges_placeholder, num=2, axis=-1)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths, intrinsics = rescale_depths_and_intrinsics(depths,
                                                           intrinsics,
                                                           downscale=4)
        ht = tf.cast(tf.shape(depths)[2], tf.float32)
        wd = tf.cast(tf.shape(depths)[3], tf.float32)

        depths = tf.gather(depths, ii, axis=1)
        Tij = Ts.gather(jj) * Ts.gather(ii).inv()

        flow = Tij.induced_flow(depths, intrinsics)
        coords = Tij.transform(depths, intrinsics)

        # translation-only flow (note: rotation_mask is currently unused here, so
        # flow_translation is computed the same way as the full induced flow above)
        rotation_mask = [1.0, 1.0, 1.0, 0.0, 0.0, 0.0]
        flow_translation = Tij.induced_flow(depths, intrinsics)

        flo_graph = tf.sqrt(tf.reduce_sum(flow**2, axis=-1))
        flo_graph = tf.reduce_mean(flo_graph, [-1, -2])

        pos_graph = tf.sqrt(tf.reduce_sum(flow_translation**2, axis=-1))
        pos_graph = tf.reduce_mean(pos_graph, [-1, -2])

        contained = tf.to_float((coords[..., 0] > 0.0) & (coords[..., 0] < wd)
                                & (coords[..., 1] > 0.0)
                                & (coords[..., 1] < ht))

        vis_graph = tf.reduce_mean(contained, [-1, -2])
        self.outputs['visibility'] = (flo_graph[0], vis_graph[0])

    def _build_fcrn_graph(self):
        """ Build single image initializion graph"""
        images = self.images_placeholder
        batch, ht, wd, _ = tf.unstack(tf.shape(images), num=4)

        with tf.variable_scope("FCRN") as scope:
            # crop out border and flip color channels
            fcrn_input = tf.image.resize_area(images[:, 4:-4, 6:-6, ::-1],
                                              [228, 304])
            net = fcrn.ResNet50UpProj({'data': fcrn_input}, batch, 1, False)
            fcrn_output = tf.stop_gradient(net.get_output())
            fcrn_output = tf.image.resize_bilinear(fcrn_output, [ht, wd])

        self.outputs['fcrn'] = tf.squeeze(fcrn_output, -1)

    def compute_visibility_graph(self, edges=None):
        """ Computes a matrix of optical flow and visibility between all pairs of frames 
        Ex. flo_matrix[i,j] is the mean optical flow between camera i and camera j
        Ex. vis_matrix[i,j] is the portion of points in camera i visible in camera j """

        vis_matrix = False
        if edges is None:
            num = len(self.keyframe_images)
            vis_matrix = True
            ii, jj = np.meshgrid(np.arange(num), np.arange(num))

            ii = np.reshape(ii, [-1])
            jj = np.reshape(jj, [-1])
            edges = np.stack([jj, ii], axis=-1)

        feed_dict = {
            self.depths_placeholder: np.stack(self.keyframe_depths, axis=0),
            self.poses_placeholder: np.stack(self.keyframe_poses, axis=0),
            self.edges_placeholder: edges,
            self.intrinsics_placeholder: self.intrinsics
        }

        flo_graph, pos_graph = self.sess.run(self.outputs['visibility'],
                                             feed_dict=feed_dict)
        if vis_matrix:
            flo_matrix = flo_graph.reshape(num, num)
            pos_matrix = pos_graph.reshape(num, num)
            return flo_matrix, pos_matrix

        return flo_graph, pos_graph

    def _build_point_cloud_graph(self):
        """Use poses and depth maps to create point cloud"""
        depths = self.depths_placeholder[tf.newaxis]
        images = self.images_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        depths_pad = tf.pad(depths, [[0, 0], [0, 0], [0, 1], [0, 1]],
                            "CONSTANT")

        depths_grad = \
            (depths_pad[:, :, 1:, :-1] - depths_pad[:, :, :-1, :-1])**2 + \
            (depths_pad[:, :, :-1, 1:] - depths_pad[:, :, :-1, :-1])**2

        # don't use large depths for point cloud and ignore boundary regions
        valid = (depths < 6.0) & (depths_grad < 0.05)

        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)
        Ts = VideoSE3Transformation(matrix=poses)
        X0 = projective_ops.backproject(depths, intrinsics)

        # transform point cloud into world coordinates
        X1 = Ts.inv()(X0)

        crop_h0 = 20
        crop_h1 = 12

        crop_w = 32

        X1 = X1[:, :, crop_h0:-crop_h1, crop_w:-crop_w]
        valid = valid[:, :, crop_h0:-crop_h1, crop_w:-crop_w]
        images = images[:, :, crop_h0:-crop_h1, crop_w:-crop_w, ::-1]

        X1 = tf.reshape(X1, [-1, 3])
        colors = tf.reshape(images, [-1, 3])

        valid_inds = tf.where(tf.reshape(valid, [-1]))
        valid_inds = tf.reshape(valid_inds, [-1])

        X1 = tf.gather(X1, valid_inds, axis=0)
        colors = tf.gather(colors, valid_inds, axis=0)

        self.outputs['point_cloud'] = (X1, colors)

    def _build_reprojection_graph(self):
        """ Used to project depth from keyframes onto new frame """
        EPS = 1e-8
        depths = self.depths_placeholder[tf.newaxis]
        poses = self.poses_placeholder[tf.newaxis]
        intrinsics = self.intrinsics_placeholder[tf.newaxis]

        batch, num, ht, wd = tf.unstack(tf.shape(depths), num=4)
        Ts = VideoSE3Transformation(matrix=poses)
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        ii, jj = tf.meshgrid(tf.range(0, num), tf.range(num, num + 1))
        ii = tf.reshape(ii, [-1])
        jj = tf.reshape(jj, [-1])

        Tij = Ts.gather(jj) * Ts.gather(ii).inv()
        X0 = projective_ops.backproject(depths, intrinsics)
        X1 = Tij(X0)

        coords = projective_ops.project(X1, intrinsics)
        depths = X1[..., 2]

        indicies = tf.cast(coords[..., ::-1] + .5, tf.int32)
        indicies = tf.reshape(indicies, [-1, 2])
        depths = tf.reshape(depths, [-1])

        depth = tf.scatter_nd(indicies, depths, [ht, wd])
        count = tf.scatter_nd(indicies, tf.ones_like(depths), [ht, wd])

        depth = depth / (count + EPS)
        self.outputs['depth_reprojection'] = depth

    def reproject_depth(self, query_pose, margin=2):
        """ Use depth estimates and poses to estimate depth map at a new camera location """

        keyframe_pose = self.poses[self.keyframe_inds[-1]]
        poses = np.stack([keyframe_pose, query_pose], axis=0)

        keyframe_depth = self.depths[self.keyframe_inds[-1]]
        depths = keyframe_depth[np.newaxis]

        feed_dict = {
            self.depths_placeholder: depths,
            self.poses_placeholder: poses,
            self.intrinsics_placeholder: self.intrinsics
        }

        depth = self.sess.run(self.outputs['depth_reprojection'],
                              feed_dict=feed_dict)
        return fill_depth(depth)

    def deepv2d_init(self):
        if self.use_fcrn:
            feed_dict = {
                self.images_placeholder: np.stack(self.images, axis=0)
            }
            depths_init = self.sess.run(self.outputs['fcrn'],
                                        feed_dict=feed_dict)

        else:
            ii = np.arange(len(self.images))
            adj = np.stack([ii, ii], axis=-1)

            feed_dict = {
                self.images_placeholder: np.stack(self.images, axis=0),
                self.poses_placeholder: np.stack(self.poses, axis=0),
                self.adj_placeholder: adj,
                self.intrinsics_placeholder: self.intrinsics
            }

            depths_init = self.sess.run(self.outputs['depths'],
                                        feed_dict=feed_dict)

        self.depths = [depth for depth in depths_init]

    def update_poses(self, fixed=1, margin=3):
        """ Update the poses by executing the motion graph, fix first keyframe """

        n_images = len(self.images)
        start_idx = max(self.keyframe_inds[0] - margin, 0)

        edges = []
        for i in self.keyframe_inds:
            for j in range(start_idx, n_images):
                if (i != j) and (abs(i - j) <= self.window):
                    edges.append((i, j))

        edges = np.stack(edges, axis=0) - start_idx
        images = np.stack(self.images[start_idx:], axis=0)
        depths = np.stack(self.depths[start_idx:], axis=0)
        poses = np.stack(self.poses[start_idx:], axis=0)

        if not fixed:
            fixed = 0

        feed_dict = {
            self.images_placeholder: images,
            self.depths_placeholder: depths,
            self.poses_placeholder: poses,
            self.edges_placeholder: edges,
            self.fixed_placeholder: np.int32(fixed),
            self.init_placeholder: False,
            self.intrinsics_placeholder: self.intrinsics
        }

        # execute pose subgraph
        poses = self.sess.run(self.outputs['poses'], feed_dict=feed_dict)

        # update the poses
        for j in range(poses.shape[0]):
            self.poses[start_idx + j] = poses[j]

        self.pose_cur = self.poses[-1]

    def update_depths(self, fixed=1, margin=3):
        """ Update the depths by executing the depth graph """

        n_images = len(self.images)
        start_idx = max(self.keyframe_inds[0] - margin, 0)

        # faster if we batch multiple depth updates together
        inds = self.keyframe_inds
        if fixed and len(self.keyframe_inds) > 1:
            inds = inds[fixed:]  # fix depth for first keyframe

        adj_list = []
        for i in inds:
            adj_inds = []
            for j in range(start_idx, n_images):
                if (i != j) and (abs(i - j) <= self.window):
                    adj_inds.append(j)

            # make sure all adj lists are the same size
            if len(adj_inds) < 2 * self.window:
                adj_inds = np.random.choice(adj_inds,
                                            2 * self.window,
                                            replace=True).tolist()

            adj_inds = [i] + adj_inds
            adj_list.append(np.array(adj_inds, dtype=np.int32))

        adj_list = np.stack(adj_list, axis=0) - start_idx
        images = np.stack(self.images[start_idx:], axis=0)
        poses = np.stack(self.poses[start_idx:], axis=0)

        feed_dict = {
            self.images_placeholder: images,
            self.poses_placeholder: poses,
            self.adj_placeholder: adj_list,
            self.intrinsics_placeholder: self.intrinsics,
        }

        depths = self.sess.run(self.outputs['depths'], feed_dict=feed_dict)

        # update the keyframe depths
        for i, keyframe_index in enumerate(inds):
            self.depths[keyframe_index] = depths[i]

    def visualize_output(self, keyframe_index):
        """ Backproject a point cloud then add point cloud to visualization """

        self.vis_counter += 1
        keyframe_image = self.images[keyframe_index]
        keyframe_depth = self.depths[keyframe_index]
        keyframe_pose = self.poses[keyframe_index]

        feed_dict = {
            self.images_placeholder: keyframe_image[np.newaxis],
            self.depths_placeholder: keyframe_depth[np.newaxis],
            self.poses_placeholder: keyframe_pose[np.newaxis],
            self.intrinsics_placeholder: self.intrinsics
        }

        keyframe_point_cloud, keyframe_point_colors = \
            self.sess.run(self.outputs['point_cloud'], feed_dict=feed_dict)

        pointcloud = (keyframe_point_cloud, keyframe_point_colors)

        # only add the point cloud once every 4 frames
        if self.vis_counter % 4 == 0:
            self.queue.put((pointcloud, keyframe_pose))

        else:
            self.queue.put((None, keyframe_pose))

    def display_keyframes(self):
        """ display image / depth keyframe pairs """

        if len(self.keyframe_inds) > 0:
            image_stack = []
            for keyframe_index in self.keyframe_inds:
                keyframe_image = self.images[keyframe_index]
                keyframe_depth = self.depths[keyframe_index]

                image_and_depth = vis.create_image_depth_figure(
                    keyframe_image, keyframe_depth)
                image_stack.append(image_and_depth)

            image_stack = np.concatenate(image_stack, axis=0)
            if len(self.keyframe_inds) > 1:
                image_stack = cv2.resize(image_stack, None, fx=0.5, fy=0.5)

            cv2.imshow('keyframes', image_stack / 255.0)
            cv2.waitKey(10)

    def track(self, image):
        """ track the new frame """

        keyframe_image = self.images[self.keyframe_inds[-1]]
        images = np.stack([keyframe_image, image], axis=0)

        keyframe_pose = self.poses[self.keyframe_inds[-1]]
        poses = np.stack([keyframe_pose, self.pose_cur], axis=0)

        keyframe_depth = self.depths[self.keyframe_inds[-1]]
        depths = keyframe_depth[np.newaxis]

        edges = np.array([[0, 1]], dtype=np.int32)
        fixed = np.int32(0)

        feed_dict = {
            self.images_placeholder: images,
            self.depths_placeholder: depths,
            self.poses_placeholder: poses,
            self.edges_placeholder: edges,
            self.fixed_placeholder: fixed,
            self.init_placeholder: False,
            self.intrinsics_placeholder: self.intrinsics
        }

        updated_poses = self.sess.run(self.outputs['poses'],
                                      feed_dict=feed_dict)

        # relative pose between keyframe and new pose
        dP = np.matmul(updated_poses[1], np.linalg.inv(updated_poses[0]))

        # tracking probably lost, attempt recovery; sometimes caused by gaps between frames
        if pose_distance(dP) > 0.8:
            feed_dict = {
                self.images_placeholder: images,
                self.depths_placeholder: depths,
                self.poses_placeholder: poses,
                self.edges_placeholder: edges,
                self.fixed_placeholder: fixed,
                self.init_placeholder: True,
                self.intrinsics_placeholder: self.intrinsics
            }

            updated_poses = self.sess.run(self.outputs['poses'],
                                          feed_dict=feed_dict)
            dP = np.matmul(updated_poses[1], np.linalg.inv(updated_poses[0]))

        self.pose_cur = np.matmul(dP, keyframe_pose)
        return pose_distance(dP)

    def __call__(self, image, intrinsics=None):

        if intrinsics is not None:
            self.intrinsics = intrinsics

        ht, wd, _ = image.shape  # get image dimensions
        did_make_new_keyframe = False

        if len(self.images) < 4:  # tracking has not yet begun
            if self.index % self.rate == 0:
                self.images.append(image)
                self.depths.append(np.ones((ht, wd)))
                self.poses.append(np.eye(4))

            # initialize the tracker !
            if len(self.images) == 4:
                self.deepv2d_init()

                # set the keyframes
                self.keyframe_inds = np.random.randint(0, 4, self.n_keyframes)
                self.keyframe_inds = sorted(self.keyframe_inds.tolist())

                for i in range(3):
                    self.update_poses(fixed=False)
                    self.update_depths(fixed=False)

        else:
            dist = self.track(image)

            if dist > 0.8:
                new_keyframe_index = len(self.images) - 1
                query_pose = self.poses[new_keyframe_index]

                depth_new = self.reproject_depth(query_pose)
                self.depths[new_keyframe_index] = depth_new

                self.keyframe_inds.append(new_keyframe_index)
                if len(self.keyframe_inds) > self.n_keyframes:
                    old_keyframe_index = self.keyframe_inds.pop(0)
                    self.visualize_output(old_keyframe_index)

                self.update_poses(fixed=2)
                self.update_depths()

            if self.index % self.rate == 0 and (dist > 0.1):
                self.images.append(image)
                self.depths.append(np.ones((ht, wd)))
                self.poses.append(self.pose_cur)

                self.update_poses(fixed=2)
                self.update_depths()

            # make a new keyframe
            if len(self.images) - self.keyframe_inds[-1] >= self.window:
                new_keyframe_index = self.keyframe_inds[-1] + 2
                query_pose = self.poses[new_keyframe_index]

                depth_new = self.reproject_depth(query_pose)
                self.depths[new_keyframe_index] = depth_new

                self.keyframe_inds.append(new_keyframe_index)
                if len(self.keyframe_inds) > self.n_keyframes:
                    old_keyframe_index = self.keyframe_inds.pop(0)
                    self.visualize_output(old_keyframe_index)

                self.update_poses(fixed=2)
                self.update_depths()

        self.display_keyframes()
        self.index += 1
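
A hypothetical streaming driver for the SLAM class above. The config helper, checkpoint path, and frame source are assumptions; the loop simply shows that each incoming frame is passed to __call__, which handles tracking, keyframe creation, and pose/depth refinement internally.

import numpy as np
import tensorflow as tf

cfg = load_config('cfgs/nyu.yaml')   # assumed config helper
slam = DeepV2DSLAM(cfg, 'models/nyu.ckpt', n_keyframes=2, use_fcrn=True)
intrinsics = np.array([500.0, 500.0, 320.0, 240.0], dtype=np.float32)  # fx, fy, cx, cy

with tf.Session() as sess:
    slam.set_session(sess)        # restores DeepV2D (and FCRN) weights
    slam.start_visualization()    # interactive point-cloud viewer in a separate process

    for image in frame_stream:    # assumed iterator over 480x640x3 frames
        slam(image, intrinsics)   # track, add keyframes, refine poses and depths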
Example #5
    def build_train_graph_stage2(self, cfg, num_gpus=1):

        with tf.name_scope("training_schedule"):
            global_step = tf.Variable(0, name='global_step', trainable=False)
            gs = tf.to_float(global_step)
            if cfg.TRAIN.RENORM:
                rmax = tf.clip_by_value(5.0 * (gs / 2.5e4) + 1.0, 1.0,
                                        5.0)  # rmax schedule
                dmax = tf.clip_by_value(8.0 * (gs / 2.5e4), 0.0,
                                        8.0)  # dmax schedule
                rmin = 1.0 / rmax
                schedule = {'rmax': rmax, 'rmin': rmin, 'dmax': dmax}
            else:
                schedule = None

            LR_DECAY = int(0.8 * self.training_steps)
            lr = tf.train.exponential_decay(cfg.TRAIN.LR,
                                            global_step,
                                            LR_DECAY,
                                            0.2,
                                            staircase=True)

            stereo_optim = tf.train.RMSPropOptimizer(lr)
            motion_optim = tf.train.RMSPropOptimizer(MOTION_LR_FRACTION * lr)

        id_batch, images_batch, poses_batch, gt_batch, filled_batch, \
            pred_batch, intrinsics_batch = self.dl.next()
        images_batch = tf.split(images_batch, num_gpus)
        poses_batch = tf.split(poses_batch, num_gpus)
        gt_batch = tf.split(gt_batch, num_gpus)
        filled_batch = tf.split(filled_batch, num_gpus)
        pred_batch = tf.split(pred_batch, num_gpus)
        intrinsics_batch = tf.split(intrinsics_batch, num_gpus)

        tower_motion_grads = []
        tower_stereo_grads = []
        tower_predictions = []
        tower_losses = []
        write_ops = []

        for gpu_id in range(num_gpus):
            motion_net = MotionNetwork(cfg.MOTION, reuse=gpu_id > 0)
            depth_net = DepthNetwork(cfg.STRUCTURE,
                                     schedule=schedule,
                                     reuse=gpu_id > 0)

            images = images_batch[gpu_id]
            poses = poses_batch[gpu_id]
            depth_gt = gt_batch[gpu_id]
            depth_filled = filled_batch[gpu_id]
            depth_pred = pred_batch[gpu_id]
            intrinsics = intrinsics_batch[gpu_id]

            Gs = VideoSE3Transformation(matrix=poses)
            batch, frames, height, width, _ = images.get_shape().as_list()

            with tf.name_scope("depth_input"):
                input_prob = tf.train.exponential_decay(2.0,
                                                        global_step,
                                                        LR_DECAY,
                                                        0.02,
                                                        staircase=False)
                rnd = tf.random_uniform([], 0, 1)
                depth_input = tf.cond(rnd < input_prob, lambda: depth_filled,
                                      lambda: depth_pred)

            with tf.device('/gpu:%d' % gpu_id):

                # motion inference
                Ts, kvec = motion_net.forward(None, images,
                                              depth_input[:, tf.newaxis],
                                              intrinsics)

                stop_cond = global_step < cfg.TRAIN.GT_POSE_ITERS
                Ts = cond_transform(stop_cond, Ts.copy(stop_gradients=True),
                                    Ts)
                kvec = tf.cond(stop_cond, lambda: tf.stop_gradient(kvec),
                               lambda: kvec)

                # depth inference
                depth_pr = depth_net.forward(Ts, images, kvec)

                depth_loss = depth_net.compute_loss(depth_gt,
                                                    log_error=(gpu_id == 0))
                motion_loss = motion_net.compute_loss(
                    Gs, depth_filled[:, tf.newaxis], intrinsics,
                    log_error=(gpu_id == 0))

                # compute all gradients jointly (the split backward pass below is disabled)
                if True:
                    total_loss = cfg.TRAIN.DEPTH_WEIGHT * depth_loss + motion_loss
                    var_list = tf.trainable_variables()
                    grads = gradients(total_loss, var_list)

                # split backward pass
                else:
                    motion_vars = tf.get_collection(
                        tf.GraphKeys.MODEL_VARIABLES, scope="motion")
                    stereo_vars = tf.get_collection(
                        tf.GraphKeys.MODEL_VARIABLES, scope="stereo")

                    so3, translation = Ts.so3, Ts.translation
                    stereo_grads = gradients(depth_loss,
                                             [so3, translation] + stereo_vars)
                    diff_so3, diff_translation, stereo_grads = \
                        stereo_grads[0], stereo_grads[1], stereo_grads[2:]

                    motion_grads = tf.gradients(
                        [motion_loss, so3, translation],
                        motion_vars,
                        grad_ys=[
                            tf.ones_like(motion_loss), diff_so3,
                            diff_translation
                        ])

                    grads = stereo_grads + motion_grads
                    var_list = stereo_vars + motion_vars

                motion_gvs = []
                stereo_gvs = []

                for (g, v) in zip(grads, var_list):
                    if 'stereo' in v.name and (g is not None):
                        if cfg.TRAIN.CLIP_GRADS:
                            g = tf.clip_by_value(g, -1.0, 1.0)
                        stereo_gvs.append((g, v))

                    if 'motion' in v.name and (g is not None):
                        if cfg.TRAIN.CLIP_GRADS:
                            g = tf.clip_by_value(g, -1.0, 1.0)
                        motion_gvs.append((g, v))

                tower_motion_grads.append(motion_gvs)
                tower_stereo_grads.append(stereo_gvs)

                tower_predictions.append(depth_pr)
                tower_losses.append(depth_loss)

                if gpu_id == 0:
                    self.total_loss = depth_loss

                # use last gpu to compute batch norm statistics
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        tower_motion_gvs = average_gradients(tower_motion_grads)
        tower_stereo_gvs = average_gradients(tower_stereo_grads)

        with tf.name_scope("train_op"):
            with tf.control_dependencies(update_ops):
                self.train_op = tf.group(
                    stereo_optim.apply_gradients(tower_stereo_gvs),
                    motion_optim.apply_gradients(tower_motion_gvs),
                    tf.assign(global_step, global_step + 1))

        self.write_op = self.dl.write(id_batch,
                                      tf.concat(tower_predictions, axis=0))
        self.total_loss = tf.reduce_mean(tf.stack(tower_losses, axis=0))

        tf.summary.scalar("total_loss", self.total_loss)
        tf.summary.scalar("learning_rate", lr)
        tf.summary.scalar("input_prob", input_prob)
Example #6
    def build_train_graph_stage1(self, cfg, num_gpus=1):

        id_batch, images_batch, poses_batch, gt_batch, filled_batch, \
            pred_batch, intrinsics_batch = self.dl.next()
        images_batch = tf.split(images_batch, num_gpus)
        poses_batch = tf.split(poses_batch, num_gpus)
        gt_batch = tf.split(gt_batch, num_gpus)
        filled_batch = tf.split(filled_batch, num_gpus)
        pred_batch = tf.split(pred_batch, num_gpus)
        intrinsics_batch = tf.split(intrinsics_batch, num_gpus)

        with tf.name_scope("training_schedule"):
            global_step = tf.Variable(0, name='global_step', trainable=False)
            lr = tf.train.exponential_decay(cfg.TRAIN.LR,
                                            global_step,
                                            5000,
                                            0.5,
                                            staircase=True)
            optim = tf.train.RMSPropOptimizer(MOTION_LR_FRACTION * lr)

        tower_grads = []
        tower_losses = []

        for gpu_id in range(num_gpus):
            images = images_batch[gpu_id]
            poses = poses_batch[gpu_id]
            depth_gt = gt_batch[gpu_id]
            depth_filled = filled_batch[gpu_id]
            depth_pred = pred_batch[gpu_id]
            intrinsics = intrinsics_batch[gpu_id]

            Gs = VideoSE3Transformation(matrix=poses)
            motion_net = MotionNetwork(cfg.MOTION,
                                       bn_is_training=True,
                                       reuse=gpu_id > 0)

            with tf.device('/gpu:%d' % gpu_id):

                depth_input = tf.expand_dims(depth_filled, 1)
                Ts, kvec = motion_net.forward(None, images, depth_input,
                                              intrinsics)

                total_loss = motion_net.compute_loss(Gs,
                                                     depth_input,
                                                     intrinsics,
                                                     log_error=(gpu_id == 0))
                tower_losses.append(total_loss)

                var_list = tf.trainable_variables()
                grads = gradients(total_loss, var_list)

                gvs = []
                for (g, v) in zip(grads, var_list):
                    if g is not None:
                        if cfg.TRAIN.CLIP_GRADS:
                            g = tf.clip_by_value(g, -1.0, 1.0)
                        gvs.append((g, v))

                tower_grads.append(gvs)

                # use last gpu to compute batch norm statistics
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.name_scope("train_op"):
            gvs = average_gradients(tower_grads)
            total_loss = tf.reduce_mean(tf.stack(tower_losses, axis=0))

            with tf.control_dependencies(update_ops):
                self.train_op = optim.apply_gradients(gvs, global_step)

            self.write_op = None
            self.total_loss = total_loss
            tf.summary.scalar("learning_rate", lr)
            tf.summary.scalar("total_loss", total_loss)