Example #1
  def build_loss(self):
    """Adds ops for computing loss."""
    with tf.name_scope('compute_loss'):
      self.reconstr_loss = 0
      self.smooth_loss = 0
      self.ssim_loss = 0
      self.icp_transform_loss = 0
      self.icp_residual_loss = 0

      # self.images is organized by ...[scale][B, h, w, seq_len * 3].
      self.images = [None for _ in range(NUM_SCALES)]
      # Following nested lists are organized by ...[scale][source-target].
      self.warped_image = [{} for _ in range(NUM_SCALES)]
      self.warp_mask = [{} for _ in range(NUM_SCALES)]
      self.warp_error = [{} for _ in range(NUM_SCALES)]
      self.ssim_error = [{} for _ in range(NUM_SCALES)]
      self.icp_transform = [{} for _ in range(NUM_SCALES)]
      self.icp_residual = [{} for _ in range(NUM_SCALES)]

      self.middle_frame_index = util.get_seq_middle(self.seq_length)

      # Compute losses at each scale.
      for s in range(NUM_SCALES):
        # Scale image stack.
        if s == 0:  # Just as a precaution. TF often has interpolation bugs.
          self.images[s] = self.image_stack
        else:
          height_s = int(self.img_height / (2**s))
          width_s = int(self.img_width / (2**s))
          self.images[s] = tf.image.resize_bilinear(
              self.image_stack, [height_s, width_s], align_corners=True)

        # Smoothness.
        if self.smooth_weight > 0:
          for i in range(self.seq_length):
            # When computing minimum loss, use the depth map from the middle
            # frame only.
            if not self.compute_minimum_loss or i == self.middle_frame_index:
              disp_smoothing = self.disp[i][s]
              if self.depth_normalization:
                # Perform depth normalization, dividing by the mean.
                mean_disp = tf.reduce_mean(disp_smoothing, axis=[1, 2, 3],
                                           keep_dims=True)
                disp_input = disp_smoothing / mean_disp
              else:
                disp_input = disp_smoothing
              scaling_f = (1.0 if self.equal_weighting else 1.0 / (2**s))
              self.smooth_loss += scaling_f * self.depth_smoothness(
                  disp_input, self.images[s][:, :, :, 3 * i:3 * (i + 1)])

        self.debug_all_warped_image_batches = []
        for i in range(self.seq_length):
          for j in range(self.seq_length):
            if i == j:
              continue

            # When computing minimum loss, only consider the middle frame as
            # target.
            if self.compute_minimum_loss and j != self.middle_frame_index:
              continue
            # We only consider adjacent frames, unless either
            # compute_minimum_loss is on (where the middle frame is matched with
            # all other frames) or exhaustive_mode is on (where all frames are
            # matched with each other).
            if (not self.compute_minimum_loss and not self.exhaustive_mode and
                abs(i - j) != 1):
              continue

            selected_scale = 0 if self.depth_upsampling else s
            source = self.images[selected_scale][:, :, :, 3 * i:3 * (i + 1)]
            target = self.images[selected_scale][:, :, :, 3 * j:3 * (j + 1)]

            if self.depth_upsampling:
              target_depth = self.depth_upsampled[j][s]
            else:
              target_depth = self.depth[j][s]

            key = '%d-%d' % (i, j)

            if self.handle_motion:
              # self.seg_stack of shape (B, H, W, 9).
              # target_depth corresponds to middle frame, of shape (B, H, W, 1).

              # Now incorporate the other warping results, performed according
              # to the object motion network's predictions.
              # self.object_masks batch_size elements of (N, H, W, 9).
              # self.object_masks_warped batch_size elements of (N, H, W, 9).
              # self.object_transforms batch_size elements of (N, 2, 6).
              self.all_batches = []
              for batch_s in range(self.batch_size):
                # To warp i into j, first take the base warping (this is the
                # full image i warped into j using only the egomotion estimate).
                base_warping = self.warped_seq[s][i][batch_s]
                transform_matrices_thisbatch = tf.map_fn(
                    lambda transform: project.get_transform_mat(
                        tf.expand_dims(transform, axis=0), i, j)[0],
                    self.object_transforms[0][batch_s])

                def inverse_warp_wrapper(matrix):
                  """Wrapper for inverse warping method."""
                  warp_image, _ = (
                      project.inverse_warp(
                          tf.expand_dims(base_warping, axis=0),
                          tf.expand_dims(target_depth[batch_s], axis=0),
                          tf.expand_dims(matrix, axis=0),
                          tf.expand_dims(self.intrinsic_mat[
                              batch_s, selected_scale, :, :], axis=0),
                          tf.expand_dims(self.intrinsic_mat_inv[
                              batch_s, selected_scale, :, :], axis=0)))
                  return warp_image
                warped_images_thisbatch = tf.map_fn(
                    inverse_warp_wrapper, transform_matrices_thisbatch,
                    dtype=tf.float32)
                warped_images_thisbatch = warped_images_thisbatch[:, 0, :, :, :]
                # warped_images_thisbatch is now of shape (N, H, W, 9).

                # Combine warped frames into a single one, using the object
                # masks. Result should be (1, 128, 416, 3).
                # Essentially, we here want to sum them all up, filtered by the
                # respective object masks.
                mask_base_valid_source = tf.equal(
                    self.seg_stack[batch_s, :, :, i*3:(i+1)*3],
                    tf.constant(0, dtype=tf.uint8))
                mask_base_valid_target = tf.equal(
                    self.seg_stack[batch_s, :, :, j*3:(j+1)*3],
                    tf.constant(0, dtype=tf.uint8))
                mask_valid = tf.logical_and(
                    mask_base_valid_source, mask_base_valid_target)
                self.base_warping = base_warping * tf.to_float(mask_valid)
                background = tf.expand_dims(self.base_warping, axis=0)
                def construct_const_filter_tensor(obj_id):
                  return tf.fill(
                      dims=[self.img_height, self.img_width, 3],
                      value=tf.sign(obj_id)) * tf.to_float(
                          tf.equal(self.seg_stack[batch_s, :, :, 3:6],
                                   tf.cast(obj_id, dtype=tf.uint8)))
                filter_tensor = tf.map_fn(
                    construct_const_filter_tensor,
                    tf.to_float(self.object_ids[s][batch_s]))
                filter_tensor = tf.stack(filter_tensor, axis=0)
                objects_to_add = tf.reduce_sum(
                    tf.multiply(warped_images_thisbatch, filter_tensor),
                    axis=0, keepdims=True)
                combined = background + objects_to_add
                self.all_batches.append(combined)
              # Now of shape (B, 128, 416, 3).
              self.warped_image[s][key] = tf.concat(self.all_batches, axis=0)

            else:
              # Don't handle motion, classic model formulation.
              egomotion_mat_i_j = project.get_transform_mat(
                  self.egomotion, i, j)
              # Inverse warp the source image to the target image frame for
              # photometric consistency loss.
              self.warped_image[s][key], self.warp_mask[s][key] = (
                  project.inverse_warp(
                      source,
                      target_depth,
                      egomotion_mat_i_j,
                      self.intrinsic_mat[:, selected_scale, :, :],
                      self.intrinsic_mat_inv[:, selected_scale, :, :]))

            # Reconstruction loss.
            self.warp_error[s][key] = tf.abs(self.warped_image[s][key] - target)
            if not self.compute_minimum_loss:
              self.reconstr_loss += tf.reduce_mean(
                  self.warp_error[s][key] * self.warp_mask[s][key])
            # SSIM.
            if self.ssim_weight > 0:
              self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
                                                  target)
              # TODO(rezama): This should be min_pool2d().
              if not self.compute_minimum_loss:
                ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1,
                                            'VALID')
                self.ssim_loss += tf.reduce_mean(
                    self.ssim_error[s][key] * ssim_mask)

        # If the minimum loss should be computed, the loss calculation has been
        # postponed until here.
        if self.compute_minimum_loss:
          for frame_index in range(self.middle_frame_index):
            key1 = '%d-%d' % (frame_index, self.middle_frame_index)
            key2 = '%d-%d' % (self.seq_length - frame_index - 1,
                              self.middle_frame_index)
            logging.info('computing min error between %s and %s', key1, key2)
            min_error = tf.minimum(self.warp_error[s][key1],
                                   self.warp_error[s][key2])
            self.reconstr_loss += tf.reduce_mean(min_error)
            if self.ssim_weight > 0:  # Also compute the minimum SSIM loss.
              min_error_ssim = tf.minimum(self.ssim_error[s][key1],
                                          self.ssim_error[s][key2])
              self.ssim_loss += tf.reduce_mean(min_error_ssim)

      # Build the total loss as composed of L1 reconstruction, SSIM, smoothing
      # and object size constraint loss as appropriate.
      self.reconstr_loss *= self.reconstr_weight
      self.total_loss = self.reconstr_loss
      if self.smooth_weight > 0:
        self.smooth_loss *= self.smooth_weight
        self.total_loss += self.smooth_loss
      if self.ssim_weight > 0:
        self.ssim_loss *= self.ssim_weight
        self.total_loss += self.ssim_loss
      if self.size_constraint_weight > 0:
        self.inf_loss *= self.size_constraint_weight
        self.total_loss += self.inf_loss
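
In the compute_minimum_loss branch above, each warp of a source frame into the middle frame is paired with the warp of the mirrored source frame, and only the per-pixel minimum of the two warp errors enters the loss. A minimal plain-Python sketch of that key pairing for seq_length == 3 (get_seq_middle below is a stand-in for util.get_seq_middle and is an assumption of this sketch):

def get_seq_middle(seq_length):
  # Stand-in for util.get_seq_middle: index of the middle frame.
  return seq_length // 2

seq_length = 3
middle_frame_index = get_seq_middle(seq_length)  # 1
for frame_index in range(middle_frame_index):
  key1 = '%d-%d' % (frame_index, middle_frame_index)
  key2 = '%d-%d' % (seq_length - frame_index - 1, middle_frame_index)
  print('min(%s, %s)' % (key1, key2))  # min(0-1, 2-1)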
Example #2
    def build_loss(self):
        """Adds ops for computing loss."""
        with tf.name_scope('compute_loss'):
            self.reconstr_loss = 0
            self.smooth_loss = 0
            self.ssim_loss = 0
            self.icp_transform_loss = 0
            self.icp_residual_loss = 0

            # self.images is organized by ...[scale][B, h, w, seq_len * 3].
            self.images = [{} for _ in range(NUM_SCALES)]
            # Following nested lists are organized by ...[scale][source-target].
            self.warped_image = [{} for _ in range(NUM_SCALES)]
            self.warp_mask = [{} for _ in range(NUM_SCALES)]
            self.warp_error = [{} for _ in range(NUM_SCALES)]
            self.ssim_error = [{} for _ in range(NUM_SCALES)]
            self.icp_transform = [{} for _ in range(NUM_SCALES)]
            self.icp_residual = [{} for _ in range(NUM_SCALES)]

            self.middle_frame_index = util.get_seq_middle(self.seq_length)

            # Compute losses at each scale.
            for s in range(NUM_SCALES):
                # Scale image stack.
                height_s = int(self.img_height / (2**s))
                width_s = int(self.img_width / (2**s))
                self.images[s] = tf.image.resize_area(self.image_stack,
                                                      [height_s, width_s])

                # Smoothness.
                if self.smooth_weight > 0:
                    for i in range(self.seq_length):
                        # In legacy mode, use the depth map from the middle frame only.
                        if not self.legacy_mode or i == self.middle_frame_index:
                            self.smooth_loss += 1.0 / (
                                2**s) * self.depth_smoothness(
                                    self.disp[i][s],
                                    self.images[s][:, :, :, 3 * i:3 * (i + 1)])

                for i in range(self.seq_length):
                    for j in range(self.seq_length):
                        # Only consider adjacent frames.
                        if i == j or abs(i - j) != 1:
                            continue
                        # In legacy mode, only consider the middle frame as target.
                        if self.legacy_mode and j != self.middle_frame_index:
                            continue
                        source = self.images[s][:, :, :, 3 * i:3 * (i + 1)]
                        target = self.images[s][:, :, :, 3 * j:3 * (j + 1)]
                        target_depth = self.depth[j][s]
                        key = '%d-%d' % (i, j)

                        # Extract ego-motion from i to j
                        egomotion_index = min(i, j)
                        egomotion_mult = 1
                        if i > j:
                            # Need to inverse egomotion when going back in sequence.
                            egomotion_mult *= -1
                        # For compatibility with SfMLearner, interpret all egomotion vectors
                        # as pointing toward the middle frame.  Note that unlike SfMLearner,
                        # each vector captures the motion to/from its next frame, and not
                        # the center frame.  Although with seq_length == 3, there is no
                        # difference.
                        if self.legacy_mode:
                            if egomotion_index >= self.middle_frame_index:
                                egomotion_mult *= -1
                        egomotion = egomotion_mult * self.egomotion[:,
                                                                    egomotion_index, :]

                        # Inverse warp the source image to the target image frame for
                        # photometric consistency loss.
                        self.warped_image[s][key], self.warp_mask[s][key] = (
                            project.inverse_warp(
                                source, target_depth, egomotion,
                                self.intrinsic_mat[:, s, :, :],
                                self.intrinsic_mat_inv[:, s, :, :]))

                        # Reconstruction loss.
                        self.warp_error[s][key] = tf.abs(
                            self.warped_image[s][key] - target)
                        self.reconstr_loss += tf.reduce_mean(
                            self.warp_error[s][key] * self.warp_mask[s][key])
                        # SSIM.
                        if self.ssim_weight > 0:
                            self.ssim_error[s][key] = self.ssim(
                                self.warped_image[s][key], target)
                            # TODO(rezama): This should be min_pool2d().
                            ssim_mask = slim.avg_pool2d(
                                self.warp_mask[s][key], 3, 1, 'VALID')
                            self.ssim_loss += tf.reduce_mean(
                                self.ssim_error[s][key] * ssim_mask)
                        # 3D loss.
                        if self.icp_weight > 0:
                            cloud_a = self.cloud[j][s]
                            cloud_b = self.cloud[i][s]
                            self.icp_transform[s][key], self.icp_residual[s][
                                key] = icp(cloud_a, egomotion, cloud_b)
                            self.icp_transform_loss += 1.0 / (
                                2**s) * tf.reduce_mean(
                                    tf.abs(self.icp_transform[s][key]))
                            self.icp_residual_loss += 1.0 / (
                                2**s) * tf.reduce_mean(
                                    tf.abs(self.icp_residual[s][key]))

            self.total_loss = self.reconstr_weight * self.reconstr_loss
            if self.smooth_weight > 0:
                self.total_loss += self.smooth_weight * self.smooth_loss
            if self.ssim_weight > 0:
                self.total_loss += self.ssim_weight * self.ssim_loss
            if self.icp_weight > 0:
                self.total_loss += self.icp_weight * (self.icp_transform_loss +
                                                      self.icp_residual_loss)
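
To make the egomotion indexing above concrete, the following plain-Python sketch (an illustration only; middle_frame_index is hard-coded to 1 for seq_length == 3) prints which egomotion vector and sign multiplier each adjacent source-target pair uses in both modes:

seq_length, middle_frame_index = 3, 1  # assumption: sequence of three frames
for legacy_mode in (False, True):
  print('legacy_mode = %s' % legacy_mode)
  for i in range(seq_length):
    for j in range(seq_length):
      if i == j or abs(i - j) != 1:
        continue
      if legacy_mode and j != middle_frame_index:
        continue
      egomotion_index = min(i, j)
      egomotion_mult = -1 if i > j else 1
      if legacy_mode and egomotion_index >= middle_frame_index:
        egomotion_mult *= -1
      print('  %d-%d: egomotion[:, %d, :] * %d'
            % (i, j, egomotion_index, egomotion_mult))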
Example #3
def egomotion_net(image_stack, is_training=True, legacy_mode=False):
    """Predict ego-motion vectors from a stack of frames.
    Args:
    image_stack: Input tensor with shape [B, h, w, seq_length * 3].  Regardless
        of the value of legacy_mode, the input image sequence passed to the
        function should be in normal order, e.g. [1, 2, 3].
    is_training: Whether the model is being trained or not.
    legacy_mode: Setting legacy_mode to True enables compatibility with
        SfMLearner checkpoints.  When legacy_mode is on, egomotion_net()
        rearranges the input tensor to place the target (middle) frame first in
        sequence.  This is the arrangement of inputs that legacy models have
        received during training.  In legacy mode, the client program
        (model.Model.build_loss()) interprets the outputs of this network
        differently as well.  For example:
        When legacy_mode == True,
        Network inputs will be [2, 1, 3]
        Network outputs will be [1 -> 2, 3 -> 2]
        When legacy_mode == False,
        Network inputs will be [1, 2, 3]
        Network outputs will be [1 -> 2, 2 -> 3]
    Returns:
    Egomotion vectors with shape [B, seq_length - 1, 6].
    """
    seq_length = image_stack.get_shape()[3].value // 3  # 3 == RGB.
    if legacy_mode:
        # Put the target frame at the beginning of stack.
        with tf.name_scope('rearrange_stack'):
            mid_index = util.get_seq_middle(seq_length)
            left_subset = image_stack[:, :, :, :mid_index * 3]
            target_frame = image_stack[:, :, :,
                                       mid_index * 3:(mid_index + 1) * 3]
            right_subset = image_stack[:, :, :, (mid_index + 1) * 3:]
            image_stack = tf.concat([target_frame, left_subset, right_subset],
                                    axis=3)

    batch_norm_params = {'is_training': is_training}
    num_egomotion_vecs = seq_length - 1
    with tf.variable_scope('pose_exp_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        normalizer_fn = slim.batch_norm
        normalizer_params = batch_norm_params
        with slim.arg_scope(
            [slim.conv2d, slim.conv2d_transpose],
                normalizer_fn=normalizer_fn,
                weights_regularizer=slim.l2_regularizer(WEIGHT_REG),
                normalizer_params=normalizer_params,
                activation_fn=tf.nn.relu,
                outputs_collections=end_points_collection):
            cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
            cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
            cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

            # Ego-motion specific layers
            with tf.variable_scope('pose'):
                cnv6 = slim.conv2d(cnv5, 256, [3, 3], stride=2, scope='cnv6')
                cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
                pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
                egomotion_pred = slim.conv2d(cnv7,
                                             pred_channels, [1, 1],
                                             scope='pred',
                                             stride=1,
                                             normalizer_fn=None,
                                             activation_fn=None)
                egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
                # Tinghui found that scaling by a small constant facilitates training.
                egomotion_final = 0.01 * tf.reshape(
                    egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])

        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)
        return egomotion_final, end_points
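
The pose head above averages the 1x1 'pred' convolution over its spatial dimensions and reshapes the result into one 6-vector per frame pair, scaled by 0.01. A minimal NumPy sketch of that shape arithmetic (the batch and spatial sizes below are made-up placeholders, not values from the network):

import numpy as np

batch_size, h, w = 4, 4, 13            # placeholder size of cnv7's output
seq_length, egomotion_vec_size = 3, 6
num_egomotion_vecs = seq_length - 1
# Simulated 'pred' activations: [B, h, w, (seq_length - 1) * 6].
egomotion_pred = np.random.randn(batch_size, h, w,
                                 num_egomotion_vecs * egomotion_vec_size)
egomotion_avg = egomotion_pred.mean(axis=(1, 2))  # spatial mean, as tf.reduce_mean
egomotion_final = 0.01 * egomotion_avg.reshape(
    -1, num_egomotion_vecs, egomotion_vec_size)
print(egomotion_final.shape)  # (4, 2, 6) == [B, seq_length - 1, 6]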
Example #4
def region_deformer_net(image_stack, disp_bottleneck_stack, joint_encoder,
                        seq_length, weight_reg, trans_params_size=32,
                        region_deformer_scaling=1.0):
    """Predict region deformer parameters from a stack of frames or embeddings.

    Args:
      image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
      disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
          seq_length * c_hidden] in order.
      joint_encoder: Determines if the same encoder is used for computing the
          bottleneck layer of both the egomotion and the depth prediction
          network. If enabled, disp_bottleneck_stack is used as input, and the
          encoding steps are skipped. If disabled, a separate encoder is defined
          on image_stack.
      seq_length: The sequence length used.
      weight_reg: The amount of weight regularization.
      trans_params_size: Number of region deformer parameters; 32 for the
          bicubic function.
      region_deformer_scaling: Scaling factor for the output.

    Returns:
      Transformation parameters with shape [B, seq_length - 1, 32].
    """
    # Rearrange image_stack to [1, 0, 2]; outputs will be [1 -> 0, 1 -> 2].
    with tf.name_scope('rearrange_stack'):
        mid_index = util.get_seq_middle(seq_length)
        left_subset = image_stack[:, :, :, :mid_index * 3]
        target_frame = image_stack[:, :, :, mid_index * 3:(mid_index + 1) * 3]
        right_subset = image_stack[:, :, :, (mid_index + 1) * 3:]
        image_stack = tf.concat([target_frame, left_subset, right_subset], axis=3)

    num_transforms = seq_length - 1
    with tf.variable_scope('region_deformer_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(weight_reg),
                            normalizer_params=None,
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points_collection):
            if not joint_encoder:
                # Define separate encoder. If sharing, we can skip the encoding step,
                # as the bottleneck layer will already be passed as input.
                cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
                cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
                cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
                cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
                cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

            with tf.variable_scope('region_deformer'):
                inputs = disp_bottleneck_stack if joint_encoder else cnv5
                cnv6 = slim.conv2d(inputs, 256, [3, 3], stride=2, scope='cnv6')
                cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
                pred_channels = trans_params_size * num_transforms
                trans_params_pred = slim.conv2d(cnv7, pred_channels, [1, 1], scope='pred',
                                                stride=1, normalizer_fn=None,
                                                activation_fn=None)
                trans_params_avg = tf.reduce_mean(trans_params_pred, [1, 2])
                trans_params_res = tf.reshape(
                    trans_params_avg, [-1, num_transforms, trans_params_size])

                trans_params_scaled = region_deformer_scaling * trans_params_res

        return trans_params_scaled
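
The 'rearrange_stack' block above moves the middle (target) frame's three channels to the front of the channel axis before encoding. A small NumPy sketch of the same slicing, with each frame encoded as a constant image purely for illustration:

import numpy as np

seq_length, mid_index = 3, 1  # assumption: util.get_seq_middle(3) == 1
frames = [np.full((1, 2, 2, 3), k, dtype=np.float32) for k in range(seq_length)]
image_stack = np.concatenate(frames, axis=3)      # [B, h, w, seq_length * 3]
left_subset = image_stack[:, :, :, :mid_index * 3]
target_frame = image_stack[:, :, :, mid_index * 3:(mid_index + 1) * 3]
right_subset = image_stack[:, :, :, (mid_index + 1) * 3:]
image_stack = np.concatenate([target_frame, left_subset, right_subset], axis=3)
print(image_stack[0, 0, 0, ::3])  # [1. 0. 2.] -> frame order is now [1, 0, 2]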
Example #5
  def build_loss(self):
    """Adds ops for computing loss."""
    with tf.name_scope('compute_loss'):
      self.reconstr_loss = 0
      self.smooth_loss = 0
      self.ssim_loss = 0
      self.icp_transform_loss = 0
      self.icp_residual_loss = 0

      # self.images is organized by ...[scale][B, h, w, seq_len * 3].
      self.images = [{} for _ in range(NUM_SCALES)]
      # Following nested lists are organized by ...[scale][source-target].
      self.warped_image = [{} for _ in range(NUM_SCALES)]
      self.warp_mask = [{} for _ in range(NUM_SCALES)]
      self.warp_error = [{} for _ in range(NUM_SCALES)]
      self.ssim_error = [{} for _ in range(NUM_SCALES)]
      self.icp_transform = [{} for _ in range(NUM_SCALES)]
      self.icp_residual = [{} for _ in range(NUM_SCALES)]

      self.middle_frame_index = util.get_seq_middle(self.seq_length)

      # Compute losses at each scale.
      for s in range(NUM_SCALES):
        # Scale image stack.
        height_s = int(self.img_height / (2**s))
        width_s = int(self.img_width / (2**s))
        self.images[s] = tf.image.resize_area(self.image_stack,
                                              [height_s, width_s])

        # Smoothness.
        if self.smooth_weight > 0:
          for i in range(self.seq_length):
            # In legacy mode, use the depth map from the middle frame only.
            if not self.legacy_mode or i == self.middle_frame_index:
              self.smooth_loss += 1.0 / (2**s) * self.depth_smoothness(
                  self.disp[i][s], self.images[s][:, :, :, 3 * i:3 * (i + 1)])

        for i in range(self.seq_length):
          for j in range(self.seq_length):
            # Only consider adjacent frames.
            if i == j or abs(i - j) != 1:
              continue
            # In legacy mode, only consider the middle frame as target.
            if self.legacy_mode and j != self.middle_frame_index:
              continue
            source = self.images[s][:, :, :, 3 * i:3 * (i + 1)]
            target = self.images[s][:, :, :, 3 * j:3 * (j + 1)]
            target_depth = self.depth[j][s]
            key = '%d-%d' % (i, j)

            # Extract ego-motion from i to j
            egomotion_index = min(i, j)
            egomotion_mult = 1
            if i > j:
              # Need to inverse egomotion when going back in sequence.
              egomotion_mult *= -1
            # For compatibility with SfMLearner, interpret all egomotion vectors
            # as pointing toward the middle frame.  Note that unlike SfMLearner,
            # each vector captures the motion to/from its next frame, and not
            # the center frame.  Although with seq_length == 3, there is no
            # difference.
            if self.legacy_mode:
              if egomotion_index >= self.middle_frame_index:
                egomotion_mult *= -1
            egomotion = egomotion_mult * self.egomotion[:, egomotion_index, :]

            # Inverse warp the source image to the target image frame for
            # photometric consistency loss.
            self.warped_image[s][key], self.warp_mask[s][key] = (
                project.inverse_warp(source,
                                     target_depth,
                                     egomotion,
                                     self.intrinsic_mat[:, s, :, :],
                                     self.intrinsic_mat_inv[:, s, :, :]))

            # Reconstruction loss.
            self.warp_error[s][key] = tf.abs(self.warped_image[s][key] - target)
            self.reconstr_loss += tf.reduce_mean(
                self.warp_error[s][key] * self.warp_mask[s][key])
            # SSIM.
            if self.ssim_weight > 0:
              self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
                                                  target)
              # TODO(rezama): This should be min_pool2d().
              ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1, 'VALID')
              self.ssim_loss += tf.reduce_mean(
                  self.ssim_error[s][key] * ssim_mask)
            # 3D loss.
            if self.icp_weight > 0:
              cloud_a = self.cloud[j][s]
              cloud_b = self.cloud[i][s]
              self.icp_transform[s][key], self.icp_residual[s][key] = icp(
                  cloud_a, egomotion, cloud_b)
              self.icp_transform_loss += 1.0 / (2**s) * tf.reduce_mean(
                  tf.abs(self.icp_transform[s][key]))
              self.icp_residual_loss += 1.0 / (2**s) * tf.reduce_mean(
                  tf.abs(self.icp_residual[s][key]))

      self.total_loss = self.reconstr_weight * self.reconstr_loss
      if self.smooth_weight > 0:
        self.total_loss += self.smooth_weight * self.smooth_loss
      if self.ssim_weight > 0:
        self.total_loss += self.ssim_weight * self.ssim_loss
      if self.icp_weight > 0:
        self.total_loss += self.icp_weight * (self.icp_transform_loss +
                                              self.icp_residual_loss)
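
Each iteration of the scale loop above halves the image resolution, and the smoothness and ICP terms are down-weighted by 1 / (2**s). A plain-Python sketch of the resulting pyramid, assuming NUM_SCALES == 4 and a 128x416 input (both values are assumptions of this sketch; 128x416 is the resolution mentioned in the shape comments of Example #1):

NUM_SCALES = 4                       # assumption
img_height, img_width = 128, 416     # assumption
for s in range(NUM_SCALES):
  height_s = int(img_height / (2**s))
  width_s = int(img_width / (2**s))
  print('scale %d: %dx%d, weight %.3f' % (s, height_s, width_s, 1.0 / (2**s)))
# scale 0: 128x416, weight 1.000
# scale 1: 64x208, weight 0.500
# scale 2: 32x104, weight 0.250
# scale 3: 16x52, weight 0.125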
Example #6
def egomotion_net(image_stack, is_training=True, legacy_mode=False):
  """Predict ego-motion vectors from a stack of frames.

  Args:
    image_stack: Input tensor with shape [B, h, w, seq_length * 3].  Regardless
        of the value of legacy_mode, the input image sequence passed to the
        function should be in normal order, e.g. [1, 2, 3].
    is_training: Whether the model is being trained or not.
    legacy_mode: Setting legacy_mode to True enables compatibility with
        SfMLearner checkpoints.  When legacy_mode is on, egomotion_net()
        rearranges the input tensor to place the target (middle) frame first in
        sequence.  This is the arrangement of inputs that legacy models have
        received during training.  In legacy mode, the client program
        (model.Model.build_loss()) interprets the outputs of this network
        differently as well.  For example:

        When legacy_mode == True,
        Network inputs will be [2, 1, 3]
        Network outputs will be [1 -> 2, 3 -> 2]

        When legacy_mode == False,
        Network inputs will be [1, 2, 3]
        Network outputs will be [1 -> 2, 2 -> 3]

  Returns:
    Egomotion vectors with shape [B, seq_length - 1, 6].
  """
  seq_length = image_stack.get_shape()[3].value // 3  # 3 == RGB.
  if legacy_mode:
    # Put the target frame at the beginning of stack.
    with tf.name_scope('rearrange_stack'):
      mid_index = util.get_seq_middle(seq_length)
      left_subset = image_stack[:, :, :, :mid_index * 3]
      target_frame = image_stack[:, :, :, mid_index * 3:(mid_index + 1) * 3]
      right_subset = image_stack[:, :, :, (mid_index + 1) * 3:]
      image_stack = tf.concat([target_frame, left_subset, right_subset], axis=3)
  batch_norm_params = {'is_training': is_training}
  num_egomotion_vecs = seq_length - 1
  with tf.variable_scope('pose_exp_net') as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    normalizer_fn = slim.batch_norm if FLAGS.use_bn else None
    normalizer_params = batch_norm_params if FLAGS.use_bn else None
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        normalizer_fn=normalizer_fn,
                        weights_regularizer=slim.l2_regularizer(WEIGHT_REG),
                        normalizer_params=normalizer_params,
                        activation_fn=tf.nn.relu,
                        outputs_collections=end_points_collection):
      cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
      cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
      cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
      cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
      cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

      # Ego-motion specific layers
      with tf.variable_scope('pose'):
        cnv6 = slim.conv2d(cnv5, 256, [3, 3], stride=2, scope='cnv6')
        cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
        pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
        egomotion_pred = slim.conv2d(cnv7,
                                     pred_channels,
                                     [1, 1],
                                     scope='pred',
                                     stride=1,
                                     normalizer_fn=None,
                                     activation_fn=None)
        egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
        # Tinghui found that scaling by a small constant facilitates training.
        egomotion_final = 0.01 * tf.reshape(
            egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])

      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      return egomotion_final, end_points
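
As a quick restatement of the docstring's legacy_mode example, this plain-Python sketch (hard-coded for seq_length == 3 with frames labelled 1, 2, 3; it mirrors the docstring rather than calling the network) lists the frame order fed to the model and the transforms described by its two output vectors:

def describe_ordering(legacy_mode):
  if legacy_mode:
    inputs = [2, 1, 3]              # target (middle) frame moved to the front
    outputs = ['1 -> 2', '3 -> 2']  # both motions point toward the middle frame
  else:
    inputs = [1, 2, 3]              # natural temporal order
    outputs = ['1 -> 2', '2 -> 3']  # each motion points to the next frame
  return inputs, outputs

print(describe_ordering(True))   # ([2, 1, 3], ['1 -> 2', '3 -> 2'])
print(describe_ordering(False))  # ([1, 2, 3], ['1 -> 2', '2 -> 3'])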