Exemplo n.º 1
0
    def _build_egomotion_test_graph(self):
        """Creates the inference graph estimating egomotion between two frames.

        The two input frames are fed through the placeholders `self._image1`
        and `self._image2`. The resulting rotation and translation estimates
        are stored in `self.rot` and `self.trans`.
        """
        frame_shape = [self.batch_size, self.img_height, self.img_width, 3]
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            self._image1 = tf.placeholder(
                tf.float32, frame_shape, name='image1')
            self._image2 = tf.placeholder(
                tf.float32, frame_shape, name='image2')

            # Checkpoints only load properly when the motion network is built
            # under the "compute_loss" name scope.
            with tf.name_scope('compute_loss'):
                # Forward direction: image1 followed by image2.
                forward_pair = tf.concat(
                    [self._image1, self._image2], axis=-1)
                fwd_angles, fwd_trans, _, _ = (
                    motion_prediction_net.motion_field_net(
                        images=forward_pair))
                # Backward direction: same frames, swapped.
                backward_pair = tf.concat(
                    [self._image2, self._image1], axis=-1)
                bwd_angles, bwd_trans, _, _ = (
                    motion_prediction_net.motion_field_net(
                        images=backward_pair))

            fwd_rot = transform_utils.matrix_from_angles(fwd_angles)
            bwd_rot = transform_utils.matrix_from_angles(bwd_angles)
            fwd_trans = tf.squeeze(fwd_trans, axis=(1, 2))
            bwd_trans = tf.squeeze(bwd_trans, axis=(1, 2))

            # Ideally the forward and backward rotations are inverses of each
            # other, but in practice they differ slightly, so the forward
            # rotation is averaged with the inverse of the backward one to get
            # a better estimator. Likewise the forward translation and the
            # rotated backward translation should be negatives of each other,
            # so they are combined the same way.
            # TODO(gariel): Check if there's an estimator with less variance.
            self.rot = 0.5 * (tf.linalg.inv(bwd_rot) + fwd_rot)
            rotated_bwd_trans = tf.squeeze(
                tf.matmul(self.rot, tf.expand_dims(bwd_trans, -1)), axis=-1)
            self.trans = 0.5 * (-rotated_bwd_trans + fwd_trans)
Exemplo n.º 2
0
    def _build_loss(self):
        """Builds the loss tensor, to be minimized by the optimizer.

        Reads a batch through `reader.DataReader`, turns the segmentation
        stack into per-frame foreground masks, predicts a depth map for each
        of the SEQ_LENGTH frames and a motion field for every pair of
        consecutive frames (in both directions), and accumulates the weighted
        loss terms into `self.total_loss`. The individual terms are also
        handed to `self.export`.

        Raises:
          RuntimeError: If no intrinsic matrix was read and
            `self.learn_intrinsics` is off.
        """
        self.reader = reader.DataReader(
            self.data_dir,
            self.batch_size,
            self.img_height,
            self.img_width,
            SEQ_LENGTH,
            1,  # num_scales
            self.file_extension,
            self.random_scale_crop,
            reader.FLIP_RANDOM,
            self.random_color,
            self.imagenet_norm,
            self.shuffle,
            self.input_file,
            queue_size=self.queue_size)

        (self.image_stack, self.image_stack_norm, self.seg_stack,
         self.intrinsic_mat, _) = self.reader.read_data()
        if self.learn_intrinsics:
            # The intrinsics are predicted by the motion network instead, so
            # whatever the reader returned is discarded.
            self.intrinsic_mat = None
        if self.intrinsic_mat is None and not self.learn_intrinsics:
            raise RuntimeError(
                'Could not read intrinsic matrix. Turn '
                'learn_intrinsics on to learn it instead of loading '
                'it.')
        self.export('self.image_stack', self.image_stack)

        # Convert the segmentation stack into one boolean foreground mask per
        # frame: the union of all sufficiently large, non-background objects.
        object_masks = []
        for i in range(self.batch_size):
            object_ids = tf.unique(tf.reshape(self.seg_stack[i], [-1]))[0]
            object_masks_i = []
            for j in range(SEQ_LENGTH):
                current_seg = self.seg_stack[i, :, :, j * 3]  # (H, W)

                # The closure over `current_seg` is safe because the function
                # is handed to tf.map_fn within the same loop iteration.
                def process_obj_mask(obj_id):
                    """Create a mask for obj_id, skipping the background mask."""
                    # NOTE(review): comparing against a uint8 zero presumes
                    # the segmentation ids are uint8 - confirm with reader.
                    mask = tf.logical_and(
                        tf.equal(current_seg, obj_id),  # pylint: disable=cell-var-from-loop
                        tf.not_equal(tf.cast(0, tf.uint8), obj_id))
                    # Leave out very small masks, that are most often errors.
                    size = tf.reduce_sum(tf.to_int32(mask))
                    mask = tf.logical_and(mask,
                                          tf.greater(size, MIN_OBJECT_AREA))
                    if not self.boxify:
                        return mask
                    # Complete the mask to its bounding box.
                    binary_obj_masks_y = tf.reduce_any(mask,
                                                       axis=1,
                                                       keepdims=True)
                    binary_obj_masks_x = tf.reduce_any(mask,
                                                       axis=0,
                                                       keepdims=True)
                    return tf.logical_and(binary_obj_masks_y,
                                          binary_obj_masks_x)

                object_mask = tf.map_fn(  # (N, H, W)
                    process_obj_mask, object_ids, dtype=tf.bool)
                object_mask = tf.reduce_any(object_mask, axis=0)
                object_masks_i.append(object_mask)
            object_masks.append(tf.stack(object_masks_i, axis=-1))

        # `tf.float` is not a TensorFlow dtype; the masks are cast to float32.
        self.seg_stack = tf.cast(tf.stack(object_masks, axis=0), tf.float32)
        tf.summary.image('Masks', self.seg_stack)

        with tf.variable_scope(DEPTH_SCOPE):
            # Both dictionaries are keyed by the frame index in the sequence.
            self.disp = {}
            self.depth = {}

            # Parabolic rampup of the noise over LAYER_NORM_NOISE_RAMPUP_STEPS
            # steps. We stop at 0.5 because this is the value above which the
            # multiplicative noise we use can become negative. Further
            # experimentation is needed to find if non-negativity is indeed
            # needed.
            noise_stddev = 0.5 * tf.square(
                tf.minimum(
                    tf.cast(self.global_step, tf.float32) /
                    float(LAYER_NORM_NOISE_RAMPUP_STEPS), 1.0))

            def _normalizer_fn(x, is_train, name='bn'):
                """Normalizer for the depth net, using the ramped-up noise."""
                return randomized_layer_normalization.normalize(
                    x, is_train=is_train, name=name, stddev=noise_stddev)

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                for i in range(SEQ_LENGTH):
                    # Frames are stacked along the channel axis, 3 channels
                    # per frame.
                    image = self.image_stack_norm[:, :, :, 3 * i:3 * (i + 1)]
                    self.depth[
                        i] = depth_prediction_net.depth_prediction_resnet18unet(
                            image, True, self.weight_reg, _normalizer_fn)
                    self.disp[i] = 1.0 / self.depth[i]

        with tf.name_scope('compute_loss'):
            self.reconstr_loss = 0
            self.smooth_loss = 0
            self.ssim_loss = 0
            self.depth_consistency_loss = 0

            # Smoothness.
            if self.smooth_weight > 0:
                for i in range(SEQ_LENGTH):
                    disp_smoothing = self.disp[i]
                    # Perform depth normalization, dividing by the mean.
                    mean_disp = tf.reduce_mean(disp_smoothing,
                                               axis=[1, 2, 3],
                                               keepdims=True)
                    disp_input = disp_smoothing / mean_disp
                    self.smooth_loss += _depth_smoothness(
                        disp_input, self.image_stack[:, :, :,
                                                     3 * i:3 * (i + 1)])

            self.rot_loss = 0.0
            self.trans_loss = 0.0

            def add_result_to_loss_and_summaries(endpoints, i, j):
                """Adds one direction's loss endpoints to the running totals."""
                tf.summary.image(
                    'valid_mask%d%d' % (i, j),
                    tf.expand_dims(endpoints['depth_proximity_weight'], -1))

                self.depth_consistency_loss += endpoints['depth_error']
                self.reconstr_loss += endpoints['rgb_error']
                self.ssim_loss += 0.5 * endpoints['ssim_error']
                self.rot_loss += endpoints['rotation_error']
                self.trans_loss += endpoints['translation_error']

            self.motion_smoothing = 0.0
            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                for i in range(SEQ_LENGTH - 1):
                    j = i + 1
                    depth_i = self.depth[i][:, :, :, 0]
                    depth_j = self.depth[j][:, :, :, 0]
                    image_j = self.image_stack[:, :, :, 3 * j:3 * (j + 1)]
                    image_i = self.image_stack[:, :, :, i * 3:(i + 1) * 3]
                    # We select a pair of consecutive images (and their
                    # respective predicted depth maps). Now we have the
                    # network predict a motion field that connects the two. We
                    # feed the pair of images into the network, once in
                    # forward order and then in reverse order. The results are
                    # fed into the loss calculation. The following losses are
                    # calculated:
                    # - RGB and SSIM photometric consistency.
                    # - Cycle consistency of rotations and translations for
                    #   every pixel.
                    # - L1 smoothness of the disparity and the motion field.
                    # - Depth consistency
                    rot, trans, trans_res, mat = motion_prediction_net.motion_field_net(
                        images=tf.concat([image_i, image_j], axis=-1),
                        weight_reg=self.weight_reg)
                    inv_rot, inv_trans, inv_trans_res, inv_mat = (
                        motion_prediction_net.motion_field_net(
                            images=tf.concat([image_j, image_i], axis=-1),
                            weight_reg=self.weight_reg))

                    if self.learn_intrinsics:
                        # Average the intrinsics predicted in both directions.
                        intrinsic_mat = 0.5 * (mat + inv_mat)
                    else:
                        intrinsic_mat = self.intrinsic_mat[:, 0, :, :]

                    def dilate(x):
                        # Dilation by n pixels is roughly max pooling by
                        # 2 * n + 1.
                        p = self.foreground_dilation * 2 + 1
                        return tf.nn.max_pool(x, [1, p, p, 1], [1] * 4, 'SAME')

                    # The residual translation is only applied inside the
                    # (dilated) foreground mask.
                    trans += trans_res * dilate(self.seg_stack[:, :, :,
                                                               j:j + 1])
                    inv_trans += inv_trans_res * dilate(
                        self.seg_stack[:, :, :, i:i + 1])

                    tf.summary.image('trans%d%d' % (i, i + 1), trans)
                    tf.summary.image('trans%d%d' % (i + 1, i), inv_trans)

                    tf.summary.image('trans_res%d%d' % (i + 1, i),
                                     inv_trans_res)
                    tf.summary.image('trans_res%d%d' % (i, i + 1), trans_res)

                    self.motion_smoothing += _smoothness(trans)
                    self.motion_smoothing += _smoothness(inv_trans)
                    tf.summary.scalar(
                        'trans_stdev',
                        tf.sqrt(0.5 * tf.reduce_mean(
                            tf.square(trans) + tf.square(inv_trans))))

                    transformed_depth_j = transform_depth_map.using_motion_vector(
                        depth_j, trans, rot, intrinsic_mat)

                    add_result_to_loss_and_summaries(
                        consistency_losses.rgbd_and_motion_consistency_loss(
                            transformed_depth_j, image_j, depth_i, image_i,
                            rot, trans, inv_rot, inv_trans), i, j)

                    transformed_depth_i = transform_depth_map.using_motion_vector(
                        depth_i, inv_trans, inv_rot, intrinsic_mat)

                    add_result_to_loss_and_summaries(
                        consistency_losses.rgbd_and_motion_consistency_loss(
                            transformed_depth_i, image_i, depth_j, image_j,
                            inv_rot, inv_trans, rot, trans), j, i)

            # Build the total loss as the weighted sum of the L1
            # reconstruction, smoothing, SSIM, motion smoothing, depth
            # consistency, and rotation/translation consistency terms.
            self.reconstr_loss *= self.reconstr_weight
            self.export('self.reconstr_loss', self.reconstr_loss)
            self.total_loss = self.reconstr_loss
            if self.smooth_weight > 0:
                self.smooth_loss *= self.smooth_weight
                self.total_loss += self.smooth_loss
                self.export('self.smooth_loss', self.smooth_loss)
            if self.ssim_weight > 0:
                self.ssim_loss *= self.ssim_weight
                self.total_loss += self.ssim_loss
                self.export('self.ssim_loss', self.ssim_loss)

            if self.motion_smoothing_weight > 0:
                self.motion_smoothing *= self.motion_smoothing_weight
                self.total_loss += self.motion_smoothing
                self.export('self.motion_sm_loss', self.motion_smoothing)

            if self.depth_consistency_loss_weight:
                self.depth_consistency_loss *= self.depth_consistency_loss_weight
                self.total_loss += self.depth_consistency_loss
                self.export('self.depth_consistency_loss',
                            self.depth_consistency_loss)

            # The cycle-consistency terms are always added, whatever their
            # weights are.
            self.rot_loss *= self.rotation_consistency_weight
            self.trans_loss *= self.translation_consistency_weight
            self.export('rot_loss', self.rot_loss)
            self.export('trans_loss', self.trans_loss)

            self.total_loss += self.rot_loss
            self.total_loss += self.trans_loss

            self.export('self.total_loss', self.total_loss)