def build_loss(self):
  """Adds ops for computing loss.

  For every scale in range(NUM_SCALES) this method:
    * stores the (possibly resized) image stack in self.images[s];
    * optionally accumulates a disparity smoothness term into
      self.smooth_loss;
    * for each selected (source i, target j) frame pair, builds the warped
      source image and records the absolute warp error (and, when
      self.ssim_weight > 0, the SSIM error) under the key 'i-j';
    * accumulates the reconstruction and SSIM terms, either per pair or,
      when self.compute_minimum_loss is set, as a per-pixel minimum over
      mirrored pairs around the middle frame.

  Finally the weighted terms are summed into self.total_loss.

  Side effects: assigns self.reconstr_loss, self.smooth_loss, self.ssim_loss,
  self.icp_transform_loss, self.icp_residual_loss, self.images,
  self.warped_image, self.warp_mask, self.warp_error, self.ssim_error,
  self.icp_transform, self.icp_residual, self.middle_frame_index,
  self.debug_all_warped_image_batches, self.total_loss and, on the
  handle_motion path, self.all_batches and self.base_warping.

  The ICP accumulators and containers are initialized here but are not
  otherwise updated by this method.
  """
  with tf.name_scope('compute_loss'):
    # Loss accumulators start as Python ints and turn into tensors as soon as
    # the first term is added.
    self.reconstr_loss = 0
    self.smooth_loss = 0
    self.ssim_loss = 0
    self.icp_transform_loss = 0
    self.icp_residual_loss = 0

    # self.images is organized by ...[scale][B, h, w, seq_len * 3].
    self.images = [None for _ in range(NUM_SCALES)]
    # Following nested lists are organized by ...[scale][source-target].
    self.warped_image = [{} for _ in range(NUM_SCALES)]
    self.warp_mask = [{} for _ in range(NUM_SCALES)]
    self.warp_error = [{} for _ in range(NUM_SCALES)]
    self.ssim_error = [{} for _ in range(NUM_SCALES)]
    self.icp_transform = [{} for _ in range(NUM_SCALES)]
    self.icp_residual = [{} for _ in range(NUM_SCALES)]

    self.middle_frame_index = util.get_seq_middle(self.seq_length)

    # Compute losses at each scale.
    for s in range(NUM_SCALES):
      # Scale image stack.
      if s == 0:  # Just as a precaution. TF often has interpolation bugs.
        self.images[s] = self.image_stack
      else:
        height_s = int(self.img_height / (2**s))
        width_s = int(self.img_width / (2**s))
        self.images[s] = tf.image.resize_bilinear(
            self.image_stack, [height_s, width_s], align_corners=True)

      # Smoothness.
      if self.smooth_weight > 0:
        for i in range(self.seq_length):
          # When computing minimum loss, use the depth map from the middle
          # frame only.
          if not self.compute_minimum_loss or i == self.middle_frame_index:
            disp_smoothing = self.disp[i][s]
            if self.depth_normalization:
              # Perform depth normalization, dividing by the mean.
              # NOTE(review): this call spells the argument `keep_dims` while
              # the tf.reduce_sum call further down uses `keepdims`; confirm
              # both spellings are accepted by the targeted TF version.
              mean_disp = tf.reduce_mean(disp_smoothing, axis=[1, 2, 3],
                                         keep_dims=True)
              disp_input = disp_smoothing / mean_disp
            else:
              disp_input = disp_smoothing
            # Coarser scales are down-weighted by 1 / 2**s unless
            # equal_weighting is requested.
            scaling_f = (1.0 if self.equal_weighting else 1.0 / (2**s))
            self.smooth_loss += scaling_f * self.depth_smoothness(
                disp_input, self.images[s][:, :, :, 3 * i:3 * (i + 1)])

      # Reset on every scale; nothing in this method appends to it.
      self.debug_all_warped_image_batches = []
      for i in range(self.seq_length):
        for j in range(self.seq_length):
          if i == j:
            continue

          # When computing minimum loss, only consider the middle frame as
          # target.
          if self.compute_minimum_loss and j != self.middle_frame_index:
            continue
          # We only consider adjacent frames, unless either
          # compute_minimum_loss is on (where the middle frame is matched with
          # all other frames) or exhaustive_mode is on (where all frames are
          # matched with each other).
          if (not self.compute_minimum_loss and not self.exhaustive_mode and
              abs(i - j) != 1):
            continue

          # With depth upsampling the photometric comparison is always done
          # against the full-resolution images (scale 0).
          selected_scale = 0 if self.depth_upsampling else s
          source = self.images[selected_scale][:, :, :, 3 * i:3 * (i + 1)]
          target = self.images[selected_scale][:, :, :, 3 * j:3 * (j + 1)]

          if self.depth_upsampling:
            target_depth = self.depth_upsampled[j][s]
          else:
            target_depth = self.depth[j][s]

          # Dictionary key identifying the source -> target pair.
          key = '%d-%d' % (i, j)

          if self.handle_motion:
            # self.seg_stack of shape (B, H, W, 9).
            # target_depth corresponds to middle frame, of shape (B, H, W, 1).

            # Now incorporate the other warping results, performed according
            # to the object motion network's predictions.
            # self.object_masks batch_size elements of (N, H, W, 9).
            # self.object_masks_warped batch_size elements of (N, H, W, 9).
            # self.object_transforms batch_size elements of (N, 2, 6).
            # NOTE(review): this branch never assigns self.warp_mask[s][key],
            # yet the reconstruction / SSIM code below reads it whenever
            # compute_minimum_loss is off. Confirm handle_motion is only used
            # together with compute_minimum_loss.
            self.all_batches = []
            for batch_s in range(self.batch_size):
              # To warp i into j, first take the base warping (this is the
              # full image i warped into j using only the egomotion estimate).
              base_warping = self.warped_seq[s][i][batch_s]
              # One transform matrix per entry of
              # self.object_transforms[0][batch_s].
              # NOTE(review): the leading index is hard-coded to 0 rather than
              # s; confirm what that axis represents.
              transform_matrices_thisbatch = tf.map_fn(
                  lambda transform: project.get_transform_mat(
                      tf.expand_dims(transform, axis=0), i, j)[0],
                  self.object_transforms[0][batch_s])

              # Closes over base_warping, target_depth, batch_s and
              # selected_scale from the enclosing loops; presumably traced by
              # tf.map_fn right away, so it sees the current loop values --
              # TODO confirm.
              def inverse_warp_wrapper(matrix):
                """Wrapper for inverse warping method."""
                warp_image, _ = (
                    project.inverse_warp(
                        tf.expand_dims(base_warping, axis=0),
                        tf.expand_dims(target_depth[batch_s], axis=0),
                        tf.expand_dims(matrix, axis=0),
                        tf.expand_dims(self.intrinsic_mat[
                            batch_s, selected_scale, :, :], axis=0),
                        tf.expand_dims(self.intrinsic_mat_inv[
                            batch_s, selected_scale, :, :], axis=0)))
                return warp_image
              warped_images_thisbatch = tf.map_fn(
                  inverse_warp_wrapper, transform_matrices_thisbatch,
                  dtype=tf.float32)
              # Drop the singleton batch axis added by expand_dims above.
              warped_images_thisbatch = warped_images_thisbatch[:, 0, :, :, :]
              # warped_images_thisbatch is now of shape (N, H, W, 9).

              # Combine warped frames into a single one, using the object
              # masks. Result should be (1, 128, 416, 3).
              # Essentially, we here want to sum them all up, filtered by the
              # respective object masks.
              # Pixels whose segmentation value is 0 in both the source and
              # the target frame keep the egomotion-only warping.
              mask_base_valid_source = tf.equal(
                  self.seg_stack[batch_s, :, :, i*3:(i+1)*3],
                  tf.constant(0, dtype=tf.uint8))
              mask_base_valid_target = tf.equal(
                  self.seg_stack[batch_s, :, :, j*3:(j+1)*3],
                  tf.constant(0, dtype=tf.uint8))
              mask_valid = tf.logical_and(
                  mask_base_valid_source, mask_base_valid_target)
              self.base_warping = base_warping * tf.to_float(mask_valid)
              background = tf.expand_dims(self.base_warping, axis=0)

              # Builds a per-object filter: sign(obj_id) broadcast over the
              # image, multiplied by a mask of where seg_stack channels 3:6
              # equal obj_id.
              # NOTE(review): channels 3:6 are hard-coded (presumably the
              # middle frame of a 3-frame sequence) and the fill dims use the
              # full-resolution img_height / img_width regardless of scale s
              # -- confirm both are intended.
              def construct_const_filter_tensor(obj_id):
                return tf.fill(
                    dims=[self.img_height, self.img_width, 3],
                    value=tf.sign(obj_id)) * tf.to_float(
                        tf.equal(self.seg_stack[batch_s, :, :, 3:6],
                                 tf.cast(obj_id, dtype=tf.uint8)))
              filter_tensor = tf.map_fn(
                  construct_const_filter_tensor,
                  tf.to_float(self.object_ids[s][batch_s]))
              filter_tensor = tf.stack(filter_tensor, axis=0)
              # Sum the object-specific warps, each restricted to its mask.
              objects_to_add = tf.reduce_sum(
                  tf.multiply(warped_images_thisbatch, filter_tensor),
                  axis=0, keepdims=True)
              combined = background + objects_to_add
              self.all_batches.append(combined)
            # Now of shape (B, 128, 416, 3).
            self.warped_image[s][key] = tf.concat(self.all_batches, axis=0)

          else:
            # Don't handle motion, classic model formulation.
            egomotion_mat_i_j = project.get_transform_mat(
                self.egomotion, i, j)
            # Inverse warp the source image to the target image frame for
            # photometric consistency loss.
            self.warped_image[s][key], self.warp_mask[s][key] = (
                project.inverse_warp(
                    source,
                    target_depth,
                    egomotion_mat_i_j,
                    self.intrinsic_mat[:, selected_scale, :, :],
                    self.intrinsic_mat_inv[:, selected_scale, :, :]))

          # Reconstruction loss.
          self.warp_error[s][key] = tf.abs(self.warped_image[s][key] - target)
          # With compute_minimum_loss the accumulation is deferred until all
          # pairs of this scale have been built (see below).
          if not self.compute_minimum_loss:
            self.reconstr_loss += tf.reduce_mean(
                self.warp_error[s][key] * self.warp_mask[s][key])
          # SSIM.
          if self.ssim_weight > 0:
            self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
                                                target)
            # TODO(rezama): This should be min_pool2d().
            if not self.compute_minimum_loss:
              ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1,
                                          'VALID')
              self.ssim_loss += tf.reduce_mean(
                  self.ssim_error[s][key] * ssim_mask)

      # If the minimum loss should be computed, the loss calculation has been
      # postponed until here.
      if self.compute_minimum_loss:
        # Pair each frame before the middle with its mirror image after the
        # middle (frame_index <-> seq_length - frame_index - 1) and take the
        # element-wise minimum of their errors. No warp mask is applied on
        # this path.
        for frame_index in range(self.middle_frame_index):
          key1 = '%d-%d' % (frame_index, self.middle_frame_index)
          key2 = '%d-%d' % (self.seq_length - frame_index - 1,
                            self.middle_frame_index)
          logging.info('computing min error between %s and %s', key1, key2)
          min_error = tf.minimum(self.warp_error[s][key1],
                                 self.warp_error[s][key2])
          self.reconstr_loss += tf.reduce_mean(min_error)
          if self.ssim_weight > 0:  # Also compute the minimum SSIM loss.
            min_error_ssim = tf.minimum(self.ssim_error[s][key1],
                                        self.ssim_error[s][key2])
            self.ssim_loss += tf.reduce_mean(min_error_ssim)

    # Build the total loss as composed of L1 reconstruction, SSIM, smoothing
    # and object size constraint loss as appropriate.
    # Each component is scaled in place, so after this point the individual
    # loss attributes hold the weighted values.
    self.reconstr_loss *= self.reconstr_weight
    self.total_loss = self.reconstr_loss
    if self.smooth_weight > 0:
      self.smooth_loss *= self.smooth_weight
      self.total_loss += self.smooth_loss
    if self.ssim_weight > 0:
      self.ssim_loss *= self.ssim_weight
      self.total_loss += self.ssim_loss
    if self.size_constraint_weight > 0:
      # NOTE(review): self.inf_loss is not assigned in this method; it must
      # already exist when size_constraint_weight > 0.
      self.inf_loss *= self.size_constraint_weight
      self.total_loss += self.inf_loss
def build_loss(self):
  """Builds the multi-scale training loss and stores the result on `self`.

  For each of the NUM_SCALES scales the image stack is area-resized, and the
  following terms are accumulated:
    * disparity smoothness (only if self.smooth_weight > 0),
    * masked L1 photometric reconstruction error between every adjacent
      (source, target) frame pair,
    * masked SSIM error (only if self.ssim_weight > 0),
    * ICP transform / residual magnitudes (only if self.icp_weight > 0).

  In legacy mode only the middle frame is used as warp target and as the
  frame whose disparity is smoothed.

  Per-pair intermediates are kept in per-scale dicts keyed by
  '<source>-<target>'. The weighted sum is written to self.total_loss.
  """
  with tf.name_scope('compute_loss'):
    # Accumulators start as plain ints and become tensors once a term is
    # added to them.
    self.reconstr_loss = self.smooth_loss = self.ssim_loss = 0
    self.icp_transform_loss = self.icp_residual_loss = 0

    # self.images ends up as ...[scale][B, h, w, seq_len * 3]; the remaining
    # containers are indexed as ...[scale][source-target].
    for container in ('images', 'warped_image', 'warp_mask', 'warp_error',
                      'ssim_error', 'icp_transform', 'icp_residual'):
      setattr(self, container, [{} for _ in range(NUM_SCALES)])

    middle = util.get_seq_middle(self.seq_length)
    self.middle_frame_index = middle
    frames = range(self.seq_length)

    for scale in range(NUM_SCALES):
      shrink = 2**scale
      scale_weight = 1.0 / shrink
      size_at_scale = [int(self.img_height / shrink),
                       int(self.img_width / shrink)]
      stack = tf.image.resize_area(self.image_stack, size_at_scale)
      self.images[scale] = stack

      def rgb_of(frame, stack=stack):
        # Channel slice holding the RGB image of one frame.
        return stack[:, :, :, 3 * frame:3 * (frame + 1)]

      # Smoothness: in legacy mode only the middle frame contributes.
      if self.smooth_weight > 0:
        for frame in frames:
          if self.legacy_mode and frame != middle:
            continue
          self.smooth_loss += scale_weight * self.depth_smoothness(
              self.disp[frame][scale], rgb_of(frame))

      for src in frames:
        for tgt in frames:
          # Adjacent frames only (this also rules out src == tgt).
          if abs(src - tgt) != 1:
            continue
          # Legacy mode warps exclusively into the middle frame.
          if self.legacy_mode and tgt != middle:
            continue

          source = rgb_of(src)
          target = rgb_of(tgt)
          target_depth = self.depth[tgt][scale]
          pair = '%d-%d' % (src, tgt)

          # Pick the ego-motion vector linking the two frames. It is negated
          # when going backwards in the sequence. For compatibility with
          # SfMLearner, legacy mode treats every vector as pointing toward
          # the middle frame, which flips the sign once more for vectors at
          # or after the middle. Unlike SfMLearner, each vector captures the
          # motion to/from its next frame, not the center frame (no
          # difference when seq_length == 3).
          ego_index = min(src, tgt)
          sign = -1 if src > tgt else 1
          if self.legacy_mode and ego_index >= middle:
            sign = -sign
          egomotion = sign * self.egomotion[:, ego_index, :]

          # Inverse-warp the source into the target frame for the
          # photometric consistency loss.
          warped, mask = project.inverse_warp(
              source, target_depth, egomotion,
              self.intrinsic_mat[:, scale, :, :],
              self.intrinsic_mat_inv[:, scale, :, :])
          self.warped_image[scale][pair] = warped
          self.warp_mask[scale][pair] = mask

          # Masked L1 reconstruction error.
          l1_error = tf.abs(warped - target)
          self.warp_error[scale][pair] = l1_error
          self.reconstr_loss += tf.reduce_mean(l1_error * mask)

          # SSIM.
          if self.ssim_weight > 0:
            ssim_error = self.ssim(warped, target)
            self.ssim_error[scale][pair] = ssim_error
            # TODO(rezama): This should be min_pool2d().
            pooled_mask = slim.avg_pool2d(mask, 3, 1, 'VALID')
            self.ssim_loss += tf.reduce_mean(ssim_error * pooled_mask)

          # 3D (ICP) loss.
          if self.icp_weight > 0:
            icp_transform, icp_residual = icp(
                self.cloud[tgt][scale], egomotion, self.cloud[src][scale])
            self.icp_transform[scale][pair] = icp_transform
            self.icp_residual[scale][pair] = icp_residual
            self.icp_transform_loss += scale_weight * tf.reduce_mean(
                tf.abs(icp_transform))
            self.icp_residual_loss += scale_weight * tf.reduce_mean(
                tf.abs(icp_residual))

    # Weighted sum of all enabled terms.
    total = self.reconstr_weight * self.reconstr_loss
    if self.smooth_weight > 0:
      total += self.smooth_weight * self.smooth_loss
    if self.ssim_weight > 0:
      total += self.ssim_weight * self.ssim_loss
    if self.icp_weight > 0:
      total += self.icp_weight * (self.icp_transform_loss +
                                  self.icp_residual_loss)
    self.total_loss = total
def egomotion_net(image_stack, is_training=True, legacy_mode=False):
  """Predict ego-motion vectors from a stack of frames.

  Args:
    image_stack: Input tensor with shape [B, h, w, seq_length * 3].  Regardless
        of the value of legacy_mode, the input image sequence passed to the
        function should be in normal order, e.g. [1, 2, 3].
    is_training: Whether the model is being trained or not.  Forwarded to
        batch normalization.
    legacy_mode: Setting legacy_mode to True enables compatibility with
        SfMLearner checkpoints.  When legacy_mode is on, egomotion_net()
        rearranges the input tensor to place the target (middle) frame first in
        sequence.  This is the arrangement of inputs that legacy models have
        received during training.  In legacy mode, the client program
        (model.Model.build_loss()) interprets the outputs of this network
        differently as well.  For example:

        When legacy_mode == True,
        Network inputs will be [2, 1, 3]
        Network outputs will be [1 -> 2, 3 -> 2]

        When legacy_mode == False,
        Network inputs will be [1, 2, 3]
        Network outputs will be [1 -> 2, 2 -> 3]

  Returns:
    A tuple (egomotion, end_points) where egomotion holds the ego-motion
    vectors with shape [B, seq_length - 1, EGOMOTION_VEC_SIZE] and end_points
    is the dict built from the layers' outputs collection.
  """
  seq_length = image_stack.get_shape()[3].value // 3  # 3 == RGB.
  if legacy_mode:
    # Put the target frame at the beginning of stack.
    with tf.name_scope('rearrange_stack'):
      mid_index = util.get_seq_middle(seq_length)
      left_subset = image_stack[:, :, :, :mid_index * 3]
      target_frame = image_stack[:, :, :, mid_index * 3:(mid_index + 1) * 3]
      right_subset = image_stack[:, :, :, (mid_index + 1) * 3:]
      image_stack = tf.concat([target_frame, left_subset, right_subset], axis=3)
  batch_norm_params = {'is_training': is_training}
  num_egomotion_vecs = seq_length - 1
  with tf.variable_scope('pose_exp_net') as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Batch normalization is unconditionally enabled here; the former
    # `... if True else None` expressions could never yield None.
    with slim.arg_scope(
        [slim.conv2d, slim.conv2d_transpose],
        normalizer_fn=slim.batch_norm,
        weights_regularizer=slim.l2_regularizer(WEIGHT_REG),
        normalizer_params=batch_norm_params,
        activation_fn=tf.nn.relu,
        outputs_collections=end_points_collection):
      # Shared encoder: five stride-2 convolutions.
      cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
      cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
      cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
      cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
      cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

      # Ego-motion specific layers
      with tf.variable_scope('pose'):
        cnv6 = slim.conv2d(cnv5, 256, [3, 3], stride=2, scope='cnv6')
        cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
        pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
        # Linear 1x1 prediction head: no normalization, no activation.
        egomotion_pred = slim.conv2d(cnv7,
                                     pred_channels,
                                     [1, 1],
                                     scope='pred',
                                     stride=1,
                                     normalizer_fn=None,
                                     activation_fn=None)
        # Average the per-location predictions over the spatial axes.
        egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
        # Tinghui found that scaling by a small constant facilitates training.
        egomotion_final = 0.01 * tf.reshape(
            egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])

      end_points = slim.utils.convert_collection_to_dict(
          end_points_collection)
      return egomotion_final, end_points
def region_deformer_net(image_stack, disp_bottleneck_stack, joint_encoder,
                        seq_length, weight_reg, trans_params_size=32,
                        region_deformer_scaling=1.0):
  """Predict region deformer parameters from frames or bottleneck features.

  Args:
    image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
    disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
        seq_length * c_hidden] in order.
    joint_encoder: If true, disp_bottleneck_stack is fed straight into the
        deformer-specific layers and no separate encoder is built.  If false,
        a dedicated five-layer encoder is defined on image_stack.
    seq_length: The sequence length used.
    weight_reg: The amount of L2 weight regularization.
    trans_params_size: Number of parameters per region deformer transform
        (32 for the bicubic function).
    region_deformer_scaling: Scaling factor applied to the output.

  Returns:
    Transformation parameters with shape
    [B, seq_length - 1, trans_params_size].
  """
  # Rearrange the image_stack as [1, 0, 2], output will be [1 -> 0, 1 -> 2]
  with tf.name_scope('rearrange_stack'):
    middle = util.get_seq_middle(seq_length)
    mid_start = middle * 3
    mid_stop = (middle + 1) * 3
    frames_before = image_stack[:, :, :, :mid_start]
    middle_frame = image_stack[:, :, :, mid_start:mid_stop]
    frames_after = image_stack[:, :, :, mid_stop:]
    image_stack = tf.concat([middle_frame, frames_before, frames_after],
                            axis=3)

  num_transforms = seq_length - 1
  # (output channels, kernel size, scope) of the optional separate encoder.
  encoder_layers = (
      (16, 7, 'cnv1'),
      (32, 5, 'cnv2'),
      (64, 3, 'cnv3'),
      (128, 3, 'cnv4'),
      (256, 3, 'cnv5'),
  )
  with tf.variable_scope('region_deformer_net') as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        normalizer_fn=None,
                        weights_regularizer=slim.l2_regularizer(weight_reg),
                        normalizer_params=None,
                        activation_fn=tf.nn.relu,
                        outputs_collections=end_points_collection):
      if joint_encoder:
        # Sharing the encoder: the bottleneck is already provided as input.
        bottleneck = disp_bottleneck_stack
      else:
        # Define a separate encoder on the rearranged image stack.
        bottleneck = image_stack
        for channels, kernel, layer_scope in encoder_layers:
          bottleneck = slim.conv2d(bottleneck, channels, [kernel, kernel],
                                   stride=2, scope=layer_scope)

      with tf.variable_scope('region_deformer'):
        head = slim.conv2d(bottleneck, 256, [3, 3], stride=2, scope='cnv6')
        head = slim.conv2d(head, 256, [3, 3], stride=2, scope='cnv7')
        # Linear 1x1 head producing all transforms' parameters at once.
        raw_params = slim.conv2d(head,
                                 trans_params_size * num_transforms,
                                 [1, 1],
                                 scope='pred',
                                 stride=1,
                                 normalizer_fn=None,
                                 activation_fn=None)
        # Average over the spatial axes, then split per transform.
        pooled_params = tf.reduce_mean(raw_params, [1, 2])
        params = tf.reshape(pooled_params,
                            [-1, num_transforms, trans_params_size])
        return region_deformer_scaling * params
def build_loss(self):
  """Builds the multi-scale training loss and stores the result on `self`.

  For each of the NUM_SCALES scales the image stack is area-resized, and the
  following terms are accumulated:
    * disparity smoothness (only if self.smooth_weight > 0),
    * masked L1 photometric reconstruction error between every adjacent
      (source, target) frame pair,
    * masked SSIM error (only if self.ssim_weight > 0),
    * ICP transform / residual magnitudes (only if self.icp_weight > 0).

  In legacy mode only the middle frame is used as warp target and as the
  frame whose disparity is smoothed.

  Per-pair intermediates are kept in per-scale dicts keyed by
  '<source>-<target>'. The weighted sum is written to self.total_loss.
  """
  with tf.name_scope('compute_loss'):
    # Accumulators start as plain ints and become tensors once a term is
    # added to them.
    self.reconstr_loss = self.smooth_loss = self.ssim_loss = 0
    self.icp_transform_loss = self.icp_residual_loss = 0

    # self.images ends up as ...[scale][B, h, w, seq_len * 3]; the remaining
    # containers are indexed as ...[scale][source-target].
    for container in ('images', 'warped_image', 'warp_mask', 'warp_error',
                      'ssim_error', 'icp_transform', 'icp_residual'):
      setattr(self, container, [{} for _ in range(NUM_SCALES)])

    middle = util.get_seq_middle(self.seq_length)
    self.middle_frame_index = middle
    frames = range(self.seq_length)

    for scale in range(NUM_SCALES):
      shrink = 2**scale
      scale_weight = 1.0 / shrink
      size_at_scale = [int(self.img_height / shrink),
                       int(self.img_width / shrink)]
      stack = tf.image.resize_area(self.image_stack, size_at_scale)
      self.images[scale] = stack

      def rgb_of(frame, stack=stack):
        # Channel slice holding the RGB image of one frame.
        return stack[:, :, :, 3 * frame:3 * (frame + 1)]

      # Smoothness: in legacy mode only the middle frame contributes.
      if self.smooth_weight > 0:
        for frame in frames:
          if self.legacy_mode and frame != middle:
            continue
          self.smooth_loss += scale_weight * self.depth_smoothness(
              self.disp[frame][scale], rgb_of(frame))

      for src in frames:
        for tgt in frames:
          # Adjacent frames only (this also rules out src == tgt).
          if abs(src - tgt) != 1:
            continue
          # Legacy mode warps exclusively into the middle frame.
          if self.legacy_mode and tgt != middle:
            continue

          source = rgb_of(src)
          target = rgb_of(tgt)
          target_depth = self.depth[tgt][scale]
          pair = '%d-%d' % (src, tgt)

          # Pick the ego-motion vector linking the two frames. It is negated
          # when going backwards in the sequence. For compatibility with
          # SfMLearner, legacy mode treats every vector as pointing toward
          # the middle frame, which flips the sign once more for vectors at
          # or after the middle. Unlike SfMLearner, each vector captures the
          # motion to/from its next frame, not the center frame (no
          # difference when seq_length == 3).
          ego_index = min(src, tgt)
          sign = -1 if src > tgt else 1
          if self.legacy_mode and ego_index >= middle:
            sign = -sign
          egomotion = sign * self.egomotion[:, ego_index, :]

          # Inverse-warp the source into the target frame for the
          # photometric consistency loss.
          warped, mask = project.inverse_warp(
              source, target_depth, egomotion,
              self.intrinsic_mat[:, scale, :, :],
              self.intrinsic_mat_inv[:, scale, :, :])
          self.warped_image[scale][pair] = warped
          self.warp_mask[scale][pair] = mask

          # Masked L1 reconstruction error.
          l1_error = tf.abs(warped - target)
          self.warp_error[scale][pair] = l1_error
          self.reconstr_loss += tf.reduce_mean(l1_error * mask)

          # SSIM.
          if self.ssim_weight > 0:
            ssim_error = self.ssim(warped, target)
            self.ssim_error[scale][pair] = ssim_error
            # TODO(rezama): This should be min_pool2d().
            pooled_mask = slim.avg_pool2d(mask, 3, 1, 'VALID')
            self.ssim_loss += tf.reduce_mean(ssim_error * pooled_mask)

          # 3D (ICP) loss.
          if self.icp_weight > 0:
            icp_transform, icp_residual = icp(
                self.cloud[tgt][scale], egomotion, self.cloud[src][scale])
            self.icp_transform[scale][pair] = icp_transform
            self.icp_residual[scale][pair] = icp_residual
            self.icp_transform_loss += scale_weight * tf.reduce_mean(
                tf.abs(icp_transform))
            self.icp_residual_loss += scale_weight * tf.reduce_mean(
                tf.abs(icp_residual))

    # Weighted sum of all enabled terms.
    total = self.reconstr_weight * self.reconstr_loss
    if self.smooth_weight > 0:
      total += self.smooth_weight * self.smooth_loss
    if self.ssim_weight > 0:
      total += self.ssim_weight * self.ssim_loss
    if self.icp_weight > 0:
      total += self.icp_weight * (self.icp_transform_loss +
                                  self.icp_residual_loss)
    self.total_loss = total
def egomotion_net(image_stack, is_training=True, legacy_mode=False):
  """Estimate ego-motion between consecutive frames of an image stack.

  Args:
    image_stack: Input tensor with shape [B, h, w, seq_length * 3].  The
        frames must always be supplied in normal order, e.g. [1, 2, 3],
        whatever the value of legacy_mode.
    is_training: Whether the model is being trained or not.  Forwarded to
        batch normalization when FLAGS.use_bn is set.
    legacy_mode: Enables compatibility with SfMLearner checkpoints.  When on,
        the target (middle) frame is moved to the front of the stack before
        it is fed to the network, which is the arrangement legacy models
        were trained with.  The client program (model.Model.build_loss())
        also interprets the outputs differently in this mode.  For example:

        When legacy_mode == True,
        Network inputs will be [2, 1, 3]
        Network outputs will be [1 -> 2, 3 -> 2]

        When legacy_mode == False,
        Network inputs will be [1, 2, 3]
        Network outputs will be [1 -> 2, 2 -> 3]

  Returns:
    A tuple (egomotion, end_points): egomotion holds the ego-motion vectors
    with shape [B, seq_length - 1, EGOMOTION_VEC_SIZE]; end_points is the
    dict built from the layers' outputs collection.
  """
  num_frames = image_stack.get_shape()[3].value // 3  # 3 == RGB.
  if legacy_mode:
    # Move the target (middle) frame to the front of the stack.
    with tf.name_scope('rearrange_stack'):
      middle = util.get_seq_middle(num_frames)
      mid_start = middle * 3
      mid_stop = (middle + 1) * 3
      frames_before = image_stack[:, :, :, :mid_start]
      middle_frame = image_stack[:, :, :, mid_start:mid_stop]
      frames_after = image_stack[:, :, :, mid_stop:]
      image_stack = tf.concat([middle_frame, frames_before, frames_after],
                              axis=3)

  bn_params = {'is_training': is_training}
  num_vectors = num_frames - 1
  # (output channels, kernel size, scope) of the shared encoder layers.
  encoder_layers = (
      (16, 7, 'cnv1'),
      (32, 5, 'cnv2'),
      (64, 3, 'cnv3'),
      (128, 3, 'cnv4'),
      (256, 3, 'cnv5'),
  )
  with tf.variable_scope('pose_exp_net') as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Batch normalization is applied only when requested through the flag.
    if FLAGS.use_bn:
      normalizer_fn = slim.batch_norm
    else:
      normalizer_fn = None
    if FLAGS.use_bn:
      normalizer_params = bn_params
    else:
      normalizer_params = None
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        normalizer_fn=normalizer_fn,
                        weights_regularizer=slim.l2_regularizer(WEIGHT_REG),
                        normalizer_params=normalizer_params,
                        activation_fn=tf.nn.relu,
                        outputs_collections=end_points_collection):
      net = image_stack
      for channels, kernel, layer_scope in encoder_layers:
        net = slim.conv2d(net, channels, [kernel, kernel], stride=2,
                          scope=layer_scope)

      # Ego-motion specific layers
      with tf.variable_scope('pose'):
        net = slim.conv2d(net, 256, [3, 3], stride=2, scope='cnv6')
        net = slim.conv2d(net, 256, [3, 3], stride=2, scope='cnv7')
        # Linear 1x1 head predicting all vectors at once.
        raw_pred = slim.conv2d(net,
                               EGOMOTION_VEC_SIZE * num_vectors,
                               [1, 1],
                               scope='pred',
                               stride=1,
                               normalizer_fn=None,
                               activation_fn=None)
        # Average the per-location predictions over the spatial axes.
        pooled_pred = tf.reduce_mean(raw_pred, [1, 2])
        # Tinghui found that scaling by a small constant facilitates training.
        egomotion = 0.01 * tf.reshape(
            pooled_pred, [-1, num_vectors, EGOMOTION_VEC_SIZE])

      return egomotion, slim.utils.convert_collection_to_dict(
          end_points_collection)