def create_and_fetch_intrinsics_per_video_index(video_index,
                                                height,
                                                width,
                                                max_video_index=1000,
                                                num_summaries=10):
  """Fetches the intrinsics matrices for a batch of video indices.

  Args:
    video_index: A batch of scalars (int32-s) representing video indices, must
      be in [0, max_video_index).
    height: Image height in pixels.
    width: Image width in pixels.
    max_video_index: Maximum video_index (video_index < max_video_index).
    num_summaries: Number of video_indices for which intrinsics will be
      displayed on TensorBoard.

  Returns:
    A batch of intrinsics matrices (shape: [B, 3, 3], where B is the length of
    `video_index`).
  """
  intrin_initializer = tf.tile([[1.0, 1.0, 0.5, 0.5]], [max_video_index, 1])
  intrin_factors = tf.compat.v1.get_variable(
      'all_intrin', initializer=intrin_initializer)
  batch_factors = tf.gather(intrin_factors, video_index)
  fx, fy, x0, y0 = _get_intrinsics_from_coefficients(
      batch_factors, height, width)
  zero = tf.zeros_like(fx)
  one = tf.ones_like(fx)
  int_mat = [[fx, zero, x0], [zero, fy, y0], [zero, zero, one]]
  int_mat = tf.transpose(int_mat, [2, 0, 1])

  if num_summaries > 0:
    fx, fy, x0, y0 = _get_intrinsics_from_coefficients(
        intrin_factors, height, width)
    for i in range(num_summaries):
      maybe_summary.scalar('intrinsics/0%d/fx' % i, fx[i])
      maybe_summary.scalar('intrinsics/0%d/fy' % i, fy[i])
      maybe_summary.scalar('intrinsics/0%d/x0' % i, x0[i])
      maybe_summary.scalar('intrinsics/0%d/y0' % i, y0[i])
    maybe_summary.histogram('intrinsics/fx', fx)
    maybe_summary.histogram('intrinsics/fy', fy)
    maybe_summary.histogram('intrinsics/x0', x0)
    maybe_summary.histogram('intrinsics/y0', y0)

  return int_mat
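
# Illustrative usage sketch (hypothetical helper, not called anywhere in this
# module): it assumes a tf.compat.v1 graph/session context, and the constants
# below (two frames from videos 3 and 7, 128x416 images, a 100-video dataset)
# are made up for the example.
def _example_create_and_fetch_intrinsics():
  """Builds per-video intrinsics for a toy batch of two frames."""
  video_index = tf.constant([3, 7], dtype=tf.int32)  # Frames from videos 3, 7.
  intrinsics = create_and_fetch_intrinsics_per_video_index(
      video_index, height=128, width=416, max_video_index=100)
  # `intrinsics` has shape [2, 3, 3]. The underlying 'all_intrin' variable is
  # trainable, so the intrinsics are learned jointly with the rest of the model
  # and can be fetched with session.run(intrinsics) after training.
  return intrinsics
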
def loss_fn(features, mode, params):
  """Computes the training loss for depth and egomotion training.

  This function is written with TPU-friendliness in mind.

  Args:
    features: A dictionary mapping strings to tuples of (tf.Tensor, tf.Tensor),
      representing pairs of frames. The loss will be calculated from these
      tensors. The expected endpoints are 'rgb', 'depth', 'intrinsics_mat' and
      'intrinsics_mat_inv'.
    mode: One of tf.estimator.ModeKeys: TRAIN, PREDICT or EVAL.
    params: A dictionary with hyperparameters that optionally override
      DEFAULT_PARAMS above.

  Returns:
    A dictionary mapping each loss name (see DEFAULT_PARAMS['loss_weights']'s
    keys) to a scalar tf.Tensor representing the respective loss.

  Raises:
    ValueError: If `features` contains endpoints that don't conform to their
      expected structure.
  """
  params = parameter_container.ParameterContainer.from_defaults_and_overrides(
      DEFAULT_PARAMS, params, is_strict=True, strictness_depth=2)

  if len(features['rgb']) != 2 or 'depth' in features and len(
      features['depth']) != 2:
    raise ValueError(
        'RGB and depth endpoints are expected to be a tuple of two'
        ' tensors. Rather, they are %s.' % str(features))

  # On TPU we strive to stack tensors together and perform ops once on the
  # entire stack, to save time and HBM memory. We thus stack the
  # batch-of-first-frames and the batch-of-second-frames, for both depth and
  # RGB. The batch dimension of rgb_stack and gt_depth_stack is thus twice the
  # original batch size.
  rgb_stack = tf.concat(features['rgb'], axis=0)

  depth_predictor = depth_prediction_nets.ResNet18DepthPredictor(
      mode, params.depth_predictor_params.as_dict())
  predicted_depth = depth_predictor.predict_depth(rgb_stack)
  maybe_summary.histogram('PredictedDepth', predicted_depth)
  endpoints = {}
  endpoints['predicted_depth'] = tf.split(predicted_depth, 2, axis=0)
  endpoints['rgb'] = features['rgb']

  # We use the heuristic that depths of less than 0.2 meters are not accurate.
  # This is a rough placeholder for a confidence map that we plan to add in
  # the future.
  if 'depth' in features:
    endpoints['groundtruth_depth'] = features['depth']

  if params.cascade:
    motion_features = [
        tf.concat([features['rgb'][0], endpoints['predicted_depth'][0]],
                  axis=-1),
        tf.concat([features['rgb'][1], endpoints['predicted_depth'][1]],
                  axis=-1)
    ]
  else:
    motion_features = features['rgb']

  motion_features_stack = tf.concat(motion_features, axis=0)
  flipped_motion_features_stack = tf.concat(motion_features[::-1], axis=0)
  # Unlike `rgb_stack`, here we stacked the frames in reverse order along the
  # batch dimension. By concatenating the two stacks below along the channel
  # axis, we create the following tensor:
  #
  #          Channel dimension (3)
  #   _                                 _
  #  |  Frame1-s batch | Frame2-s batch  |____Batch
  #  |_ Frame2-s batch | Frame1-s batch _|    dimension (0)
  #
  # When we send this tensor to the motion prediction network, the first and
  # second halves of the result represent the camera motion from Frame1 to
  # Frame2 and from Frame2 to Frame1 respectively. Further below we impose a
  # loss that drives these two to be the inverses of one another
  # (cycle-consistency).
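  # Illustrative shape walkthrough (added note; shapes assume the cascade
  # branch above, with batch size B and image size H x W):
  #   motion_features_stack:          [2B, H, W, 4]  (RGB + predicted depth)
  #   flipped_motion_features_stack:  [2B, H, W, 4]  (frame order reversed)
  #   pairs (concat along channels):  [2B, H, W, 8]
  # Row b of `pairs` holds (frame1_b, frame2_b) for b < B and
  # (frame2_{b-B}, frame1_{b-B}) for b >= B, which yields the forward and
  # backward motion estimates used for the cycle-consistency loss.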
  pairs = tf.concat([motion_features_stack, flipped_motion_features_stack],
                    axis=-1)

  rot, trans, residual_translation, intrinsics_mat = (
      object_motion_nets.motion_field_net(
          images=pairs,
          weight_reg=params.motion_prediction_params.weight_reg,
          align_corners=params.motion_prediction_params.align_corners,
          auto_mask=params.motion_prediction_params.auto_mask))

  if params.motion_field_burnin_steps > 0.0:
    step = tf.to_float(tf.train.get_or_create_global_step())
    burnin_steps = tf.to_float(params.motion_field_burnin_steps)
    residual_translation *= tf.clip_by_value(2 * step / burnin_steps - 1, 0.0,
                                             1.0)

  # If using ground truth egomotion:
  if not params.learn_egomotion:
    egomotion_mat = tf.concat(features['egomotion_mat'], axis=0)
    rot = transform_utils.angles_from_matrix(egomotion_mat[:, :3, :3])
    trans = egomotion_mat[:, :3, 3]
    trans = tf.expand_dims(trans, 1)
    trans = tf.expand_dims(trans, 1)

  if params.use_mask:
    mask = tf.to_float(tf.concat(features['mask'], axis=0) > 0)
    if params.foreground_dilation > 0:
      pool_size = params.foreground_dilation * 2 + 1
      mask = tf.nn.max_pool(mask, [1, pool_size, pool_size, 1], [1] * 4,
                            'SAME')
    residual_translation *= mask

  maybe_summary.histogram('ResidualTranslation', residual_translation)
  maybe_summary.histogram('BackgroundTranslation', trans)
  maybe_summary.histogram('Rotation', rot)
  endpoints['residual_translation'] = tf.split(residual_translation, 2, axis=0)
  endpoints['background_translation'] = tf.split(trans, 2, axis=0)
  endpoints['rotation'] = tf.split(rot, 2, axis=0)

  if not params.learn_intrinsics.enabled:
    endpoints['intrinsics_mat'] = features['intrinsics_mat']
    endpoints['intrinsics_mat_inv'] = features['intrinsics_mat_inv']
  elif params.learn_intrinsics.per_video:
    int_mat = intrinsics_utils.create_and_fetch_intrinsics_per_video_index(
        features['video_index'][0],
        params.image_preprocessing.image_height,
        params.image_preprocessing.image_width,
        max_video_index=params.learn_intrinsics.max_number_of_videos)
    endpoints['intrinsics_mat'] = tf.concat([int_mat] * 2, axis=0)
    endpoints[
        'intrinsics_mat_inv'] = intrinsics_utils.invert_intrinsics_matrix(
            int_mat)
  else:
    # The intrinsic matrix should be the same, no matter the order of the
    # images (mat = inv_mat). It's probably a good idea to enforce this by a
    # loss, but for now we just take their average as a prediction for the
    # intrinsic matrix.
    intrinsics_mat = 0.5 * sum(tf.split(intrinsics_mat, 2, axis=0))
    endpoints['intrinsics_mat'] = [intrinsics_mat] * 2
    endpoints['intrinsics_mat_inv'] = [
        intrinsics_utils.invert_intrinsics_matrix(intrinsics_mat)
    ] * 2

  aggregator = loss_aggregator.DepthMotionFieldLossAggregator(
      endpoints, params.loss_weights.as_dict(), params.loss_params.as_dict())

  # Add some more summaries.
  maybe_summary.image('rgb0', features['rgb'][0])
  maybe_summary.image('rgb1', features['rgb'][1])
  disp0, disp1 = tf.split(aggregator.output_endpoints['disparity'], 2, axis=0)
  maybe_summary.image('disparity0/grayscale', disp0)
  maybe_summary.image_with_colormap('disparity0/plasma',
                                    tf.squeeze(disp0, axis=3), 'plasma', 0.0)
  maybe_summary.image('disparity1/grayscale', disp1)
  maybe_summary.image_with_colormap('disparity1/plasma',
                                    tf.squeeze(disp1, axis=3), 'plasma', 0.0)

  if maybe_summary.summaries_enabled():
    if 'depth' in features:
      gt_disp0 = 1.0 / tf.maximum(features['depth'][0], 0.5)
      gt_disp1 = 1.0 / tf.maximum(features['depth'][1], 0.5)
      maybe_summary.image('disparity_gt0', gt_disp0)
      maybe_summary.image('disparity_gt1', gt_disp1)

    depth_proximity_weight0, depth_proximity_weight1 = tf.split(
        aggregator.output_endpoints['depth_proximity_weight'], 2, axis=0)
    maybe_summary.image('consistency_weight0',
                        tf.expand_dims(depth_proximity_weight0, -1))
    maybe_summary.image('consistency_weight1',
                        tf.expand_dims(depth_proximity_weight1, -1))
    maybe_summary.image('trans', aggregator.output_endpoints['trans'])
    maybe_summary.image('trans_inv', aggregator.output_endpoints['inv_trans'])
    maybe_summary.image('trans_res', endpoints['residual_translation'][0])
    maybe_summary.image('trans_res_inv', endpoints['residual_translation'][1])

  return aggregator.losses
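
# Hypothetical sketch (not part of the original training code): one way to
# reduce the loss dictionary returned by `loss_fn` to a single scalar for an
# optimizer. It assumes each entry produced by the aggregator is already
# scaled by its weight from params.loss_weights; the actual training wiring in
# this codebase may differ.
def _example_total_loss(features, mode, params):
  """Sums the individual loss terms from loss_fn into one training scalar."""
  losses = loss_fn(features, mode, params)
  # Add up all per-term losses into the scalar that an optimizer would minimize.
  total_loss = tf.add_n(list(losses.values()))
  maybe_summary.scalar('total_loss', total_loss)
  return total_loss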