def construct_model_pwc_full(image1, image2, feature1, feature2):
    """Build the coarse-to-fine flow estimator under the 'flow_net' scope.

    A cost volume on the level-6 features is decoded into an initial flow.
    That flow is then refined at levels 5, 4, 3 and 2: the coarser flow is
    bilinearly resized and doubled, the second feature map is passed through
    ``transformer_old`` with it, a new cost volume is computed, and the
    decoder output is added to the resized flow. The level-2 result is
    additionally passed through ``context_net``.

    Args:
        image1: first image tensor. Only its static shape is read
            (dimensions 1 and 2 are used as H and W).
        image2: second image tensor. Not used in the body; kept so the call
            signature stays unchanged.
        feature1: sequence of exactly six feature maps for the first image,
            ordered level 1 .. level 6 (level 1 is not used).
        feature2: sequence of exactly six feature maps for the second image,
            same ordering.

    Returns:
        A 4-tuple of flow tensors resized to [H, W], [H // 2, W // 2],
        [H // 4, W // 4] and [H // 8, W // 8]. They are derived from the
        level 2, 3, 4 and 5 flows respectively, each multiplied by 4.0
        before resizing.
    """
    with tf.variable_scope('flow_net'):
        _, H, W, _ = map(int, image1.get_shape()[0:4])

        # Unpacking six names keeps the original length check on the inputs.
        _, feat1_2, feat1_3, feat1_4, feat1_5, feat1_6 = feature1
        _, feat2_2, feat2_3, feat2_4, feat2_5, feat2_6 = feature2

        def _refine(coarse_flow, feat1, feat2, level):
            """Refine `coarse_flow` at pyramid `level`; returns (flow, decoder features)."""
            # Floor division keeps the sizes integral on Python 3 too; the
            # original `/` only produced ints under Python 2 semantics.
            size = [H // (2**level), W // (2**level)]
            upsampled = tf.image.resize_bilinear(coarse_flow, size) * 2.0
            feat2_warped = transformer_old(feat2, upsampled, size)
            cost = cost_volumn(feat1, feat2_warped, d=4)
            residual, decoder_feat = optical_flow_decoder_dc(
                tf.concat([cost, feat1, upsampled], axis=3), level=level)
            return residual + upsampled, decoder_feat

        # Coarsest level: no prior flow, so the decoder sees only the cost
        # volume.
        cv6 = cost_volumn(feat1_6, feat2_6, d=4)
        flow6, _ = optical_flow_decoder_dc(cv6, level=6)

        flow5, _ = _refine(flow6, feat1_5, feat2_5, 5)
        flow4, _ = _refine(flow5, feat1_4, feat2_4, 4)
        flow3, _ = _refine(flow4, feat1_3, feat2_3, 3)
        flow2_raw, f2 = _refine(flow3, feat1_2, feat2_2, 2)

        # The context network output is added to the raw level-2 flow.
        flow2 = context_net(tf.concat([flow2_raw, f2], axis=3)) + flow2_raw

        flow0_enlarge = tf.image.resize_bilinear(flow2 * 4.0, [H, W])
        flow1_enlarge = tf.image.resize_bilinear(flow3 * 4.0,
                                                 [H // 2, W // 2])
        flow2_enlarge = tf.image.resize_bilinear(flow4 * 4.0,
                                                 [H // 4, W // 4])
        flow3_enlarge = tf.image.resize_bilinear(flow5 * 4.0,
                                                 [H // 8, W // 8])

        return flow0_enlarge, flow1_enlarge, flow2_enlarge, flow3_enlarge
def generate_transformed(self, img, flow, scale):
    """Apply ``transformer_old`` to `img` with `flow` at pyramid level `scale`.

    The output size is the configured height and width (``self.params``)
    floor-divided by ``2**scale``.
    """
    shrink = 2**scale
    target_size = [
        self.params.height // shrink,
        self.params.width // shrink,
    ]
    return transformer_old(img, flow, out_size=target_size)
def __init__(self, scope=None):
    """Build the single-sample inference graph, reusing variables in `scope`.

    Creates uint8 placeholders for four input frames (named raw_input_1,
    raw_input_1r, raw_input_2, raw_input_2r) plus a 3x3 float32 intrinsics
    placeholder, runs the disparity, pose and flow networks with
    ``reuse=True``, and exposes the placeholders and predictions as
    attributes on ``self``.

    Args:
        scope: variable scope whose existing variables are reused.
    """
    with tf.variable_scope(scope, reuse=True):
        n_channels = 1 if opt.grey_scale else 3
        frame_shape = [1, opt.img_height, opt.img_width, n_channels]

        # Placeholders are created in the order 1, 1r, 2, 2r.
        raw_1 = tf.placeholder(tf.uint8, frame_shape, name='raw_input_1')
        raw_1r = tf.placeholder(tf.uint8, frame_shape, name='raw_input_1r')
        raw_2 = tf.placeholder(tf.uint8, frame_shape, name='raw_input_2')
        raw_2r = tf.placeholder(tf.uint8, frame_shape, name='raw_input_2r')
        intrinsic_in = tf.placeholder(tf.float32, [3, 3])

        cam2pix, pix2cam = get_multi_scale_intrinsics(intrinsic_in,
                                                      opt.num_scales)
        # Add a leading axis of size 1 to both intrinsics tensors.
        cam2pix = tf.expand_dims(cam2pix, axis=0)
        pix2cam = tf.expand_dims(pix2cam, axis=0)

        img_1 = preprocess_image(raw_1)
        img_2 = preprocess_image(raw_2)
        img_1r = preprocess_image(raw_1r)
        img_2r = preprocess_image(raw_2r)

        disp_feat_1 = feature_pyramid_disp(img_1, reuse=True)
        disp_feat_1r = feature_pyramid_disp(img_1r, reuse=True)
        disp_feat_2 = feature_pyramid_disp(img_2, reuse=True)
        disp_feat_2r = feature_pyramid_disp(img_2r, reuse=True)

        flow_feat_1 = feature_pyramid_flow(img_1, reuse=True)
        flow_feat_2 = feature_pyramid_flow(img_2, reuse=True)

        disp_fwd = disp_godard(
            img_1, img_1r, disp_feat_1, disp_feat_1r, opt, is_training=False)
        disp_bwd = disp_godard(
            img_2, img_2r, disp_feat_2, disp_feat_2r, opt, is_training=False)

        poses = pose_exp_net(img_1, img_2)

        flows_fwd = construct_model_pwc_full(img_1, img_2, flow_feat_1,
                                             flow_feat_2)
        flows_bwd = construct_model_pwc_full(img_2, img_1, flow_feat_2,
                                             flow_feat_1)

        # Mask at scale 0: forward-transform a tensor of ones with the
        # reverse flow and clip the result to [0, 1].
        s = 0
        height_s = opt.img_height // (2**s)
        width_s = opt.img_width // (2**s)
        ones_s = tf.ones(shape=[1, height_s, width_s, 1], dtype='float32')
        occu_mask = tf.clip_by_value(
            transformerFwd(ones_s, flows_bwd[s], [height_s, width_s]),
            clip_value_min=0.0,
            clip_value_max=1.0)

        inv_disp_fwd = 1.0 / disp_fwd[0][:, :, :, 0:1]
        inv_disp_bwd = 1.0 / disp_bwd[0][:, :, :, 0:1]
        rigid_flow, refined_pose, disp1_trans, small_mask = inverse_warp_new(
            inv_disp_fwd, inv_disp_bwd, poses, cam2pix[:, 0, :, :],
            pix2cam[:, 0, :, :], flows_fwd[0], occu_mask)

        # Per-pixel Euclidean distance between the two flow fields.
        sq_diff = tf.square(rigid_flow - flows_fwd[0])
        flow_diff = tf.sqrt(tf.reduce_sum(sq_diff, axis=3, keep_dims=True))
        flow_diff_mask = tf.cast(flow_diff < (opt.flow_diff_threshold),
                                 tf.float32)
        occu_region = tf.cast(occu_mask < 0.5, tf.float32)
        ref_exp_mask = tf.clip_by_value(
            flow_diff_mask + occu_region,
            clip_value_min=0.0,
            clip_value_max=1.0)

        self.input_1 = raw_1
        self.input_2 = raw_2
        self.input_r = raw_1r
        self.input_2r = raw_2r
        self.input_intrinsic = intrinsic_in

        self.pred_pose_mat = refined_pose[0, :, :]
        self.pred_flow_rigid = rigid_flow
        self.pred_flow_optical = flows_fwd[0]
        self.pred_disp = disp_fwd[0][:, :, :, 0:1]

        # The first term is multiplied by 0.0, so the value comes from the
        # second term only.
        zero_term = disp1_trans * 0.0
        disp_bwd_moved = transformer_old(disp_bwd[0][:, :, :, 0:1],
                                         flows_fwd[0],
                                         [opt.img_height, opt.img_width])
        self.pred_disp2 = zero_term + disp_bwd_moved * (1.0 - 0.0)
        self.pred_mask = 1.0 - ref_exp_mask
def __init__(self,
             image1=None,
             image2=None,
             image1r=None,
             image2r=None,
             cam2pix=None,
             pix2cam=None,
             reuse_scope=False,
             scope=None):
    """Build the joint depth + flow training graph and its loss.

    Sets ``self.loss`` (weighted sum of the depth photometric loss, the
    stereo smoothness loss, the flow photometric loss, the flow smoothness
    loss and the flow consistency loss) and ``self.summ_op`` (merge of the
    scalar summaries collected in this constructor).

    Args:
        image1: frame treated as the target image in the loss; its static
            shape provides batch size, H and W.
        image2: frame treated as the source image in the loss.
        image1r: second input to ``disp_godard`` paired with `image1`
            (presumably the right stereo view -- confirm against caller).
        image2r: second input to ``disp_godard`` paired with `image2`.
        cam2pix: intrinsics tensor, indexed as ``[:, s, :, :]`` per scale.
        pix2cam: inverse-intrinsics tensor, indexed the same way.
        reuse_scope: `reuse` flag for the first variable scope.
        scope: variable scope name for all networks.
    """
    summaries = []

    batch_size, H, W, _ = map(int, image1.get_shape()[0:4])

    with tf.variable_scope(scope, reuse=reuse_scope):
        feature1_flow = feature_pyramid_flow(image1, reuse=False)
        feature2_flow = feature_pyramid_flow(image2, reuse=True)

        feature1_disp = feature_pyramid_disp(image1, reuse=False)
        feature1r_disp = feature_pyramid_disp(image1r, reuse=True)

        pred_disp, stereo_smooth_loss = disp_godard(
            image1,
            image1r,
            feature1_disp,
            feature1r_disp,
            opt,
            is_training=True)

        pred_depth = [1. / d for d in pred_disp]
        pred_poses = pose_exp_net(image1, image2)

        optical_flows_rev = construct_model_pwc_full(
            image2, image1, feature2_flow, feature1_flow)

    # Second pass over the same scope, always with reuse=True.
    with tf.variable_scope(scope, reuse=True):
        feature2_disp = feature_pyramid_disp(image2, reuse=True)
        feature2r_disp = feature_pyramid_disp(image2r, reuse=True)
        pred_disp_rev = disp_godard(
            image2,
            image2r,
            feature2_disp,
            feature2r_disp,
            opt,
            is_training=False)

        optical_flows = construct_model_pwc_full(image1, image2,
                                                 feature1_flow, feature2_flow)

    # One mask per flow scale: forward-transform a tensor of ones with the
    # reverse flow and clip to [0, 1]. Floor division keeps the shape and
    # size entries integral on Python 3 (plain `/` would produce floats).
    occu_masks = [
        tf.clip_by_value(
            transformerFwd(
                tf.ones(
                    shape=[batch_size, H // (2**s), W // (2**s), 1],
                    dtype='float32'),
                flowr, [H // (2**s), W // (2**s)]),
            clip_value_min=0.0,
            clip_value_max=1.0)
        for s, flowr in enumerate(optical_flows_rev)
    ]

    # Only the pose matrix returned by inverse_warp_new is used below.
    _, pose_mat, _, _ = inverse_warp_new(
        1.0 / pred_disp[0][:, :, :, 0:1],
        1.0 / pred_disp_rev[0][:, :, :, 0:1], pred_poses,
        cam2pix[:, 0, :, :], pix2cam[:, 0, :, :], optical_flows[0],
        occu_masks[0])

    pixel_loss_depth = 0
    pixel_loss_optical = 0
    exp_loss = 0
    flow_smooth_loss = 0
    flow_consist_loss = 0
    tgt_image_all = []
    src_image_all = []
    proj_image_depth_all = []
    proj_error_depth_all = []
    flyout_map_all = []

    for s in range(opt.num_scales):
        occu_mask = occu_masks[s]
        scale_size = [H // (2**s), W // (2**s)]

        # Scale the source and target images for computing loss at the
        # according scale.
        curr_tgt_image = tf.image.resize_area(
            image1,
            [int(opt.img_height / (2**s)), int(opt.img_width / (2**s))])
        curr_src_image = tf.image.resize_area(
            image2,
            [int(opt.img_height / (2**s)), int(opt.img_width / (2**s))])

        # Rigid flow from depth with the pose matrix held constant
        # (stop_gradient on the pose).
        depth_flow, pose_mat = inverse_warp(
            pred_depth[s][:, :, :, 0:1],
            tf.stop_gradient(pose_mat),
            cam2pix[:, s, :, :],  # [batchsize, scale, 3, 3]
            pix2cam[:, s, :, :])

        # Rigid flow from the raw pose prediction with the depth held
        # constant (stop_gradient on the depth).
        depth_flow_orig, _ = inverse_warp(
            tf.stop_gradient(pred_depth[s][:, :, :, 0:1]),
            pred_poses,
            cam2pix[:, s, :, :],  # [batchsize, scale, 3, 3]
            pix2cam[:, s, :, :])

        flow_diff = tf.sqrt(
            tf.reduce_sum(
                tf.square(depth_flow - optical_flows[s]),
                axis=3,
                keep_dims=True))
        flow_diff_mask = tf.cast(
            flow_diff < (opt.flow_diff_threshold / 2**s), tf.float32)
        occu_region = tf.cast(occu_mask < 0.5, tf.float32)
        ref_exp_mask = tf.clip_by_value(
            flow_diff_mask + occu_region,
            clip_value_min=0.0,
            clip_value_max=1.0)

        occu_mask_avg = tf.reduce_mean(occu_mask)

        curr_proj_image_depth = transformer_old(curr_src_image, depth_flow,
                                                scale_size)
        curr_proj_error_depth = tf.abs(curr_proj_image_depth -
                                       curr_tgt_image) * ref_exp_mask
        pixel_loss_depth += (1.0 - opt.ssim_weight) * tf.reduce_mean(
            curr_proj_error_depth * occu_mask) / occu_mask_avg

        curr_proj_image_depth_orig = transformer_old(
            curr_src_image, depth_flow_orig, scale_size)
        curr_proj_error_depth_orig = tf.abs(
            curr_proj_image_depth_orig - curr_tgt_image) * ref_exp_mask
        pixel_loss_depth += (1.0 - opt.ssim_weight) * tf.reduce_mean(
            curr_proj_error_depth_orig * occu_mask) / occu_mask_avg

        curr_proj_image_optical = transformer_old(
            curr_src_image, optical_flows[s], scale_size)
        curr_proj_error_optical = tf.abs(curr_proj_image_optical -
                                         curr_tgt_image)
        pixel_loss_optical += (1.0 - opt.ssim_weight) * tf.reduce_mean(
            curr_proj_error_optical * occu_mask) / occu_mask_avg

        curr_flyout_map = occu_mask

        if opt.ssim_weight > 0:
            pixel_loss_depth += opt.ssim_weight * tf.reduce_mean(
                SSIM(curr_proj_image_depth * occu_mask * ref_exp_mask,
                     curr_tgt_image * occu_mask *
                     ref_exp_mask)) / occu_mask_avg
            pixel_loss_depth += opt.ssim_weight * tf.reduce_mean(
                SSIM(curr_proj_image_depth_orig * occu_mask * ref_exp_mask,
                     curr_tgt_image * occu_mask *
                     ref_exp_mask)) / occu_mask_avg
            pixel_loss_optical += opt.ssim_weight * tf.reduce_mean(
                SSIM(curr_proj_image_optical * occu_mask, curr_tgt_image *
                     occu_mask)) / occu_mask_avg

        # Flow smoothness is weighted by the complement of ref_exp_mask.
        flow_smooth_loss += opt.flow_smooth_weight * cal_grad2_error_mask(
            optical_flows[s] / 20.0, curr_tgt_image, 1.0,
            1.0 - ref_exp_mask)

        # The rigid flow is a constant target for the consistency term.
        depth_flow_stop = tf.stop_gradient(depth_flow)
        flow_consist_loss += opt.flow_consist_weight * charbonnier_loss(
            depth_flow_stop - optical_flows[s], ref_exp_mask)

        tgt_image_all.append(curr_tgt_image)
        src_image_all.append(curr_src_image)
        proj_image_depth_all.append(curr_proj_image_depth)
        proj_error_depth_all.append(curr_proj_error_depth)
        flyout_map_all.append(curr_flyout_map)

    self.loss = (
        10.0 * pixel_loss_depth + stereo_smooth_loss
    ) + pixel_loss_optical + flow_smooth_loss + flow_consist_loss

    summaries.append(tf.summary.scalar("total_loss", self.loss))
    summaries.append(
        tf.summary.scalar("pixel_loss_depth", pixel_loss_depth))
    summaries.append(
        tf.summary.scalar("pixel_loss_optical", pixel_loss_optical))
    summaries.append(tf.summary.scalar("exp_loss", exp_loss))
    summaries.append(
        tf.summary.scalar("stereo_smooth_loss", stereo_smooth_loss))

    # NOTE(review): the image/histogram summaries below are created but not
    # appended to `summaries`, so they are not part of self.summ_op.
    tf.summary.image("pred_disp", pred_disp[0][:, :, :, 0:1])

    s = 0
    tf.summary.histogram("pose_0-2", pred_poses[:, 0:3])
    tf.summary.histogram("pose_3-5", pred_poses[:, 3:6])
    tf.summary.image('scale%d_depth_image' % s,
                     pred_depth[s][:, :, :, 0:1])
    tf.summary.image('scale%d_right_disparity_image' % s,
                     pred_disp[s][:, :, :, 1:2])
    tf.summary.image('scale%d_target_image' % s,
                     deprocess_image(tgt_image_all[s]))
    tf.summary.image('scale%d_src_image' % s,
                     deprocess_image(src_image_all[s]))
    tf.summary.image('scale_projected_image',
                     deprocess_image(proj_image_depth_all[s]))
    tf.summary.image('scale_proj_error_error', proj_error_depth_all[s])
    tf.summary.image('scale_flyout_mask', flyout_map_all[s])

    self.summ_op = tf.summary.merge(summaries)
def __init__(self,
             image1=None,
             image2=None,
             image1r=None,
             image2r=None,
             cam2pix=None,
             pix2cam=None,
             reuse_scope=False,
             scope=None):
    """Build the depth training graph and its loss.

    Sets ``self.loss`` to ``10.0 * pixel_loss_depth + stereo_smooth_loss``
    and ``self.summ_op`` to the merge of the scalar summaries collected in
    this constructor.

    Args:
        image1: frame treated as the target image in the loss; its static
            shape provides batch size, H and W.
        image2: frame treated as the source image in the loss.
        image1r: second input to ``disp_godard`` paired with `image1`
            (presumably the right stereo view -- confirm against caller).
        image2r: not used by this constructor.
        cam2pix: intrinsics tensor, indexed as ``[:, s, :, :]`` per scale.
        pix2cam: inverse-intrinsics tensor, indexed the same way.
        reuse_scope: `reuse` flag for the variable scope.
        scope: variable scope name for all networks.
    """
    summaries = []

    batch_size, H, W, _ = map(int, image1.get_shape()[0:4])

    with tf.variable_scope(scope, reuse=reuse_scope):
        feature1_flow = feature_pyramid_flow(image1, reuse=False)
        feature2_flow = feature_pyramid_flow(image2, reuse=True)

        feature1_disp = feature_pyramid_disp(image1, reuse=False)
        feature1r_disp = feature_pyramid_disp(image1r, reuse=True)

        pred_disp, stereo_smooth_loss = disp_godard(
            image1,
            image1r,
            feature1_disp,
            feature1r_disp,
            opt,
            is_training=True)

        pred_depth = [1. / d for d in pred_disp]
        pred_poses = pose_exp_net(image1, image2)

        # The reverse flow is only used to derive the masks below.
        optical_flows_rev = construct_model_pwc_full(
            image2, image1, feature2_flow, feature1_flow)

    # One mask per flow scale: forward-transform a tensor of ones with the
    # reverse flow and clip to [0, 1]. Floor division keeps the shape and
    # size entries integral on Python 3 (plain `/` would produce floats).
    occu_masks = [
        tf.clip_by_value(
            transformerFwd(
                tf.ones(
                    shape=[batch_size, H // (2**s), W // (2**s), 1],
                    dtype='float32'),
                flowr, [H // (2**s), W // (2**s)]),
            clip_value_min=0.0,
            clip_value_max=1.0)
        for s, flowr in enumerate(optical_flows_rev)
    ]

    pixel_loss_depth = 0
    # These two stay at 0 here; they are still reported as scalar summaries.
    pixel_loss_optical = 0
    exp_loss = 0
    tgt_image_all = []
    src_image_all = []
    proj_image_depth_all = []
    proj_error_depth_all = []
    flyout_map_all = []

    for s in range(opt.num_scales):
        # Scale the source and target images for computing loss at the
        # according scale.
        curr_tgt_image = tf.image.resize_area(
            image1,
            [int(opt.img_height / (2**s)), int(opt.img_width / (2**s))])
        curr_src_image = tf.image.resize_area(
            image2,
            [int(opt.img_height / (2**s)), int(opt.img_width / (2**s))])

        depth_flow, _ = inverse_warp(
            pred_depth[s][:, :, :, 0:1],
            pred_poses,
            cam2pix[:, s, :, :],  # [batchsize, scale, 3, 3]
            pix2cam[:, s, :, :])

        occu_mask = occu_masks[s]
        occu_mask_avg = tf.reduce_mean(occu_mask)

        curr_proj_image_depth = transformer_old(
            curr_src_image, depth_flow, [H // (2**s), W // (2**s)])
        curr_proj_error_depth = tf.abs(curr_proj_image_depth -
                                       curr_tgt_image)
        # Masked mean absolute error, normalised by the mean of the mask.
        pixel_loss_depth += (1.0 - opt.ssim_weight) * tf.reduce_mean(
            curr_proj_error_depth * occu_mask) / occu_mask_avg

        curr_flyout_map = occu_mask

        if opt.ssim_weight > 0:
            pixel_loss_depth += opt.ssim_weight * tf.reduce_mean(
                SSIM(curr_proj_image_depth * occu_mask, curr_tgt_image *
                     occu_mask)) / occu_mask_avg

        tgt_image_all.append(curr_tgt_image)
        src_image_all.append(curr_src_image)
        proj_image_depth_all.append(curr_proj_image_depth)
        proj_error_depth_all.append(curr_proj_error_depth)
        flyout_map_all.append(curr_flyout_map)

    self.loss = (10.0 * pixel_loss_depth + stereo_smooth_loss)

    summaries.append(tf.summary.scalar("total_loss", self.loss))
    summaries.append(
        tf.summary.scalar("pixel_loss_depth", pixel_loss_depth))
    summaries.append(
        tf.summary.scalar("pixel_loss_optical", pixel_loss_optical))
    summaries.append(tf.summary.scalar("exp_loss", exp_loss))
    summaries.append(
        tf.summary.scalar("stereo_smooth_loss", stereo_smooth_loss))

    # NOTE(review): the image/histogram summaries below are created but not
    # appended to `summaries`, so they are not part of self.summ_op.
    tf.summary.image("pred_disp", pred_disp[0][:, :, :, 0:1])

    # Only scale 0 is visualised.
    s = 0
    tf.summary.histogram("pose_0-2", pred_poses[:, 0:3])
    tf.summary.histogram("pose_3-5", pred_poses[:, 3:6])
    tf.summary.image('scale%d_depth_image' % s,
                     pred_depth[s][:, :, :, 0:1])
    tf.summary.image('scale%d_right_disparity_image' % s,
                     pred_disp[s][:, :, :, 1:2])
    tf.summary.image('scale%d_target_image' % s,
                     deprocess_image(tgt_image_all[s]))
    tf.summary.image('scale%d_src_image' % s,
                     deprocess_image(src_image_all[s]))
    tf.summary.image('scale_projected_image',
                     deprocess_image(proj_image_depth_all[s]))
    tf.summary.image('scale_proj_error_error', proj_error_depth_all[s])
    tf.summary.image('scale_flyout_mask', flyout_map_all[s])

    self.summ_op = tf.summary.merge(summaries)
def __init__(self,
             image1=None,
             image2=None,
             image1r=None,
             image2r=None,
             cam2pix=None,
             pix2cam=None,
             reuse_scope=False,
             scope=None):
    """Build the optical-flow training graph and its loss.

    Sets ``self.loss`` to ``pixel_loss_optical + flow_smooth_loss`` and
    ``self.summ_op`` to the merge of the scalar summaries collected in this
    constructor.

    Args:
        image1: frame treated as the target image in the loss; its static
            shape provides batch size, H and W.
        image2: frame treated as the source image in the loss.
        image1r: not used by this constructor.
        image2r: not used by this constructor.
        cam2pix: not used by this constructor.
        pix2cam: not used by this constructor.
        reuse_scope: `reuse` flag for the first variable scope.
        scope: variable scope name for the flow networks.
    """
    summaries = []

    batch_size, H, W, _ = map(int, image1.get_shape()[0:4])

    with tf.variable_scope(scope, reuse=reuse_scope):
        feature1 = feature_pyramid_flow(image1, reuse=False)
        feature2 = feature_pyramid_flow(image2, reuse=True)

        optical_flows = construct_model_pwc_full(image1, image2, feature1,
                                                 feature2)

    # The reverse flow is built in the same scope, always with reuse=True.
    with tf.variable_scope(scope, reuse=True):
        optical_flows_rev = construct_model_pwc_full(image2, image1,
                                                     feature2, feature1)

    # One mask per flow scale: forward-transform a tensor of ones with the
    # reverse flow and clip to [0, 1]. Floor division keeps the shape and
    # size entries integral on Python 3 (plain `/` would produce floats).
    occu_masks = [
        tf.clip_by_value(
            transformerFwd(
                tf.ones(
                    shape=[batch_size, H // (2**s), W // (2**s), 1],
                    dtype='float32'),
                flowr, [H // (2**s), W // (2**s)]),
            clip_value_min=0.0,
            clip_value_max=1.0)
        for s, flowr in enumerate(optical_flows_rev)
    ]

    # These two stay at 0 here; they are still reported as scalar summaries.
    pixel_loss_depth = 0
    exp_loss = 0
    pixel_loss_optical = 0
    flow_smooth_loss = 0
    tgt_image_all = []
    src_image_all = []
    proj_image_depth_all = []
    proj_error_depth_all = []
    flyout_map_all = []

    for s in range(opt.num_scales):
        # Scale the source and target images for computing loss at the
        # according scale.
        curr_tgt_image = tf.image.resize_area(
            image1,
            [int(opt.img_height / (2**s)), int(opt.img_width / (2**s))])
        curr_src_image = tf.image.resize_area(
            image2,
            [int(opt.img_height / (2**s)), int(opt.img_width / (2**s))])

        occu_mask = occu_masks[s]
        occu_mask_avg = tf.reduce_mean(occu_mask)

        curr_proj_image_optical = transformer_old(
            curr_src_image, optical_flows[s], [H // (2**s), W // (2**s)])
        curr_proj_error_optical = tf.abs(curr_proj_image_optical -
                                         curr_tgt_image)
        # Masked mean absolute error, normalised by the mean of the mask.
        pixel_loss_optical += (1.0 - opt.ssim_weight) * tf.reduce_mean(
            curr_proj_error_optical * occu_mask) / occu_mask_avg

        curr_flyout_map = occu_mask

        if opt.ssim_weight > 0:
            pixel_loss_optical += opt.ssim_weight * tf.reduce_mean(
                SSIM(curr_proj_image_optical * occu_mask, curr_tgt_image *
                     occu_mask)) / occu_mask_avg

        flow_smooth_loss += opt.flow_smooth_weight * cal_grad2_error(
            optical_flows[s] / 20.0, curr_tgt_image, 1.0)

        tgt_image_all.append(curr_tgt_image)
        src_image_all.append(curr_src_image)
        # The "depth" lists hold the flow-based projections in this model.
        proj_image_depth_all.append(curr_proj_image_optical)
        proj_error_depth_all.append(curr_proj_error_optical)
        flyout_map_all.append(curr_flyout_map)

    self.loss = (pixel_loss_optical + flow_smooth_loss)

    summaries.append(tf.summary.scalar("total_loss", self.loss))
    summaries.append(
        tf.summary.scalar("pixel_loss_depth", pixel_loss_depth))
    summaries.append(
        tf.summary.scalar("pixel_loss_optical", pixel_loss_optical))
    summaries.append(tf.summary.scalar("exp_loss", exp_loss))

    # NOTE(review): `s` is not reset here, so these image summaries use the
    # value left over from the loop above (the last scale) -- confirm whether
    # scale 0 was intended. The image summaries are also not appended to
    # `summaries`, so they are not part of self.summ_op.
    tf.summary.image('scale%d_target_image' % s,
                     deprocess_image(tgt_image_all[s]))
    tf.summary.image('scale%d_src_image' % s,
                     deprocess_image(src_image_all[s]))
    tf.summary.image('scale_projected_image',
                     deprocess_image(proj_image_depth_all[s]))
    tf.summary.image('scale_proj_error_error', proj_error_depth_all[s])
    tf.summary.image('scale_flyout_mask', flyout_map_all[s])

    self.summ_op = tf.summary.merge(summaries)
def inverse_warp_new(depth1,
                     depth2,
                     pose,
                     intrinsics,
                     intrinsics_inv,
                     flow_input,
                     occu_mask,
                     pose_mat_inverse=False):
    """Inverse warp a source image to the target image plane after refining
    the pose by rigid alignment, as described in 'Joint Unsupervised Learning
    of Optical Flow and Depth by Watching Stereo Videos' by Yang Wang et al.

    Args:
      depth1: depth map of the target image. Only its first three dimensions
          are read as (B, H, W) and it is reshaped to [B, 1, H*W].
      depth2: depth map of the source image, reshaped to [B, 1, H*W].
      pose: 6DoF pose parameters from target to source -- [B, 6]. If the
          tensor has static rank 3 it is used directly as the pose matrix.
      intrinsics: camera intrinsic matrix -- [B, 3, 3]
      intrinsics_inv: inverse of the intrinsic matrix -- [B, 3, 3]
      flow_input: flow between target and source image -- [B, H, W, 2]
      occu_mask: occlusion mask of target image, reshaped to [B, 1, H*W]
      pose_mat_inverse: if True, the pose matrix is inverted with
          tf.matrix_inverse before use.
    Returns:
      A 4-tuple:
        flow induced by the refined pose -- [B, H, W, 2]
        refined pose matrix (rigid alignment matrix times the pose matrix)
        reciprocal of the z coordinate of the target point cloud after the
          refined pose -- [B, H, W, 1]
        mask of the points used for rigid alignment -- [B, H, W, 1]
    """

    def _pixel2cam(depth, pixel_coords, intrinsics_inv):
        """Transform coordinates in the pixel frame to the camera frame"""
        cam_coords = tf.matmul(intrinsics_inv, pixel_coords) * depth
        return cam_coords

    # NOTE(review): _repeat is not called anywhere in this function.
    def _repeat(x, n_repeats):
        with tf.variable_scope('_repeat'):
            rep = tf.transpose(
                tf.expand_dims(tf.ones(shape=tf.stack([
                    n_repeats,
                ])), 1), [1, 0])
            rep = tf.cast(rep, 'int32')
            x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
            return tf.reshape(x, [-1])

    def _cam2pixel(cam_coords, proj_c2p):
        """Transform coordinates in the camera frame to the pixel frame"""
        pcoords = tf.matmul(proj_c2p, cam_coords)
        X = tf.slice(pcoords, [0, 0, 0], [-1, 1, -1])
        Y = tf.slice(pcoords, [0, 1, 0], [-1, 1, -1])
        Z = tf.slice(pcoords, [0, 2, 0], [-1, 1, -1])
        # Not tested if adding a small number is necessary
        X_norm = X / (Z + 1e-10)
        Y_norm = Y / (Z + 1e-10)
        pixel_coords = tf.concat([X_norm, Y_norm], axis=1)
        return pixel_coords

    def _meshgrid_abs(height, width):
        """Meshgrid in the absolute coordinates.

        Returns a [3, height*width] grid whose rows are x, y and ones.
        """
        x_t = tf.matmul(
            tf.ones(shape=tf.stack([height, 1])),
            tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
        y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
                        tf.ones(shape=tf.stack([1, width])))
        # Map the [-1, 1] range to [0, width] and [0, height].
        x_t = (x_t + 1.0) * 0.5 * tf.cast(width, tf.float32)
        y_t = (y_t + 1.0) * 0.5 * tf.cast(height, tf.float32)
        x_t_flat = tf.reshape(x_t, (1, -1))
        y_t_flat = tf.reshape(y_t, (1, -1))
        ones = tf.ones_like(x_t_flat)
        grid = tf.concat([x_t_flat, y_t_flat, ones], axis=0)
        return grid

    def _meshgrid_abs_xy(batch, height, width):
        """Meshgrid in the absolute coordinates.

        Returns the x and y grids separately, each tiled to
        [batch, height, width].
        """
        x_t = tf.matmul(
            tf.ones(shape=tf.stack([height, 1])),
            tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
        y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
                        tf.ones(shape=tf.stack([1, width])))
        x_t = (x_t + 1.0) * 0.5 * tf.cast(width, tf.float32)
        y_t = (y_t + 1.0) * 0.5 * tf.cast(height, tf.float32)
        return tf.tile(tf.expand_dims(x_t, 0), [batch, 1, 1]), tf.tile(tf.expand_dims(y_t, 0), [batch, 1, 1])

    def _euler2mat(z, y, x):
        """Converts euler angles to rotation matrix
         TODO: remove the dimension for 'N' (deprecated for converting all source
               poses altogether)
         Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174
        Args:
            z: rotation angle along z axis (in radians) -- size = [B, N]
            y: rotation angle along y axis (in radians) -- size = [B, N]
            x: rotation angle along x axis (in radians) -- size = [B, N]
        Returns:
            Rotation matrix corresponding to the euler angles -- size = [B, N, 3, 3]
        """
        B = tf.shape(z)[0]
        N = 1
        # Angles are clipped to [-pi, pi] before use.
        z = tf.clip_by_value(z, -np.pi, np.pi)
        y = tf.clip_by_value(y, -np.pi, np.pi)
        x = tf.clip_by_value(x, -np.pi, np.pi)

        # Expand to B x N x 1 x 1
        z = tf.expand_dims(tf.expand_dims(z, -1), -1)
        y = tf.expand_dims(tf.expand_dims(y, -1), -1)
        x = tf.expand_dims(tf.expand_dims(x, -1), -1)

        zeros = tf.zeros([B, N, 1, 1])
        ones = tf.ones([B, N, 1, 1])

        cosz = tf.cos(z)
        sinz = tf.sin(z)
        rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3)
        rotz_2 = tf.concat([sinz, cosz, zeros], axis=3)
        rotz_3 = tf.concat([zeros, zeros, ones], axis=3)
        zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2)

        cosy = tf.cos(y)
        siny = tf.sin(y)
        roty_1 = tf.concat([cosy, zeros, siny], axis=3)
        roty_2 = tf.concat([zeros, ones, zeros], axis=3)
        roty_3 = tf.concat([-siny, zeros, cosy], axis=3)
        ymat = tf.concat([roty_1, roty_2, roty_3], axis=2)

        cosx = tf.cos(x)
        sinx = tf.sin(x)
        rotx_1 = tf.concat([ones, zeros, zeros], axis=3)
        rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3)
        rotx_3 = tf.concat([zeros, sinx, cosx], axis=3)
        xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2)

        # Rotations are composed as X * Y * Z.
        rotMat = tf.matmul(tf.matmul(xmat, ymat), zmat)
        return rotMat

    def _pose_vec2mat(vec):
        """Converts 6DoF parameters to transformation matrix
        Args:
            vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6]
        Returns:
            A transformation matrix -- [B, 4, 4]
        """
        translation = tf.slice(vec, [0, 0], [-1, 3])
        translation = tf.expand_dims(translation, -1)
        rx = tf.slice(vec, [0, 3], [-1, 1])
        ry = tf.slice(vec, [0, 4], [-1, 1])
        rz = tf.slice(vec, [0, 5], [-1, 1])
        rot_mat = _euler2mat(rz, ry, rx)
        rot_mat = tf.squeeze(rot_mat, squeeze_dims=[1])
        filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
        # batch_size is read from the enclosing function; it is assigned
        # there before this helper is called.
        filler = tf.tile(filler, [batch_size, 1, 1])
        transform_mat = tf.concat([rot_mat, translation], axis=2)
        transform_mat = tf.concat([transform_mat, filler], axis=1)
        return transform_mat

    dims = tf.shape(depth1)
    batch_size, img_height, img_width = dims[0], dims[1], dims[2]

    depth1 = tf.reshape(depth1, [batch_size, 1, img_height * img_width])
    grid = _meshgrid_abs(img_height, img_width)
    grid = tf.tile(tf.expand_dims(grid, 0), [batch_size, 1, 1])

    # Point Cloud Q_1
    cam_coords1 = _pixel2cam(depth1, grid, intrinsics_inv)
    ones = tf.ones([batch_size, 1, img_height * img_width])
    cam_coords1_hom = tf.concat([cam_coords1, ones], axis=1)

    # A rank-3 `pose` is taken to already be a pose matrix; otherwise it is
    # converted from the 6DoF vector.
    if len(pose.get_shape().as_list()) == 3:
        pose_mat = pose
    else:
        pose_mat = _pose_vec2mat(pose)

    if pose_mat_inverse:
        pose_mat = tf.matrix_inverse(pose_mat)

    # Point Cloud \hat{Q_1}
    cam_coords1_trans = tf.matmul(pose_mat, cam_coords1_hom)[:, 0:3, :]

    depth2 = tf.reshape(depth2, [batch_size, 1, img_height * img_width])
    # Point Cloud Q_2
    cam_coords2 = _pixel2cam(depth2, grid, intrinsics_inv)
    cam_coords2 = tf.reshape(cam_coords2, [batch_size, 3, img_height, img_width])
    cam_coords2 = tf.transpose(cam_coords2, [0, 2, 3, 1])
    cam_coords2_trans = transformer_old(cam_coords2, flow_input, [img_height, img_width])

    # Point Cloud \tilde{Q_1}
    cam_coords2_trans = tf.reshape(
        tf.transpose(cam_coords2_trans, [0, 3, 1, 2]), [batch_size, 3, -1])

    occu_mask = tf.reshape(occu_mask, [batch_size, 1, -1])
    # To eliminate occluded area from the small_mask: where the mask is below
    # 0.75 the distance is multiplied by 10000.0, elsewhere by 1.
    occu_mask = tf.where(occu_mask < 0.75,
                         tf.ones_like(occu_mask) * 10000.0,
                         tf.ones_like(occu_mask))

    # Per-point Euclidean distance between the two point clouds.
    diff2 = tf.sqrt(
        tf.reduce_sum(tf.square(cam_coords1_trans - cam_coords2_trans), axis=1, keep_dims=True)) * occu_mask
    # Keep the points whose distance is below the 25th percentile.
    small_mask = tf.where(
        diff2 < tf.contrib.distributions.percentile(
            diff2, 25.0, axis=2, keep_dims=True),
        tf.ones_like(diff2), tf.zeros_like(diff2))

    # Delta T
    rigid_pose_mat = calculate_pose_basis(cam_coords1_trans, cam_coords2_trans, small_mask, batch_size)
    # T' = deltaT x T
    pose_mat2 = tf.matmul(rigid_pose_mat, pose_mat)

    # Get projection matrix for tgt camera frame to source pixel frame
    hom_filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
    hom_filler = tf.tile(hom_filler, [batch_size, 1, 1])
    # Pad the 3x3 intrinsics to 4x4 (zero column, then [0, 0, 0, 1] row).
    intrinsics = tf.concat([intrinsics, tf.zeros([batch_size, 3, 1])], axis=2)
    intrinsics = tf.concat([intrinsics, hom_filler], axis=1)
    proj_cam_to_src_pixel = tf.matmul(intrinsics, pose_mat2)
    src_pixel_coords = _cam2pixel(cam_coords1_hom, proj_cam_to_src_pixel)
    src_pixel_coords = tf.reshape(src_pixel_coords, [batch_size, 2, img_height, img_width])
    src_pixel_coords = tf.transpose(src_pixel_coords, perm=[0, 2, 3, 1])

    # Flow is the projected source coordinate minus the target grid
    # coordinate.
    tgt_pixel_coords_x, tgt_pixel_coords_y = _meshgrid_abs_xy(
        batch_size, img_height, img_width)
    flow_x = src_pixel_coords[:, :, :, 0] - tgt_pixel_coords_x
    flow_y = src_pixel_coords[:, :, :, 1] - tgt_pixel_coords_y
    flow = tf.concat([tf.expand_dims(flow_x, -1), tf.expand_dims(flow_y, -1)], axis=-1)

    # z coordinate of the target point cloud after the refined pose; its
    # reciprocal is returned as the transformed disparity.
    cam_coords1_trans_z = tf.matmul(pose_mat2, cam_coords1_hom)[:, 2:3, :]
    cam_coords1_trans_z = tf.reshape(cam_coords1_trans_z, [batch_size, img_height, img_width, 1])
    disp1_trans = 1.0 / cam_coords1_trans_z

    return flow, pose_mat2, disp1_trans, tf.reshape(
        small_mask, [batch_size, img_height, img_width, 1])
def construct_model_pwc_full_disp(feature1, feature2, image1, neg=False):
    """Build the coarse-to-fine estimator with sign-constrained output.

    Same cascade as a cost-volume / decoder pyramid from level 6 down to
    level 2, but after every stage the flow is clamped to one sign:
    non-negative when `neg` is False, non-positive when `neg` is True.
    Channel 0 of the level 2..5 flows is divided by the width at that level,
    resized, and returned as disparities.

    Args:
        feature1: sequence of exactly six feature maps for the first image,
            ordered level 1 .. level 6 (level 1 is not used).
        feature2: sequence of exactly six feature maps for the second image,
            same ordering.
        image1: image tensor; only its static shape is read (dimensions 1
            and 2 are used as H and W).
        neg: selects the sign of the clamp; when True the returned
            disparities are additionally negated.

    Returns:
        A 4-tuple of disparity tensors resized to [H, W], [H // 2, W // 2],
        [H // 4, W // 4] and [H // 8, W // 8].
    """
    _, H, W, _ = map(int, image1.get_shape()[0:4])

    # Unpacking six names keeps the original length check on the inputs.
    _, feat1_2, feat1_3, feat1_4, feat1_5, feat1_6 = feature1
    _, feat2_2, feat2_3, feat2_4, feat2_5, feat2_6 = feature2

    def _clamp(flow):
        """Clamp `flow` to be non-positive (neg) or non-negative."""
        if neg:
            return -tf.nn.relu(-flow)
        return tf.nn.relu(flow)

    def _refine(coarse_flow, feat1, feat2, level, first_channel_only):
        """Refine `coarse_flow` at `level`; returns (unclamped flow, decoder features)."""
        # Floor division keeps the sizes integral on Python 3 too; the
        # original `/` only produced ints under Python 2 semantics.
        size = [H // (2**level), W // (2**level)]
        upsampled = tf.image.resize_bilinear(coarse_flow, size) * 2.0
        feat2_warped = transformer_old(feat2, upsampled, size)
        cost = cost_volumn(feat1, feat2_warped, d=4)
        if first_channel_only:
            decoder_flow = upsampled[:, :, :, 0:1]
        else:
            decoder_flow = upsampled
        residual, decoder_feat = optical_flow_decoder_dc(
            tf.concat([cost, feat1, decoder_flow], axis=3), level=level)
        return residual + upsampled, decoder_feat

    cv6 = cost_volumn(feat1_6, feat2_6, d=4)
    flow6, _ = optical_flow_decoder_dc(cv6, level=6)
    flow6 = _clamp(flow6)

    # NOTE(review): level 5 feeds the full upsampled flow to the decoder,
    # while levels 4, 3 and 2 feed only channel 0. This is kept exactly as in
    # the original because it fixes the decoder input widths.
    flow5, _ = _refine(flow6, feat1_5, feat2_5, 5, first_channel_only=False)
    flow5 = _clamp(flow5)

    flow4, _ = _refine(flow5, feat1_4, feat2_4, 4, first_channel_only=True)
    flow4 = _clamp(flow4)

    flow3, _ = _refine(flow4, feat1_3, feat2_3, 3, first_channel_only=True)
    flow3 = _clamp(flow3)

    flow2_raw, f2 = _refine(
        flow3, feat1_2, feat2_2, 2, first_channel_only=True)
    flow2_raw = _clamp(flow2_raw)

    # The context network sees channel 0 of the raw flow plus the decoder
    # features; its output is added to the raw flow.
    flow2 = context_net(tf.concat(
        [flow2_raw[:, :, :, 0:1], f2], axis=3)) + flow2_raw
    flow2 = _clamp(flow2)

    # Divide channel 0 by the width at that pyramid level, then resize.
    disp0 = tf.image.resize_bilinear(flow2[:, :, :, 0:1] / (W // (2**2)),
                                     [H, W])
    disp1 = tf.image.resize_bilinear(flow3[:, :, :, 0:1] / (W // (2**3)),
                                     [H // 2, W // 2])
    disp2 = tf.image.resize_bilinear(flow4[:, :, :, 0:1] / (W // (2**4)),
                                     [H // 4, W // 4])
    disp3 = tf.image.resize_bilinear(flow5[:, :, :, 0:1] / (W // (2**5)),
                                     [H // 8, W // 8])

    if neg:
        return -disp0, -disp1, -disp2, -disp3
    return disp0, disp1, disp2, disp3