def inference(images, cams, depth_num, depth_start, depth_interval, is_master_gpu=True):
    """Infer a depth map from multi-view images and cameras.

    Args:
        images: (B, N, H, W, 3) input views; view 0 is the reference view.
        cams: (B, N, 2, 4, 4) camera parameters per view.
        depth_num: python int, number of depth hypothesis planes D.
        depth_start: (B,) tensor, first depth hypothesis per batch item.
        depth_interval: per-batch depth step between hypothesis planes.
        is_master_gpu: if True, create feature-tower variables (reuse=False);
            otherwise reuse the variables built by the master tower.

    Returns:
        (estimated_depth_map (B, H, W, 1), prob_map, ref_tower, view_towers)
    """
    # dynamic gpu params
    depth_end = depth_start + (tf.cast(depth_num, tf.float32) - 1) * depth_interval

    # reference image and camera (view index 0)
    ref_image = tf.squeeze(tf.slice(images, [0, 0, 0, 0, 0], [-1, 1, -1, -1, 3]), axis=1)
    ref_cam = tf.squeeze(tf.slice(cams, [0, 0, 0, 0, 0], [-1, 1, 2, 4, 4]), axis=1)

    # image feature extraction: master GPU creates the variables, others reuse
    if is_master_gpu:
        ref_tower = UniNetDS2({'data': ref_image}, is_training=True, reuse=False)
    else:
        ref_tower = UniNetDS2({'data': ref_image}, is_training=True, reuse=True)
    view_towers = []
    for view in range(1, FLAGS.view_num):
        view_image = tf.squeeze(tf.slice(images, [0, view, 0, 0, 0], [-1, 1, -1, -1, -1]), axis=1)
        view_tower = UniNetDS2({'data': view_image}, is_training=True, reuse=True)
        view_towers.append(view_tower)

    # get all homographies (one (B, D, 3, 3) stack per non-reference view)
    view_homographies = []
    for view in range(1, FLAGS.view_num):
        view_cam = tf.squeeze(tf.slice(cams, [0, view, 0, 0, 0], [-1, 1, 2, 4, 4]), axis=1)
        homographies = get_homographies(ref_cam, view_cam, depth_num=depth_num,
                                        depth_start=depth_start, depth_interval=depth_interval)
        view_homographies.append(homographies)

    # build cost volume by differentiable homography
    with tf.name_scope('cost_volume_homography'):
        depth_costs = []
        for d in range(depth_num):
            # compute cost (variation metric): Var = E[x^2] - E[x]^2 across views
            ave_feature = ref_tower.get_output()
            ave_feature2 = tf.square(ref_tower.get_output())
            for view in range(0, FLAGS.view_num - 1):
                homography = tf.slice(view_homographies[view], begin=[0, d, 0, 0], size=[-1, 1, 3, 3])
                homography = tf.squeeze(homography, axis=1)
                warped_view_feature = homography_warping(view_towers[view].get_output(), homography)
                ave_feature = ave_feature + warped_view_feature
                ave_feature2 = ave_feature2 + tf.square(warped_view_feature)
            ave_feature = ave_feature / FLAGS.view_num
            ave_feature2 = ave_feature2 / FLAGS.view_num
            cost = ave_feature2 - tf.square(ave_feature)
            depth_costs.append(cost)
        cost_volume = tf.stack(depth_costs, axis=1)

    # filtered cost volume, size of (B, D, H, W, 1)
    if is_master_gpu:
        filtered_cost_volume_tower = RegNetUS0({'data': cost_volume}, is_training=True, reuse=False)
    else:
        filtered_cost_volume_tower = RegNetUS0({'data': cost_volume}, is_training=True, reuse=True)
    filtered_cost_volume = tf.squeeze(filtered_cost_volume_tower.get_output(), axis=-1)

    # depth map by softArgmin
    with tf.name_scope('soft_arg_min'):
        # probability volume by softmax over the depth axis.
        # NOTE: was `dim=1` (deprecated TF argument); `axis=1` matches
        # inference_mem and is the supported spelling.
        probability_volume = tf.nn.softmax(tf.scalar_mul(-1, filtered_cost_volume),
                                           axis=1, name='prob_volume')
        # depth image by soft argmin: expectation of depth under the probability volume
        volume_shape = tf.shape(probability_volume)
        soft_2d = []
        for i in range(FLAGS.batch_size):
            soft_1d = tf.linspace(depth_start[i], depth_end[i], tf.cast(depth_num, tf.int32))
            soft_2d.append(soft_1d)
        soft_2d = tf.reshape(tf.stack(soft_2d, axis=0), [volume_shape[0], volume_shape[1], 1, 1])
        soft_4d = tf.tile(soft_2d, [1, 1, volume_shape[2], volume_shape[3]])
        estimated_depth_map = tf.reduce_sum(soft_4d * probability_volume, axis=1)
        estimated_depth_map = tf.expand_dims(estimated_depth_map, axis=3)

    # probability (confidence) map derived from the probability volume
    prob_map = get_propability_map(probability_volume, estimated_depth_map, depth_start, depth_interval)

    return estimated_depth_map, prob_map, ref_tower, view_towers
def refinement(init_depth_images, cams, depth_num, depth_start, depth_interval, images,
               prob_vol, ref_id, view_id, view_homographies=None, num_depths=None,
               depth_ref_id=None, depth_view_id=None):
    """Refine a probability volume using photometric/geometric cues (N=2 views).

    Args:
        init_depth_images: (B, N, H, W, 1) initial depth maps per view.
        cams: (B, N, 2, 4, 4) camera parameters.
        depth_num: python int, number of depth planes D.
        depth_start: first depth hypothesis.
        depth_interval: step between depth hypotheses.
        images: (B, N, H, W, 3) input views.
        prob_vol: (B, D, H, W) probability volume to refine.
        ref_id / view_id: indices of the reference / source views in `cams`/`images`.
        view_homographies: optional precomputed (B, D, 3, 3); built here when None.
        num_depths: number of depth maps used for the visual hull
            (defaults to FLAGS.view_num).
        depth_ref_id / depth_view_id: indices into `init_depth_images`
            (default to ref_id / view_id).

    Returns:
        (intermediate feature 'global_refine_3dconv6_1', refined (B, D, H, W) volume)
    """
    # Fix: identity comparison with None (was `== None`, PEP 8 / E711).
    if depth_ref_id is None:
        depth_ref_id = ref_id
    if depth_view_id is None:
        depth_view_id = view_id
    if num_depths is None:
        num_depths = FLAGS.view_num

    prob_vol = tf.expand_dims(prob_vol, axis=-1)
    init_depth_image = tf.squeeze(
        tf.slice(init_depth_images, [0, depth_ref_id, 0, 0, 0], [-1, 1, -1, -1, 1]), axis=1)  # (B, H, W, 1)
    init_depth_image_view = tf.squeeze(
        tf.slice(init_depth_images, [0, depth_view_id, 0, 0, 0], [-1, 1, -1, -1, 1]), axis=1)  # (B, H, W, 1)
    ref_cam = tf.squeeze(tf.slice(cams, [0, ref_id, 0, 0, 0], [-1, 1, 2, 4, 4]), axis=1)
    view_cam = tf.squeeze(tf.slice(cams, [0, view_id, 0, 0, 0], [-1, 1, 2, 4, 4]), axis=1)
    # source-view depth values expressed in the reference camera's coordinates
    init_depth_image_view_trans = transform_depth(
        init_depth_image_view, view_cam, ref_cam)  # (B, H, W, 1)

    if view_homographies is None:
        view_homographies = get_homographies(ref_cam, view_cam, depth_num=depth_num,
                                             depth_start=depth_start, depth_interval=depth_interval)

    # image feature extraction (shallow features for photometric comparison)
    ref_feature, view_feature = extract_feature_shallow(images, ref_id, view_id)
    chan_num = ref_feature.get_shape()[3]

    #####
    # photometric cost volume (L1 norm) with low level feature
    #####
    with tf.name_scope('global_refine_photo_cost_volume'):
        # view2ref cost volume: per depth plane, warp source feature and take |diff|
        photo_costs = []
        for d in range(depth_num):
            homography = tf.slice(view_homographies, begin=[0, d, 0, 0], size=[-1, 1, 3, 3])
            homography = tf.squeeze(homography, axis=1)
            warped_view_feature, valid_mask = homography_warping(
                view_feature, homography, output_mask=True)
            # mask out pixels that warped from outside the source image
            err_photo_L1 = tf.multiply(
                tf.abs(warped_view_feature - ref_feature),
                tf.cast(tf.tile(valid_mask, [1, 1, 1, chan_num]), ref_feature.dtype))
            photo_costs.append(err_photo_L1)
        cost_vol_photo = tf.stack(photo_costs, axis=1)  # size of (B, D, H, W, F)

    #####
    # geometric cost volume (L1 norm)
    #####
    with tf.name_scope('global_refine_geo_cost_volume'):
        cost_volume_geo_view = []
        cost_volume_geo_ref = []
        for d in range(depth_num):
            ref_depth_value = depth_start + tf.cast(d, tf.float32) * depth_interval
            # normalized distance of the initial ref depth from this plane
            cost_volume_geo_ref.append(
                tf.abs(init_depth_image - ref_depth_value) /
                depth_interval / tf.cast(depth_num, tf.float32))
            homography = tf.slice(view_homographies, begin=[0, d, 0, 0], size=[-1, 1, 3, 3])
            homography = tf.squeeze(homography, axis=1)
            warped_view_init_depth_inRef, view_valid_mask = homography_warping(
                init_depth_image_view_trans, homography, output_mask=True)
            # NOTE(review): mask is tiled by chan_num (feature channels) although
            # the depth error has a single channel; broadcasting makes geo_errors
            # chan_num-wide, so cost_volume_geo below is (B, D, H, W, 1+chan_num),
            # not the (B, D, H, W, 2) the comment suggests — confirm intent.
            geo_errors = tf.multiply(
                tf.abs(warped_view_init_depth_inRef - ref_depth_value) /
                depth_interval / tf.cast(depth_num, tf.float32),
                tf.cast(tf.tile(view_valid_mask, [1, 1, 1, chan_num]),
                        init_depth_image.dtype))
            cost_volume_geo_view.append(geo_errors)
        cost_volume_geo_ref = tf.stack(cost_volume_geo_ref, axis=1)  # size of (B, D, H, W, 1)
        cost_volume_geo = tf.concat(
            [cost_volume_geo_ref, tf.stack(cost_volume_geo_view, axis=1)],
            axis=-1)  # size of (B, D, H, W, 2)

    #####
    # photo and geo error
    #####
    with tf.name_scope('global_refine_photo_geo_error'):
        # photo error at the initial depth estimate (single warp, tiled over D)
        warped_feature, valid_mask_photo = homography_warping_by_depth(
            view_feature, ref_cam, view_cam, init_depth_image, output_mask=True)
        photo_err = tf.multiply(
            tf.abs(warped_feature - ref_feature),
            tf.cast(
                tf.tile(valid_mask_photo, [1, 1, 1, ref_feature.get_shape()[3]]),
                ref_feature.dtype))
        photo_err = tf.tile(tf.expand_dims(photo_err, axis=1), [1, depth_num, 1, 1, 1])
        # geo error: consistency between ref depth and warped source depth
        view_depth_warped2ref, valid_mask_geo = homography_warping_by_depth(
            init_depth_image_view_trans, ref_cam, view_cam, init_depth_image,
            output_mask=True, method='nearest')
        geo_err = tf.multiply(tf.abs(view_depth_warped2ref - init_depth_image),
                              tf.cast(valid_mask_geo, init_depth_image.dtype))
        geo_err = tf.tile(tf.expand_dims(geo_err, axis=1), [1, depth_num, 1, 1, 1])

    #####
    # visual hull
    #####
    with tf.name_scope('global_refine_visual_hull'):
        # visual_hull in(B, N, H, W), out(B, D, H, W, 1)
        vis_hull = get_visual_hull(tf.squeeze(init_depth_images, axis=-1), cams,
                                   depth_num, depth_start, depth_interval,
                                   ref_id=ref_id, view_num=num_depths)

    #####
    # refinement network
    #####
    ref_cost_volume = tf.tile(tf.expand_dims(ref_feature, axis=1), [1, depth_num, 1, 1, 1])
    ref_geo_volume = tf.tile(tf.expand_dims(init_depth_image, axis=1), [1, depth_num, 1, 1, 1])
    prob_vol_residual_tower = CostVolRefineNet(
        {
            'photo_group': tf.concat([cost_vol_photo, photo_err, ref_cost_volume], axis=-1),
            'geo_group': tf.concat([cost_volume_geo, geo_err, ref_geo_volume], axis=-1),
            'prob_vol': prob_vol,
            'vis_hull': vis_hull
        },
        is_training=True,
        reuse=tf.AUTO_REUSE)

    return prob_vol_residual_tower.get_output_by_name(
        'global_refine_3dconv6_1'), tf.squeeze(
            prob_vol_residual_tower.get_output(), axis=-1)
def inference_mem(images, cams, depth_num, depth_start, depth_interval, is_master_gpu=True):
    """Infer a depth map from multi-view images and cameras (memory-saving variant).

    Same contract as `inference`, but accumulates the cost volume in
    non-trainable local variables via `tf.while_loop` instead of keeping
    every warped feature map live in the graph.

    Args:
        images: (B, N, H, W, 3) input views; view 0 is the reference view.
        cams: (B, N, 2, 4, 4) camera parameters per view.
        depth_num: python int, number of depth hypothesis planes D.
        depth_start: (B,) tensor, first depth hypothesis per batch item.
        depth_interval: per-batch depth step between hypothesis planes.
        is_master_gpu: if True, master tower owns variable creation.

    Returns:
        (estimated_depth_map (B, H, W, 1), prob_map)
    """
    # dynamic gpu params
    depth_end = depth_start + (tf.cast(depth_num, tf.float32) - 1) * depth_interval
    feature_c = 32
    # Fix: use floor division — `/` yields a float under Python 3 and breaks
    # the static shapes passed to tf.zeros below.
    feature_h = FLAGS.max_h // 4
    feature_w = FLAGS.max_w // 4

    # reference image and camera (view index 0)
    ref_image = tf.squeeze(tf.slice(images, [0, 0, 0, 0, 0], [-1, 1, -1, -1, 3]), axis=1)
    ref_cam = tf.squeeze(tf.slice(cams, [0, 0, 0, 0, 0], [-1, 1, 2, 4, 4]), axis=1)

    # image feature extraction
    if is_master_gpu:
        ref_tower = UniNetDS2({'data': ref_image}, is_training=True, reuse=tf.AUTO_REUSE)
    else:
        ref_tower = UniNetDS2({'data': ref_image}, is_training=True, reuse=True)
    ref_feature = ref_tower.get_output()
    ref_feature2 = tf.square(ref_feature)
    view_features = []
    for view in range(1, FLAGS.view_num):
        view_image = tf.squeeze(tf.slice(images, [0, view, 0, 0, 0], [-1, 1, -1, -1, -1]), axis=1)
        view_tower = UniNetDS2({'data': view_image}, is_training=True, reuse=tf.AUTO_REUSE)
        view_features.append(view_tower.get_output())
    view_features = tf.stack(view_features, axis=0)

    # get all homographies, stacked to (N-1, B, D, 3, 3) for tensor indexing
    view_homographies = []
    for view in range(1, FLAGS.view_num):
        view_cam = tf.squeeze(tf.slice(cams, [0, view, 0, 0, 0], [-1, 1, 2, 4, 4]), axis=1)
        homographies = get_homographies(ref_cam, view_cam, depth_num=depth_num,
                                        depth_start=depth_start, depth_interval=depth_interval)
        view_homographies.append(homographies)
    view_homographies = tf.stack(view_homographies, axis=0)

    # build cost volume by differentiable homography
    with tf.name_scope('cost_volume_homography'):
        depth_costs = []
        for d in range(depth_num):
            # compute cost (standard deviation feature) with in-place accumulators
            ave_feature = tf.Variable(
                tf.zeros([FLAGS.batch_size, feature_h, feature_w, feature_c]),
                name='ave', trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
            ave_feature2 = tf.Variable(
                tf.zeros([FLAGS.batch_size, feature_h, feature_w, feature_c]),
                name='ave2', trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
            # seed accumulators with the reference feature / its square
            ave_feature = tf.assign(ave_feature, ref_feature)
            ave_feature2 = tf.assign(ave_feature2, ref_feature2)

            def body(view, ave_feature, ave_feature2):
                """Loop body: warp one source view and accumulate sum / sum-of-squares."""
                homography = tf.slice(view_homographies[view], begin=[0, d, 0, 0], size=[-1, 1, 3, 3])
                homography = tf.squeeze(homography, axis=1)
                warped_view_feature = homography_warping(view_features[view], homography)
                ave_feature = tf.assign_add(ave_feature, warped_view_feature)
                ave_feature2 = tf.assign_add(ave_feature2, tf.square(warped_view_feature))
                view = tf.add(view, 1)
                return view, ave_feature, ave_feature2

            view = tf.constant(0)
            cond = lambda view, *_: tf.less(view, FLAGS.view_num - 1)
            _, ave_feature, ave_feature2 = tf.while_loop(
                cond, body, [view, ave_feature, ave_feature2],
                back_prop=False, parallel_iterations=1)

            # variance: E[x^2] - E[x]^2 (ave_feature holds E[x]^2 after this assign)
            ave_feature = tf.assign(
                ave_feature, tf.square(ave_feature) / (FLAGS.view_num * FLAGS.view_num))
            ave_feature2 = tf.assign(ave_feature2, ave_feature2 / FLAGS.view_num - ave_feature)
            depth_costs.append(ave_feature2)
        cost_volume = tf.stack(depth_costs, axis=1)

    # filtered cost volume, size of (B, D, H, W, 1)
    if is_master_gpu:
        filtered_cost_volume_tower = RegNetUS0({'data': cost_volume},
                                               is_training=True, reuse=tf.AUTO_REUSE)
    else:
        filtered_cost_volume_tower = RegNetUS0({'data': cost_volume},
                                               is_training=True, reuse=True)
    filtered_cost_volume = tf.squeeze(filtered_cost_volume_tower.get_output(), axis=-1)

    # depth map by softArgmin
    with tf.name_scope('soft_arg_min'):
        # probability volume by softmax over the depth axis
        probability_volume = tf.nn.softmax(tf.scalar_mul(-1, filtered_cost_volume),
                                           axis=1, name='prob_volume')
        # depth image by soft argmin: expectation of depth under the probability volume
        volume_shape = tf.shape(probability_volume)
        soft_2d = []
        for i in range(FLAGS.batch_size):
            soft_1d = tf.linspace(depth_start[i], depth_end[i], tf.cast(depth_num, tf.int32))
            soft_2d.append(soft_1d)
        soft_2d = tf.reshape(tf.stack(soft_2d, axis=0), [volume_shape[0], volume_shape[1], 1, 1])
        soft_4d = tf.tile(soft_2d, [1, 1, volume_shape[2], volume_shape[3]])
        estimated_depth_map = tf.reduce_sum(soft_4d * probability_volume, axis=1)
        estimated_depth_map = tf.expand_dims(estimated_depth_map, axis=3)

    # probability (confidence) map
    prob_map = get_propability_map(probability_volume, estimated_depth_map,
                                   depth_start, depth_interval)

    return estimated_depth_map, prob_map
def build_cost_volume(ref_feature, view_feature, cams, depth_num, depth_start,
                      depth_interval, ref_id, view_id, output_homo=False, warp_ref=False):
    """Build a two-view cost volume by differentiable homography warping.

    input:
        ref_feature/view_feature (B, H, W, F)
        cams (B, N, 2, 4, 4), N=2
    output:
        cost_volume (B, D, H, W, 2F)
        *view_homographies (B, D, 3, 3), returned when output_homo is True
    """
    def _camera_at(index):
        # pull out one view's (B, 2, 4, 4) camera block
        return tf.squeeze(tf.slice(cams, [0, index, 0, 0, 0], [-1, 1, 2, 4, 4]), axis=1)

    def _warp_per_depth(feature, homographies):
        # warp `feature` once per depth plane; returns a list of D tensors
        warped = []
        for plane in range(depth_num):
            homo = tf.slice(homographies, begin=[0, plane, 0, 0], size=[-1, 1, 3, 3])
            homo = tf.squeeze(homo, axis=1)
            warped.append(homography_warping(feature, homo))
        return warped

    # reference and source cameras
    ref_cam = _camera_at(ref_id)
    view_cam = _camera_at(view_id)

    # homographies mapping the source view onto each depth plane
    view_homographies = get_homographies(ref_cam, view_cam, depth_num=depth_num,
                                         depth_start=depth_start,
                                         depth_interval=depth_interval)

    with tf.name_scope('cost_volume_homography'):
        if warp_ref:
            # ref2ref cost volume: warp the reference feature through its own
            # per-plane homographies
            ref_homography = get_homographies(ref_cam, ref_cam, depth_num=depth_num,
                                              depth_start=depth_start,
                                              depth_interval=depth_interval)
            cost_volume = tf.stack(_warp_per_depth(ref_feature, ref_homography),
                                   axis=1)  # size of (B, D, H, W, F)
        else:
            # replicate the reference feature across every depth plane
            cost_volume = tf.tile(tf.expand_dims(ref_feature, axis=1),
                                  [1, depth_num, 1, 1, 1])

        # view2ref cost volume, concatenated along the feature axis
        warped_views = tf.stack(_warp_per_depth(view_feature, view_homographies), axis=1)
        cost_volume = tf.concat([cost_volume, warped_views],
                                axis=-1)  # size of (B, D, H, W, 2F)

    if output_homo:
        return cost_volume, view_homographies
    return cost_volume