def hier_homography_fmask_estimator(color_inputs, num_param=8, num_layer=7,
                                    num_level=3, dropout_keep_prob=0.8,
                                    reuse=None, is_training=True,
                                    trainable=True, scope='hier_hmg'):
    """A hierarchical neural network with mask for homography estimation.

    The two RGB images packed in `color_inputs` are converted to grayscale and
    processed coarse-to-fine over `num_level` levels. At every level after the
    first, channel 0 of the resized pair is warped with the homographies of
    the previous level before motion features are extracted. At the last level
    (only when `num_level` > 1) the motion features are additionally passed
    through `fmask_layers_semantic` together with VGG-16 'pool5' features
    computed from the second image.

    Args:
      color_inputs: batch of input image pairs of data type float32 and of
        shape [batch_size, height, width, 6]. Height and width are read from
        the static shape.
      num_param: the number of parameters for homography (default 8)
      num_layer: the number of convolutional layers in the motion feature
        network
      num_level: the number of hierarchical levels
      dropout_keep_prob: the percentage of activation values that are kept
      reuse: whether to reuse this network weights
      is_training: whether used for training or testing
      trainable: whether this network is to be trained or not
      scope: the scope of variables in this function

    Returns:
      A tuple `(hmgs_list, warped_list)`. `hmgs_list` has one entry per level:
      the regression output at level 0, and the output of
      `hmg_util.homography_shift_mult_batch` at each later level.
      `warped_list` has one entry for every level after the first: the stack
      of the warped channel 0 and the unwarped channel 1 at that level.
    """
    _, h_input, w_input = color_inputs.get_shape().as_list()[0:3]

    # Semantic features are taken from the second image (channels 3:6).
    # NOTE(review): the `* 256 + 128` rescaling presumably maps inputs from
    # about [-0.5, 0.5) to [0, 256) before mean subtraction -- confirm against
    # the input pipeline.
    vgg_inputs = (color_inputs[Ellipsis, 3:6] * 256 + 128) - VGG_MEANS
    # The VGG branch is built with trainable=False and is_training=False, so
    # it is not updated through this function.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='SAME'):
        with slim.arg_scope([slim.conv2d, slim.fully_connected],
                            trainable=False):
            with slim.arg_scope([slim.conv2d], normalizer_fn=None):
                with slim.arg_scope(contrib_slim_nets_vgg.vgg_arg_scope()):
                    sfeature, _ = contrib_slim_nets_vgg.vgg_16(
                        vgg_inputs,
                        1000,
                        predictions_fn=slim.softmax,
                        global_pool=False,
                        is_training=False,
                        reuse=reuse,
                        spatial_squeeze=True,
                        final_endpoint='pool5',
                        scope='vgg_16')

    gray_image1 = tf.image.rgb_to_grayscale(color_inputs[Ellipsis, 0:3])
    gray_image2 = tf.image.rgb_to_grayscale(color_inputs[Ellipsis, 3:6])
    inputs = tf.concat([gray_image1, gray_image2], 3)

    hmgs_list = []
    warped_list = []
    # `[inputs]` must be passed as `values`: the second positional parameter
    # of tf.variable_scope is `default_name`, not `values`.
    with tf.variable_scope(scope, values=[inputs], reuse=reuse):
        for level_index in range(num_level):
            # Level 0 is the coarsest; the last level runs at full resolution.
            scale = 2 ** (num_level - 1 - level_index)
            h = tf.to_float(tf.floordiv(h_input, scale))
            w = tf.to_float(tf.floordiv(w_input, scale))
            inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
            if level_index == 0:
                mfeature = hier_base_layers(
                    inputs_il,
                    num_layer + 1 - num_level + level_index,
                    level_index,
                    is_training=is_training,
                    trainable=trainable)
                hmgs_il = homography_regression(
                    mfeature, num_param, level_index,
                    dropout_keep_prob=dropout_keep_prob,
                    is_training=is_training,
                    trainable=trainable)
                hmgs_list.append(hmgs_il)
            else:
                # Warp channel 0 with the previous level's homographies;
                # w / 2 and h / 2 are the nominal size of that
                # half-resolution level.
                warped, _ = hmg_util.homography_scale_warp_per_batch(
                    inputs_il[:, :, :, 0], w / 2, h / 2,
                    hmgs_list[level_index - 1])
                pre_warped_inputs_il = tf.stack(
                    [warped, inputs_il[:, :, :, 1]], -1)
                warped_list.append(pre_warped_inputs_il)
                mfeature = hier_base_layers(
                    pre_warped_inputs_il,
                    num_layer + 1 - num_level + level_index,
                    level_index,
                    is_training=is_training,
                    trainable=trainable)
                # The semantic mask is applied at the final level only. This
                # branch is never reached when num_level == 1, in which case
                # `sfeature` goes unused.
                if level_index == num_level - 1:
                    mfeature = fmask_layers_semantic(
                        mfeature, sfeature, level_index,
                        is_training=is_training,
                        trainable=trainable)
                hmgs_il = homography_regression(
                    mfeature, num_param, level_index,
                    dropout_keep_prob=dropout_keep_prob,
                    is_training=is_training,
                    trainable=trainable)
                # Combine the previous level's homographies with this level's
                # regression output.
                new_hmgs_il = hmg_util.homography_shift_mult_batch(
                    hmgs_list[level_index - 1], w / 2, h / 2,
                    hmgs_il, w, h, w, h)
                hmgs_list.append(new_hmgs_il)
    return hmgs_list, warped_list
def hier_homography_estimator(inputs, num_param=8, num_layer=7, num_level=3,
                              dropout_keep_prob=0.8, reuse=None,
                              is_training=True, trainable=True,
                              final_endpoint=None, scope='hier_hmg'):
    """A hierarchical VGG-style neural network for homography estimation.

    The image pair is processed coarse-to-fine over `num_level` levels. At
    every level after the first, channel 0 of the resized pair is warped with
    the homographies of the previous level before motion features are
    extracted and a new homography is regressed.

    Args:
      inputs: batch of input image pairs of data type float32 and of shape
        [batch_size, height, width, 2]. Height and width are read from the
        static shape.
      num_param: the number of parameters for homography (default 8)
      num_layer: the number of convolutional layers in the motion feature
        network
      num_level: the number of hierarchical levels
      dropout_keep_prob: the percentage of activation values that are kept
      reuse: whether to reuse this network weights
      is_training: whether used for training or testing
      trainable: whether this network is to be trained or not
      final_endpoint: specifies the endpoint to construct the network up to.
        Only the value 'mfeature' is checked, and only at the last level when
        that level is not level 0 (i.e. `num_level` > 1).
      scope: the scope of variables in this function

    Returns:
      If `final_endpoint` == 'mfeature' and `num_level` > 1: a tuple
      `(hmgs_list, mfeature)`, where `hmgs_list` holds the homographies of the
      levels before the last one and `mfeature` is the output of
      `hier_base_layers` at the last level.
      Otherwise: a tuple `(hmgs_list, warped_list)`, where `hmgs_list` has one
      entry per level and `warped_list` has one entry for every level after
      the first (the stack of the warped channel 0 and the unwarped channel 1
      at that level).
    """
    _, h_input, w_input = inputs.get_shape().as_list()[0:3]

    hmgs_list = []
    warped_list = []
    # `[inputs]` must be passed as `values`: the second positional parameter
    # of tf.variable_scope is `default_name`, not `values`.
    with tf.variable_scope(scope, values=[inputs], reuse=reuse):
        for level_index in range(num_level):
            # Level 0 is the coarsest; the last level runs at full resolution.
            scale = 2 ** (num_level - 1 - level_index)
            h = tf.to_float(tf.floordiv(h_input, scale))
            w = tf.to_float(tf.floordiv(w_input, scale))
            inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
            if level_index == 0:
                mfeature = hier_base_layers(
                    inputs_il,
                    num_layer + 1 - num_level + level_index,
                    level_index,
                    is_training=is_training,
                    trainable=trainable)
                hmgs_il = homography_regression(
                    mfeature, num_param, level_index,
                    dropout_keep_prob=dropout_keep_prob,
                    is_training=is_training,
                    trainable=trainable)
                hmgs_list.append(hmgs_il)
            else:
                # Warp channel 0 with the previous level's homographies;
                # w / 2 and h / 2 are the nominal size of that
                # half-resolution level.
                warped, _ = hmg_util.homography_scale_warp_per_batch(
                    inputs_il[:, :, :, 0], w / 2, h / 2,
                    hmgs_list[level_index - 1])
                pre_warped_inputs_il = tf.stack(
                    [warped, inputs_il[:, :, :, 1]], -1)
                warped_list.append(pre_warped_inputs_il)
                if (level_index == num_level - 1 and
                        final_endpoint == 'mfeature'):
                    # Early exit with the motion features. Note the second
                    # argument is one less than in the regular path below.
                    mfeature = hier_base_layers(
                        pre_warped_inputs_il,
                        num_layer - num_level + level_index,
                        level_index,
                        is_training=is_training,
                        trainable=trainable)
                    return hmgs_list, mfeature
                else:
                    mfeature = hier_base_layers(
                        pre_warped_inputs_il,
                        num_layer + 1 - num_level + level_index,
                        level_index,
                        is_training=is_training,
                        trainable=trainable)
                hmgs_il = homography_regression(
                    mfeature, num_param, level_index,
                    dropout_keep_prob=dropout_keep_prob,
                    is_training=is_training,
                    trainable=trainable)
                # Combine the previous level's homographies with this level's
                # regression output.
                new_hmgs_il = hmg_util.homography_shift_mult_batch(
                    hmgs_list[level_index - 1], w / 2, h / 2,
                    hmgs_il, w, h, w, h)
                hmgs_list.append(new_hmgs_il)
    return hmgs_list, warped_list