def res_fcn_32s(inputs, num_classes, is_training):
    with tf.variable_scope('res_fcn_32s'):
        # Use the structure of the resnet_v1_50 classification network
        with slim.arg_scope(resnet_v1.resnet_arg_scope()):
            net, end_points = resnet_v1.resnet_v1_50(inputs,
                                                     num_classes,
                                                     is_training=is_training,
                                                     global_pool=False,
                                                     output_stride=32)
        # Deconvolutional layer to recover the size of the input image.
        # Padding is 'SAME' for the conv layers, so they do not change the
        # spatial size; the stride-32 backbone reduces the size by a factor
        # of 2^5 = 32, which is why this model is called fcn_32s.
        # Use bilinear interpolation for upsampling.
        upsample_filter = upsampling.bilinear_upsample_weights(32, num_classes)
        upsample_filter_tensor = tf.constant(upsample_filter)
        shape = tf.shape(net)
        output = tf.nn.conv2d_transpose(
            net,
            upsample_filter_tensor,
            output_shape=tf.stack(
                [shape[0], shape[1] * 32, shape[2] * 32, shape[3]]),
            strides=[1, 32, 32, 1])
        variables = slim.get_variables('res_fcn_32s')
        # Extract variables with the same names as the original resnet_v1_50
        # so they can be initialized from a pre-trained checkpoint. Slicing
        # [12:-2] strips the 'res_fcn_32s/' prefix and the ':0' suffix.
        res_variables = {}
        for variable in variables:
            res_variables[variable.name[12:-2]] = variable
        return output, res_variables
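# A minimal sketch of the bilinear kernel helper used throughout these
# snippets (the original upsampling module is not shown here; this assumes
# the conventional FCN construction of a fixed, per-class bilinear filter).
import numpy as np

def upsample_filt(size):
    # 2-D bilinear interpolation kernel of the given side length
    factor = (size + 1) // 2
    if size % 2 == 1:
        center = factor - 1
    else:
        center = factor - 0.5
    og = np.ogrid[:size, :size]
    return ((1 - abs(og[0] - center) / factor) *
            (1 - abs(og[1] - center) / factor))

def bilinear_upsample_weights(factor, number_of_classes):
    # Non-mixing (diagonal) bilinear filter for tf.nn.conv2d_transpose;
    # filter shape is [height, width, output_channels, in_channels]
    filter_size = 2 * factor - factor % 2
    weights = np.zeros((filter_size, filter_size,
                        number_of_classes, number_of_classes),
                       dtype=np.float32)
    kernel = upsample_filt(filter_size)
    for i in range(number_of_classes):
        weights[:, :, i, i] = kernel
    return weights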
def fcn16s(self, net, reuse=False):
    with tf.variable_scope('fcn16s') as scope:
        if reuse:
            tf.get_variable_scope().reuse_variables()
        upscore_weights = bilinear_upsample_weights(2, self.n_classes)
        output_weights = bilinear_upsample_weights(16, self.n_classes)
        upscore_weights_tensor = tf.constant(upscore_weights)
        output_weights_tensor = tf.constant(output_weights)

        ## Score from pool4:
        pool4_score = slim.convolution2d(self.pool4, self.n_classes, 1, 1,
                                         padding='SAME',
                                         scope='pool4_score', reuse=reuse)
        pool4_h, pool4_w = pool4_score.get_shape().as_list()[1:3]

        ## Upsample the stream by a factor of 2
        # (tf.nn.conv2d_transpose takes no `reuse` argument; the bilinear
        # filter is a constant, so there is nothing to reuse anyway)
        batch_size, h, w, _ = net.get_shape().as_list()
        upscore = tf.nn.conv2d_transpose(
            net, upscore_weights_tensor,
            [batch_size, h * 2, w * 2, self.n_classes], [1, 2, 2, 1])

        ## Crop upscore to the same shape as pool4_score
        upscore_pool4_crop = tf.image.resize_image_with_crop_or_pad(
            upscore, pool4_h, pool4_w)

        ## Order-invariant combination
        upscore = pool4_score + upscore_pool4_crop

        ## Final upsample by a factor of 16 (2 * 16 = 32 overall)
        h, w = upscore.get_shape().as_list()[1:3]
        upscore = tf.nn.conv2d_transpose(
            upscore, output_weights_tensor,
            [batch_size, h * 16, w * 16, self.n_classes], [1, 16, 16, 1])

        ## Force the output to the same size as the input
        upscore = tf.image.resize_image_with_crop_or_pad(
            upscore, self.x_dim, self.y_dim)
        print('fcn16s upscore', upscore.get_shape())
        return upscore
def fcn32s(self, net, reuse=False):
    with tf.variable_scope('fcn32s') as scope:
        if reuse:
            tf.get_variable_scope().reuse_variables()
        upsample_filter_np = bilinear_upsample_weights(32, self.n_classes)
        upsample_filter_tensor = tf.constant(upsample_filter_np)
        batch_size, h, w, channels = net.get_shape().as_list()
        # Upsample by a factor of 32 with the fixed bilinear kernel
        # (tf.nn.conv2d_transpose takes no `reuse` argument)
        output = tf.nn.conv2d_transpose(
            net, upsample_filter_tensor,
            [batch_size, h * 32, w * 32, channels], [1, 32, 32, 1])
        # Force the output to the same size as the input
        output = tf.image.resize_image_with_crop_or_pad(
            output, self.x_dim, self.y_dim)
        print('fcn32s output', output.get_shape())
        return output
def upsample_tf(factor, input_img):
    number_of_classes = input_img.shape[2]
    new_height = input_img.shape[0] * factor
    new_width = input_img.shape[1] * factor
    expanded_img = np.expand_dims(input_img, axis=0).astype(np.float32)
    with tf.Graph().as_default():
        with tf.Session() as sess:
            upsample_filter_np = utile.bilinear_upsample_weights(
                factor, number_of_classes)
            res = tf.nn.conv2d_transpose(
                expanded_img,
                upsample_filter_np,
                output_shape=[1, new_height, new_width, number_of_classes],
                strides=[1, factor, factor, 1])
            res = sess.run(res)
    return np.squeeze(res)
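# A quick usage sketch (an addition, not part of the original snippets),
# assuming numpy is imported as np and the helper module is importable:
# upsample a random 3-class score map by a factor of 4 and check the size.
dummy_scores = np.random.rand(8, 8, 3).astype(np.float32)
upsampled = upsample_tf(factor=4, input_img=dummy_scores)
print(upsampled.shape)  # -> (32, 32, 3)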
def upsample(net, nf=32, upsample_factor=2):
    upsample_filter_np = upsampling.bilinear_upsample_weights(
        upsample_factor, nf)
    upsample_filter_tensor = tf.constant(upsample_filter_np)
    # The static shape must be fully defined here; see the dynamic-shape
    # variant that follows for inputs with unknown batch or spatial dims.
    batch, height, width, channels = net.get_shape().as_list()
    # Calculate the output size of the upsampled tensor
    upsampled_logits_shape = [
        batch,
        height * upsample_factor,
        width * upsample_factor,
        channels
    ]
    print(upsampled_logits_shape)
    # Perform the upsampling
    net = tf.nn.conv2d_transpose(
        net,
        upsample_filter_tensor,
        output_shape=upsampled_logits_shape,
        strides=[1, upsample_factor, upsample_factor, 1])
    return net
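# A minimal dynamic-shape variant (an addition, not in the original code):
# when the batch or spatial dimensions of `net` are unknown at
# graph-construction time, the output shape must be computed with tf.shape
# and tf.stack instead of get_shape(), as several other functions here do.
def upsample_dynamic(net, nf=32, upsample_factor=2):
    upsample_filter_tensor = tf.constant(
        upsampling.bilinear_upsample_weights(upsample_factor, nf))
    shape = tf.shape(net)  # evaluated at run time
    output_shape = tf.stack([
        shape[0],
        shape[1] * upsample_factor,
        shape[2] * upsample_factor,
        shape[3]
    ])
    return tf.nn.conv2d_transpose(
        net,
        upsample_filter_tensor,
        output_shape=output_shape,
        strides=[1, upsample_factor, upsample_factor, 1])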
def FCN_8s(image_batch_tensor,
           number_of_classes,
           new_number_of_classes,
           is_training,
           is_reuse,
           is_fine_tune=False):
    """Returns the FCN-8s model definition.

    The function returns the model definition of a network that was described
    in 'Fully Convolutional Networks for Semantic Segmentation' by Long et al.
    The network subsamples the input by a factor of 32 and uses three bilinear
    upsampling layers (2 * 2 * 8) to upsample the prediction back by a factor
    of 32. This means that if the input image size is not a multiple of 32,
    a prediction of a different size will be delivered. To adapt the network
    for an any-size input, use adapt_network_for_any_size_input(FCN_8s, 32).
    Note: the upsampling kernel is fixed in this model definition, because
    learning it didn't give significant improvements according to the
    aforementioned paper.

    Parameters
    ----------
    image_batch_tensor : [batch_size, height, width, depth] Tensor
        Tensor specifying input image batch
    number_of_classes : int
        An argument specifying the number of classes to be predicted.
        For example, for PASCAL VOC it is 21.
    new_number_of_classes : int
        Number of classes of the final 'fc_9_1' score layer.
    is_training : boolean
        An argument specifying if the network is being evaluated or trained.
        It affects the work of the underlying dropout layer of VGG-16.
    is_reuse : boolean
        Whether to reuse the variables of the 'FCN_slice' scope.
    is_fine_tune : boolean
        If True, the 'fc_9_1' variables are included in the returned mapping.

    Returns
    -------
    upsampled_logits : [batch_size, height, width, number_of_classes] Tensor
        Tensor with logits representing predictions for each class.
        Be careful, the output can be of different size compared to the input;
        use adapt_network_for_any_size_input to adapt the network for any
        input size. Otherwise, the input image sizes should be multiples of 32.
    fcn_8s_variables_mapping : dict {string: variable}
        Dict which maps the FCN-8s model's variables to FCN-16s checkpoint
        variable names. We need this to initialize the weights of the FCN-8s
        model with FCN-16s weights from a checkpoint file. Look at the ipython
        notebook for examples.
    """
    # Convert image to float32 before subtracting the
    # mean pixel value
    image_batch_float = tf.to_float(image_batch_tensor)
    # Subtract the mean pixel value from each pixel
    mean_centered_image_batch = image_batch_float - [_R_MEAN, _G_MEAN, _B_MEAN]

    upsample_filter_factor_2_np = bilinear_upsample_weights(
        factor=2, number_of_classes=number_of_classes)
    upsample_filter_factor_8_np = bilinear_upsample_weights(
        factor=8, number_of_classes=number_of_classes)
    upsample_filter_factor_2_tensor = tf.constant(upsample_filter_factor_2_np)
    upsample_filter_factor_8_tensor = tf.constant(upsample_filter_factor_8_np)

    with tf.variable_scope('FCN_slice', reuse=is_reuse):
        with tf.variable_scope("fcn_8s") as fcn_8s_scope:
            # Define the model that we want to use -- specify to use only two
            # classes at the last layer
            # TODO: make pull request to get this custom vgg feature accepted
            # to avoid using custom slim repo.
            with slim.arg_scope(vgg.vgg_arg_scope()):
                ## Original FCN-32s model definition
                last_layer_logits, end_points = vgg.vgg_16(
                    mean_centered_image_batch,
                    num_classes=number_of_classes,
                    is_training=is_training,
                    dropout_keep_prob=0.5,
                    spatial_squeeze=False,
                    use_dilated=False,
                    scope='vgg_16',
                    fc_conv_padding='SAME',
                    global_pool=False)

                last_layer_logits_shape = tf.shape(last_layer_logits)
                # Calculate the output size of the upsampled tensor
                last_layer_upsampled_by_factor_2_logits_shape = tf.stack([
                    last_layer_logits_shape[0],
                    last_layer_logits_shape[1] * 2,
                    last_layer_logits_shape[2] * 2,
                    last_layer_logits_shape[3]
                ])

                last_layer_logits = slim.batch_norm(last_layer_logits,
                                                    activation_fn=tf.nn.relu)
                # Perform the upsampling
                last_layer_upsampled_by_factor_2_logits = tf.nn.conv2d_transpose(
                    last_layer_logits,
                    upsample_filter_factor_2_tensor,
                    output_shape=last_layer_upsampled_by_factor_2_logits_shape,
                    strides=[1, 2, 2, 1],
                    name='upscore2')

                ## Adding the skip here for the FCN-16s model
                # We created vgg in the fcn_8s name scope -- so all the vgg
                # endpoints now are prepended with the fcn_8s name
                pool4_features = end_points['FCN_slice/fcn_8s/vgg_16/pool4']
                pool4_features = slim.batch_norm(pool4_features,
                                                 activation_fn=tf.nn.relu)
                # We zero-initialize the weights to start training with the
                # same accuracy that we ended training FCN-32s with
                pool4_logits = slim.conv2d(
                    pool4_features,
                    number_of_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    weights_initializer=tf.zeros_initializer(),
                    scope='pool4_fc')

                fused_last_layer_and_pool4_logits = (
                    pool4_logits + last_layer_upsampled_by_factor_2_logits)
                fused_last_layer_and_pool4_logits_shape = tf.shape(
                    fused_last_layer_and_pool4_logits)
                # Calculate the output size of the upsampled tensor
                fused_last_layer_and_pool4_upsampled_by_factor_2_logits_shape = tf.stack([
                    fused_last_layer_and_pool4_logits_shape[0],
                    fused_last_layer_and_pool4_logits_shape[1] * 2,
                    fused_last_layer_and_pool4_logits_shape[2] * 2,
                    fused_last_layer_and_pool4_logits_shape[3]
                ])

                fused_last_layer_and_pool4_logits = slim.batch_norm(
                    fused_last_layer_and_pool4_logits, activation_fn=tf.nn.relu)
                # Perform the upsampling
                fused_last_layer_and_pool4_upsampled_by_factor_2_logits = tf.nn.conv2d_transpose(
                    fused_last_layer_and_pool4_logits,
                    upsample_filter_factor_2_tensor,
                    output_shape=fused_last_layer_and_pool4_upsampled_by_factor_2_logits_shape,
                    strides=[1, 2, 2, 1],
                    name='upscore4')

                ## Adding the skip here for the FCN-8s model
                pool3_features = end_points['FCN_slice/fcn_8s/vgg_16/pool3']
                pool3_features = slim.batch_norm(pool3_features,
                                                 activation_fn=tf.nn.relu)
                # Zero-initialized 1x1 score layer, as for pool4
                pool3_logits = slim.conv2d(
                    pool3_features,
                    number_of_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    weights_initializer=tf.zeros_initializer(),
                    scope='pool3_fc')

                fused_last_layer_and_pool4_logits_and_pool_3_logits = (
                    pool3_logits +
                    fused_last_layer_and_pool4_upsampled_by_factor_2_logits)
                fused_last_layer_and_pool4_logits_and_pool_3_logits_shape = tf.shape(
                    fused_last_layer_and_pool4_logits_and_pool_3_logits)
                # Calculate the output size of the upsampled tensor
                fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits_shape = tf.stack([
                    fused_last_layer_and_pool4_logits_and_pool_3_logits_shape[0],
                    fused_last_layer_and_pool4_logits_and_pool_3_logits_shape[1] * 8,
                    fused_last_layer_and_pool4_logits_and_pool_3_logits_shape[2] * 8,
                    fused_last_layer_and_pool4_logits_and_pool_3_logits_shape[3]
                ])

                fused_last_layer_and_pool4_logits_and_pool_3_logits = slim.batch_norm(
                    fused_last_layer_and_pool4_logits_and_pool_3_logits,
                    activation_fn=tf.nn.relu)
                # Perform the upsampling
                fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits = tf.nn.conv2d_transpose(
                    fused_last_layer_and_pool4_logits_and_pool_3_logits,
                    upsample_filter_factor_8_tensor,
                    output_shape=fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits_shape,
                    strides=[1, 8, 8, 1],
                    name='upscore32')

                fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits = slim.batch_norm(
                    fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits,
                    activation_fn=tf.nn.relu)

                # Final 1x1 score layer that maps to the new number of classes.
                # _score_layer and wd (the weight decay) are defined elsewhere
                # in this codebase.
                fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits_3 = _score_layer(
                    wd,
                    fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits,
                    number_of_classes, 1, 'fc_9_1', new_number_of_classes)

            fcn_8s_variables_mapping = {}
            fcn_8s_variables = slim.get_variables(fcn_8s_scope)
            for variable in fcn_8s_variables:
                # We only need FCN-16s variables to restore from the checkpoint;
                # variables specific to FCN-8s should be freshly initialized
                if not is_fine_tune:
                    if 'fc_9_1' in variable.name:
                        continue
                # Here we remove the part of the variable name
                # that comes from the current variable scope
                original_fcn_8s_checkpoint_string = (
                    'FCN_slice/fcn_8s' + variable.name[len(fcn_8s_scope.name):-2])
                fcn_8s_variables_mapping[original_fcn_8s_checkpoint_string] = variable

    return (fused_last_layer_and_pool4_logits_and_pool_3_upsampled_by_factor_8_logits_3,
            fcn_8s_variables_mapping)
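# A hedged usage sketch (an addition, not in the original code): the
# docstrings here reference adapt_network_for_any_size_input from the
# tf-image-segmentation codebase to handle inputs whose sides are not
# multiples of 32. Assuming it wraps a model definition together with its
# size multiple, usage would look roughly like this:
fcn_8s_any_size = adapt_network_for_any_size_input(FCN_8s, 32)
upsampled_logits, fcn_8s_variables_mapping = fcn_8s_any_size(
    image_batch_tensor=image_batch,      # a batch of arbitrary spatial size
    number_of_classes=21,
    new_number_of_classes=2,
    is_training=True,
    is_reuse=False)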
def fcn_32s(inputs, num_classes, is_training, dropout_keep_prob=0.5):
    with tf.variable_scope('fcn_32s'):
        # Based on the structure of the vgg-16 network
        with tf.variable_scope('vgg_16'):
            # Default settings for conv layers and fc layers
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(0, 0.01),
                    biases_initializer=tf.zeros_initializer(),
                    weights_regularizer=slim.l2_regularizer(0.0005)):
                # Default settings for conv layers
                with slim.arg_scope([slim.conv2d], padding='SAME'):
                    net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
                    net = slim.max_pool2d(net, [2, 2], scope='pool1')
                    net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
                    net = slim.max_pool2d(net, [2, 2], scope='pool2')
                    net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
                    net = slim.max_pool2d(net, [2, 2], scope='pool3')
                    net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
                    net = slim.max_pool2d(net, [2, 2], scope='pool4')
                    net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
                    net = slim.max_pool2d(net, [2, 2], scope='pool5')
                    # Change from fc layers to conv layers
                    net = slim.conv2d(net, 4096, [7, 7], scope='fc6')
                    net = slim.dropout(net, dropout_keep_prob,
                                       is_training=is_training, scope='dropout6')
                    net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
                    net = slim.dropout(net, dropout_keep_prob,
                                       is_training=is_training, scope='dropout7')
                    net = slim.conv2d(net, num_classes, [1, 1],
                                      activation_fn=None, normalizer_fn=None,
                                      scope='fc8')
        # Upsampling layer to recover the size of the input image,
        # based on bilinear interpolation
        upsample_filter = upsampling.bilinear_upsample_weights(32, num_classes)
        upsample_filter_tensor = tf.constant(upsample_filter)
        shape = tf.shape(net)
        output = tf.nn.conv2d_transpose(
            net,
            upsample_filter_tensor,
            output_shape=tf.stack(
                [shape[0], shape[1] * 32, shape[2] * 32, shape[3]]),
            strides=[1, 32, 32, 1])
        variables = slim.get_variables('fcn_32s')
        # Extract variables with the same names as the original vgg-16 so they
        # can be initialized with pre-trained vgg-16 parameters; slicing [8:-2]
        # turns e.g. 'fcn_32s/vgg_16/conv1/conv1_1/weights:0'
        # into 'vgg_16/conv1/conv1_1/weights'.
        vgg_variables = {}
        for variable in variables:
            vgg_variables[variable.name[8:-2]] = variable
        return output, vgg_variables
def fcn_16s(inputs, num_classes, is_training, dropout_keep_prob=0.5):
    with tf.variable_scope('fcn_16s'):
        # Based on the structure of the vgg-16 network
        with tf.variable_scope('vgg_16'):
            end_points_collection = 'vgg_16' + '_end_points'
            # Collect outputs for conv2d, fully_connected and max_pool2d.
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected, slim.max_pool2d],
                    outputs_collections=end_points_collection):
                # Default settings for conv layers and fc layers
                with slim.arg_scope(
                    [slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.truncated_normal_initializer(0, 0.01),
                        biases_initializer=tf.zeros_initializer(),
                        weights_regularizer=slim.l2_regularizer(0.0005)):
                    # Default settings for conv layers
                    with slim.arg_scope([slim.conv2d], padding='SAME'):
                        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
                        net = slim.max_pool2d(net, [2, 2], scope='pool1')
                        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
                        net = slim.max_pool2d(net, [2, 2], scope='pool2')
                        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
                        net = slim.max_pool2d(net, [2, 2], scope='pool3')
                        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
                        net = slim.max_pool2d(net, [2, 2], scope='pool4')
                        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
                        net = slim.max_pool2d(net, [2, 2], scope='pool5')
                        # Change from fc layers to conv layers
                        net = slim.conv2d(net, 4096, [7, 7], scope='fc6')
                        net = slim.dropout(net, dropout_keep_prob,
                                           is_training=is_training, scope='dropout6')
                        net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
                        net = slim.dropout(net, dropout_keep_prob,
                                           is_training=is_training, scope='dropout7')
                        output = slim.conv2d(net, num_classes, [1, 1],
                                             activation_fn=None,
                                             normalizer_fn=None, scope='fc8')
                        end_points = slim.utils.convert_collection_to_dict(
                            end_points_collection)
        # Upsampling layers to recover the size of the input image,
        # based on bilinear interpolation.
        # 1x1 score layer on the pool4 skip connection
        output_pool4 = end_points['fcn_16s/vgg_16/pool4']
        output_pool4 = slim.conv2d(output_pool4, num_classes, [1, 1],
                                   activation_fn=None, normalizer_fn=None,
                                   scope='pool4_fc')
        # Upsample the fc8 logits by 2 so they match the pool4 resolution
        upsample_filter_2 = upsampling.bilinear_upsample_weights(2, num_classes)
        upsample_filter_tensor_2 = tf.constant(upsample_filter_2)
        shape_output = tf.shape(output)
        output = tf.nn.conv2d_transpose(
            output,
            upsample_filter_tensor_2,
            output_shape=tf.stack([
                shape_output[0], shape_output[1] * 2,
                shape_output[2] * 2, shape_output[3]
            ]),
            strides=[1, 2, 2, 1])
        output_combined = output + output_pool4
        # Upsample the fused logits by 16 to reach the input resolution
        upsample_filter_16 = upsampling.bilinear_upsample_weights(16, num_classes)
        upsample_filter_tensor_16 = tf.constant(upsample_filter_16)
        shape_output_pool4 = tf.shape(output_pool4)
        output_combined = tf.nn.conv2d_transpose(
            output_combined,
            upsample_filter_tensor_16,
            output_shape=tf.stack([
                shape_output_pool4[0], shape_output_pool4[1] * 16,
                shape_output_pool4[2] * 16, shape_output_pool4[3]
            ]),
            strides=[1, 16, 16, 1])
        variables = slim.get_variables('fcn_16s')
        # Extract variables that have counterparts in the fcn_32s model so they
        # can be initialized with pre-trained fcn_32s parameters; the slicing
        # rewrites e.g. 'fcn_16s/vgg_16/fc8/weights:0'
        # to 'fcn_32s/vgg_16/fc8/weights'.
        fcn_32s_variables = {}
        for variable in variables:
            if 'pool4_fc' not in variable.name:
                fcn_32s_variables[variable.name[0:4] + '32' +
                                  variable.name[6:-2]] = variable
        return output_combined, fcn_32s_variables
def FCN_32s(image_batch_tensor, number_of_classes, is_training, reuse=None):
    """Returns the FCN-32s model definition.

    The function returns the model definition of a network that was described
    in 'Fully Convolutional Networks for Semantic Segmentation' by Long et al.
    The network subsamples the input by a factor of 32 and uses a bilinear
    upsampling kernel to upsample the prediction back by a factor of 32. This
    means that if the input image size is not a multiple of 32, a prediction
    of a different size will be delivered. To adapt the network for an
    any-size input, use adapt_network_for_any_size_input(FCN_32s, 32).
    Note: the upsampling kernel is fixed in this model definition, because
    learning it didn't give significant improvements according to the
    aforementioned paper.

    Parameters
    ----------
    image_batch_tensor : [batch_size, height, width, depth] Tensor
        Tensor specifying input image batch
    number_of_classes : int
        An argument specifying the number of classes to be predicted.
        For example, for PASCAL VOC it is 21.
    is_training : boolean
        An argument specifying if the network is being evaluated or trained.
        It affects the work of the underlying dropout layer of VGG-16.

    Returns
    -------
    upsampled_logits : [batch_size, height, width, number_of_classes] Tensor
        Tensor with logits representing predictions for each class.
        Be careful, the output can be of different size compared to the input;
        use adapt_network_for_any_size_input to adapt the network for any
        input size. Otherwise, the input image sizes should be multiples of 32.
    vgg_16_variables_mapping : dict {string: variable}
        Dict which maps the FCN-32s model's variables to VGG-16 checkpoint
        variable names. We need this to initialize the weights of the FCN-32s
        model with VGG-16 weights from a checkpoint file. Look at the ipython
        notebook for examples.
    """
    with tf.variable_scope("fcn_32s", reuse=reuse) as fcn_32s_scope:
        upsample_factor = 32
        # Convert image to float32 before subtracting the
        # mean pixel value
        image_batch_float = tf.to_float(image_batch_tensor)
        # Subtract the mean pixel value from each pixel
        mean_centered_image_batch = image_batch_float - [
            _R_MEAN, _G_MEAN, _B_MEAN
        ]
        upsample_filter_np = bilinear_upsample_weights(upsample_factor,
                                                       number_of_classes)
        upsample_filter_tensor = tf.constant(upsample_filter_np)

        # TODO: make pull request to get this custom vgg feature accepted
        # to avoid using custom slim repo.
        with slim.arg_scope(vgg.vgg_arg_scope()):
            logits, end_points = vgg.vgg_16(mean_centered_image_batch,
                                            num_classes=number_of_classes,
                                            is_training=is_training,
                                            spatial_squeeze=False,
                                            fc_conv_padding='SAME')

        downsampled_logits_shape = tf.shape(logits)
        # Calculate the output size of the upsampled tensor
        upsampled_logits_shape = tf.stack([
            downsampled_logits_shape[0],
            downsampled_logits_shape[1] * upsample_factor,
            downsampled_logits_shape[2] * upsample_factor,
            downsampled_logits_shape[3]
        ])
        # Perform the upsampling
        upsampled_logits = tf.nn.conv2d_transpose(
            logits,
            upsample_filter_tensor,
            output_shape=upsampled_logits_shape,
            strides=[1, upsample_factor, upsample_factor, 1])

        # Map the original vgg-16 variable names to the variables in our
        # model. This is done to make it possible to use
        # assign_from_checkpoint_fn() while providing this mapping.
        # TODO: make it cleaner
        vgg_16_variables_mapping = {}
        vgg_16_variables = slim.get_variables(fcn_32s_scope)
        for variable in vgg_16_variables:
            # Here we remove the part of the variable name that comes from
            # the current variable scope. We use fcn_32s_scope.name rather
            # than .original_name_scope because name_scope only affects
            # operations; the variable scope is represented by .name.
            # E.g. 'fcn_32s/vgg_16/conv1/conv1_1/weights:0'
            # becomes 'vgg_16/conv1/conv1_1/weights'.
            original_vgg_16_checkpoint_string = variable.name[
                len(fcn_32s_scope.name) + 1:-2]
            vgg_16_variables_mapping[original_vgg_16_checkpoint_string] = variable

    return upsampled_logits, vgg_16_variables_mapping
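# A minimal restore sketch (an addition, not in the original code): the
# docstring above names slim.assign_from_checkpoint_fn(), and the mapping's
# keys are checkpoint variable names, which is exactly the dict form that
# function accepts. The checkpoint path and placeholder shape are assumptions.
image_batch = tf.placeholder(tf.uint8, [1, 384, 384, 3])
upsampled_logits, vgg_16_variables_mapping = FCN_32s(
    image_batch_tensor=image_batch, number_of_classes=21, is_training=True)
init_fn = slim.assign_from_checkpoint_fn(
    '/path/to/vgg_16.ckpt',              # placeholder path
    vgg_16_variables_mapping)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    init_fn(sess)  # overwrite the VGG-16 weights with pre-trained values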
def fcn_net(self, x, end_points):
    def _shape(tensor):
        shape = tf.shape(tensor)
        return shape[0], shape[1], shape[2], shape[3]

    n_classes = self.n_classes
    upsample_filter_2 = tf.constant(
        bilinear_upsample_weights(factor=2, n_classes=n_classes))
    # Zero-initialized 1x1 score layers for all skip connections
    with slim.arg_scope([slim.conv2d],
                        activation_fn=None,
                        normalizer_fn=None,
                        weights_initializer=tf.zeros_initializer()):
        # first upsampling x2
        n, h, w, c = _shape(x)
        output_shape = tf.stack([n, h * 2, w * 2, c])
        # 2x conv7
        net = tf.nn.conv2d_transpose(x, upsample_filter_2,
                                     output_shape=output_shape,
                                     strides=[1, 2, 2, 1])
        # pool4
        pool4_features = end_points['vgg_16/pool4']
        pool4_logits = slim.conv2d(pool4_features, n_classes, [1, 1],
                                   scope='pool4_fc')
        # pool4 + 2x conv7
        net = pool4_logits + net

        # second upsampling x2
        n, h, w, c = _shape(net)
        output_shape = tf.stack([n, h * 2, w * 2, c])
        # 2x (pool4 + 2x conv7)
        net = tf.nn.conv2d_transpose(net, upsample_filter_2,
                                     output_shape=output_shape,
                                     strides=[1, 2, 2, 1])
        # pool3
        pool3_features = end_points['vgg_16/pool3']
        pool3_logits = slim.conv2d(pool3_features, n_classes, [1, 1],
                                   scope='pool3_fc')
        # pool3 + 2x pool4 + 4x conv7
        net = pool3_logits + net

        # third upsampling x2
        n, h, w, c = _shape(net)
        output_shape = tf.stack([n, h * 2, w * 2, c])
        # 2x (pool3 + 2x pool4 + 4x conv7)
        net = tf.nn.conv2d_transpose(net, upsample_filter_2,
                                     output_shape=output_shape,
                                     strides=[1, 2, 2, 1])
        # pool2
        pool2_features = end_points['vgg_16/pool2']
        pool2_logits = slim.conv2d(pool2_features, n_classes, [1, 1],
                                   scope='pool2_fc')
        # pool2 + 2x pool3 + 4x pool4 + 8x conv7
        net = pool2_logits + net

        # fourth upsampling x2
        n, h, w, c = _shape(net)
        output_shape = tf.stack([n, h * 2, w * 2, c])
        # 2x (pool2 + 2x pool3 + 4x pool4 + 8x conv7)
        net = tf.nn.conv2d_transpose(net, upsample_filter_2,
                                     output_shape=output_shape,
                                     strides=[1, 2, 2, 1])
        # pool1
        pool1_features = end_points['vgg_16/pool1']
        pool1_logits = slim.conv2d(pool1_features, n_classes, [1, 1],
                                   scope='pool1_fc')
        # pool1 + 2x pool2 + 4x pool3 + 8x pool4 + 16x conv7
        net = pool1_logits + net

        # final upsampling x2 -- back to the input resolution
        n, h, w, c = _shape(net)
        output_shape = tf.stack([n, h * 2, w * 2, c])
        net = tf.nn.conv2d_transpose(net, upsample_filter_2,
                                     output_shape=output_shape,
                                     strides=[1, 2, 2, 1])
    return net
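# A hedged wiring sketch (an addition, not in the original): fcn_net expects
# the convolutionalized fc8 logits `x` plus the vgg_16 end_points dict, keyed
# without any extra scope prefix, so the trunk must be built at top level.
# `model` is a hypothetical instance of the class that owns fcn_net.
images = tf.placeholder(tf.float32, [None, 384, 384, 3])
with slim.arg_scope(vgg.vgg_arg_scope()):
    x, end_points = vgg.vgg_16(images,
                               num_classes=21,
                               is_training=True,
                               spatial_squeeze=False,
                               fc_conv_padding='SAME')
net = model.fcn_net(x, end_points)  # five x2 upsamplings: 2^5 = 32 overall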
def fcn_16s(image_batch_tensor, num_classes, is_training):
    # Get the filters for upsampling by factors of 2 and 16
    upsample_by_2_weights = bilinear_upsample_weights(
        factor=2, number_of_classes=num_classes)
    upsample_by_2_filter = tf.constant(upsample_by_2_weights)
    upsample_by_16_weights = bilinear_upsample_weights(
        factor=16, number_of_classes=num_classes)
    upsample_by_16_filter = tf.constant(upsample_by_16_weights)

    image_batch_float = tf.to_float(image_batch_tensor)

    # Create a variable scope for our model
    with tf.variable_scope('fcn_16s') as fcn_16s_scope:
        # arg_scope defines the default arguments for layer variables,
        # such as initializers
        with slim.arg_scope(vgg.vgg_arg_scope()):
            # tensorflow slim vgg_16 signature:
            # def vgg_16(inputs, num_classes=1000, is_training=True,
            #            dropout_keep_prob=0.5, spatial_squeeze=True,
            #            scope='vgg_16'):
            # Need to use 'SAME' padding for the convolutionalized fc layers
            vgg_logits, vgg_endpoints = vgg.vgg_16(image_batch_float,
                                                   num_classes=num_classes,
                                                   is_training=is_training,
                                                   spatial_squeeze=False,
                                                   fc_conv_padding='SAME')

        vgg_layer_shape = tf.shape(vgg_logits)
        # Calculate the size of the tensor upsampled by a factor of two;
        # vgg_layer_shape[0] is the batch size,
        # vgg_layer_shape[1] is the height and vgg_layer_shape[2] the width
        upsample_by_2_shape = tf.stack([
            vgg_layer_shape[0],
            vgg_layer_shape[1] * 2,
            vgg_layer_shape[2] * 2,
            vgg_layer_shape[3]
        ])
        # Perform upsampling using transposed convolution
        # conv2d_transpose signature:
        # tf.nn.conv2d_transpose(value, filter, output_shape, strides,
        #                        padding='SAME', data_format='NHWC', name=None)
        upsample_by_2_logits = tf.nn.conv2d_transpose(
            vgg_logits, upsample_by_2_filter, upsample_by_2_shape, [1, 2, 2, 1])

        # Now we add the skip connection from the pool4 layer of vgg
        pool4_features = vgg_endpoints['fcn_16s/vgg_16/pool4']
        # The pool4 output, according to the paper, needs to go through a
        # 1x1 convolutional score layer before being combined with the
        # FCN-32s logits; zero-initialized so training starts from the
        # FCN-32s predictions
        pool4_logits = slim.conv2d(
            pool4_features,
            num_classes, [1, 1],
            activation_fn=None,
            normalizer_fn=None,
            weights_initializer=tf.zeros_initializer(),
            scope='pool4_fc')

        vgg_pool4_combined_logits = upsample_by_2_logits + pool4_logits
        vgg_pool4_combined_shape = tf.shape(vgg_pool4_combined_logits)

        # Now upsample the combined logits by a factor of 16
        upsample_by_16_shape = tf.stack([
            vgg_pool4_combined_shape[0],
            vgg_pool4_combined_shape[1] * 16,
            vgg_pool4_combined_shape[2] * 16,
            vgg_pool4_combined_shape[3]
        ])
        upsample_by_16_logits = tf.nn.conv2d_transpose(
            vgg_pool4_combined_logits, upsample_by_16_filter,
            upsample_by_16_shape, [1, 16, 16, 1])

        # We need this mapping to load the pretrained vgg model
        vgg_16_variables_mapping = {}
        fcn_16s_variables = slim.get_variables(fcn_16s_scope)
        for variable in fcn_16s_variables:
            # We only need VGG-16 variables to restore from the checkpoint;
            # variables specific to FCN-16s (pool4_fc) should be freshly
            # initialized
            if 'pool4_fc' in variable.name:
                continue
            # Here we remove the part of the variable name
            # that comes from the current variable scope
            original_vgg_16_checkpoint_string = variable.name[
                len(fcn_16s_scope.original_name_scope):-2]
            vgg_16_variables_mapping[original_vgg_16_checkpoint_string] = variable

    return upsample_by_16_logits, vgg_16_variables_mapping
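# A minimal training-loss sketch (an addition, not part of the original
# snippets): FCN logits are per-pixel class scores, so a standard choice is
# sparse softmax cross-entropy against an integer label map. The placeholder
# shapes (384 is a multiple of 32) and the learning rate are assumptions.
images = tf.placeholder(tf.float32, [None, 384, 384, 3])
labels = tf.placeholder(tf.int32, [None, 384, 384])   # per-pixel class ids
logits, _ = fcn_16s(images, num_classes=21, is_training=True)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                   logits=logits))
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)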