def __init__(self,
             num_classes,
             train_dir='/tmp/model/train',
             summary_log_freq=100):
    """Builds a semantic segmentation model on a 3D sparse voxel UNet.

    Registers a single loss named 'semantic_loss' (weight 1.0, computed by
    `classification_losses.classification_loss`) with the parent class and
    creates one `SparseConvUNet` whose only output task is the semantic
    voxel prediction with `num_classes` channels.

    Per the original documentation, the model predicts a dictionary holding
    one tensor per task, each of size
    [batch_size, num_voxels, num_task_channels].

    Args:
      num_classes: An int, the number of semantic classes to predict logits
        for.
      train_dir: A directory path to write tensorboard summary for losses.
      summary_log_freq: An int, the frequency (as batches) to log summary.
    """
    loss_functions = {
        'semantic_loss': classification_losses.classification_loss,
    }
    loss_weights = {'semantic_loss': 1.0}
    super().__init__(
        loss_names_to_functions=loss_functions,
        loss_names_to_weights=loss_weights,
        train_dir=train_dir,
        summary_log_freq=summary_log_freq)

    # The network has a single head: semantic logits for every voxel.
    semantic_key = standard_fields.DetectionResultFields.object_semantic_voxels
    output_channels = {semantic_key: num_classes}

    self.num_classes = num_classes
    self.sparse_conv_unet = sparse_voxel_unet.SparseConvUNet(
        task_names_to_num_output_channels=output_channels)
def test_sparse_voxel_unet(self):
    """Checks the output shape of a SparseConvUNet with one 64-channel task.

    Feeds a batch of one example with four voxels (three feature channels
    each) and verifies that the 'feature' output has shape [1, 4, 64].
    """
    network = sparse_voxel_unet.SparseConvUNet(
        task_names_to_num_output_channels={'feature': 64})

    # One example, four voxels, three feature channels per voxel.
    features = tf.constant(
        [[[1.0, 0.0, 0.0],
          [0.0, 1.0, 0.0],
          [0.0, 0.0, 1.0],
          [1.0, 2.0, 3.0]]],
        dtype=tf.float32)
    # Integer grid coordinates for each of the four voxels.
    xyz_indices = tf.constant(
        [[[0, 0, 0],
          [0, 1, 0],
          [1, 0, 0],
          [0, 0, 1]]],
        dtype=tf.int32)
    # All four voxels of the single example are valid.
    valid_counts = tf.constant([4], dtype=tf.int32)

    network_inputs = [features, xyz_indices, valid_counts]
    predictions = network(network_inputs, training=True)

    self.assertAllEqual(predictions['feature'].shape, [1, 4, 64])
def __init__(self,
             num_stacked_networks=1,
             task_names_to_num_output_channels=None,
             task_names_to_use_relu_last_conv=None,
             task_names_to_use_batch_norm_in_last_layer=None,
             conv_filter_size=3,
             encoder_dimensions=((32, 64), (64, 128), (128, 256)),
             bottleneck_dimensions=(256, 256),
             decoder_dimensions=((256, 256), (128, 128), (64, 64)),
             dropout_prob=0.0,
             use_batch_norm=True,
             network_pooling_segment_func=tf.math.unsorted_segment_max,
             normalize_sparse_conv=True):
    """Builds a stack of 3D UNet sparse voxel networks.

    Creates `num_stacked_networks` `SparseConvUNet` instances and stores them
    in `self.networks`. Every network except the last gets a single
    'intermediate_output' task whose channel count is the last decoder
    dimension, with relu enabled and batch norm following `use_batch_norm`.
    The last network gets the caller-supplied task configuration.

    Per the original documentation, the network predicts a dictionary
    holding one tensor per task, each of size
    [batch_size, num_voxels, num_task_channels].

    Args:
      num_stacked_networks: Number of stacked networks that build the
        hour-glass structure.
      task_names_to_num_output_channels: A dictionary mapping task names to
        the number of prediction channels for each task. Required.
      task_names_to_use_relu_last_conv: A dictionary mapping task names to
        whether relu should be applied at the last convolution. If None,
        relu is disabled for every task.
      task_names_to_use_batch_norm_in_last_layer: A dictionary mapping task
        names to whether batch norm is applied to the last convolution of
        the task. If None, batch norm is disabled for every task.
      conv_filter_size: The 3d convolution filter size. Currently the 3d
        convolution op is optimized for a filter size of 3.
      encoder_dimensions: A tuple of tuples, where each nested tuple lists
        the output feature dimensionality of each 3x3x3 convolution. After
        every nested tuple a 2x2x2 3D max pooling is done.
      bottleneck_dimensions: A tuple of ints with the output feature
        dimensionality of each 3x3x3 convolution in the middle of the
        network, after downsampling and before upsampling.
      decoder_dimensions: A tuple of tuples, where each nested tuple lists
        the output feature dimensionality of each 3x3x3 convolution. Before
        every nested tuple a 2x2x2 upsampling is done, followed by a
        concatenation with encoder features in a UNet fashion.
      dropout_prob: A float indicating the probability of dropout.
      use_batch_norm: Whether to use batch normalization or not.
      network_pooling_segment_func: Function used to pool voxel features in
        the network.
      normalize_sparse_conv: If True, applies normalization to 3d sparse
        convs.

    Raises:
      ValueError: If task_names_to_num_output_channels is None.
      ValueError: If the encoder and decoder have a different number of
        downsampling/upsampling levels.
    """
    super().__init__()

    if task_names_to_num_output_channels is None:
        raise ValueError('task_names_to_num_output_channels cannot be None')
    if len(encoder_dimensions) != len(decoder_dimensions):
        raise ValueError(
            'The number of encoder and decoder blocks should be equal')

    # Missing per-task flags default to False for every task, inserted in
    # sorted task-name order.
    if task_names_to_use_relu_last_conv is None:
        task_names_to_use_relu_last_conv = {
            name: False for name in sorted(task_names_to_num_output_channels)
        }
    if task_names_to_use_batch_norm_in_last_layer is None:
        task_names_to_use_batch_norm_in_last_layer = {
            name: False for name in sorted(task_names_to_num_output_channels)
        }

    self.num_stacked_networks = num_stacked_networks
    # Expected inputs: float32 rank-3, int32 rank-3 with last dimension 3,
    # and int32 rank-1.
    self.input_spec = [
        tf.keras.layers.InputSpec(shape=(None, None, None), dtype=tf.float32),
        tf.keras.layers.InputSpec(shape=(None, None, 3), dtype=tf.int32),
        tf.keras.layers.InputSpec(shape=(None, ), dtype=tf.int32)
    ]
    self.networks = []

    # Intermediate networks emit as many channels as the final decoder conv.
    intermediate_channels = decoder_dimensions[-1][-1]

    # Settings that are identical for every network in the stack.
    shared_kwargs = dict(
        conv_filter_size=conv_filter_size,
        encoder_dimensions=encoder_dimensions,
        bottleneck_dimensions=bottleneck_dimensions,
        decoder_dimensions=decoder_dimensions,
        dropout_prob=dropout_prob,
        use_batch_norm=use_batch_norm,
        network_pooling_segment_func=network_pooling_segment_func,
        normalize_sparse_conv=normalize_sparse_conv)

    final_index = num_stacked_networks - 1
    for stack_index in range(num_stacked_networks):
        if stack_index != final_index:
            # Not the last network: a single relu-activated intermediate
            # head.
            head_channels = {'intermediate_output': intermediate_channels}
            head_relu = {'intermediate_output': True}
            head_batch_norm = {'intermediate_output': use_batch_norm}
        else:
            # Last network: the heads requested by the caller.
            head_channels = task_names_to_num_output_channels
            head_relu = task_names_to_use_relu_last_conv
            head_batch_norm = task_names_to_use_batch_norm_in_last_layer

        unet = sparse_voxel_unet.SparseConvUNet(
            task_names_to_num_output_channels=head_channels,
            task_names_to_use_relu_last_conv=head_relu,
            task_names_to_use_batch_norm_in_last_layer=head_batch_norm,
            **shared_kwargs)
        self.networks.append(unet)
def __init__(self,
             num_classes,
             loss_names_to_functions=None,
             loss_names_to_weights=None,
             embedding_dims=64,
             embedding_similarity_strategy='distance',
             embedding_similarity_threshold=0.5,
             apply_nms=True,
             nms_score_threshold=0.1,
             nms_iou_threshold=0.3,
             num_furthest_voxel_samples=1000,
             sampler_score_vs_distance_coef=0.5,
             train_dir='/tmp/model/train',
             summary_log_freq=100):
    """Builds an object detection model on a 3D sparse voxel UNet.

    Forwards the loss and summary settings to the parent class, stores the
    remaining configuration values as attributes, and creates one
    `SparseConvUNet` with two output tasks: semantic voxel logits
    (`num_classes` channels) and instance embedding voxels
    (`embedding_dims` channels). Relu and batch norm are disabled on the
    last layer of both tasks.

    Per the original documentation, the model predicts a dictionary of
    tensors that contain predicted object properties.

    Args:
      num_classes: An int, the number of semantic classes to predict logits
        for.
      loss_names_to_functions: A dictionary mapping loss names to loss
        functions.
      loss_names_to_weights: A dictionary mapping loss names to loss
        weights.
      embedding_dims: An int, the dimensionality of the per-voxel
        embeddings; used as the channel count of the instance embedding
        task.
      embedding_similarity_strategy: Defines the method for computing
        similarity between embedding vectors. Possible values are
        'dotproduct' and 'distance'.
      embedding_similarity_threshold: Similarity threshold used to decide
        if two point embedding vectors belong to the same instance.
      apply_nms: If True, non-maximum suppression is applied to the final
        predictions.
      nms_score_threshold: Score threshold used in non-maximum suppression.
      nms_iou_threshold: Intersection over union threshold used in
        non-maximum suppression.
      num_furthest_voxel_samples: Number of voxels to be sampled using
        furthest voxel sampling in the postprocessor.
      sampler_score_vs_distance_coef: The coefficient that balances the
        weight between furthest voxel sampling and highest score sampling
        in the postprocessor.
      train_dir: A directory path to write tensorboard summary for losses.
      summary_log_freq: An int, the frequency (as batches) to log summary.
    """
    super().__init__(
        loss_names_to_functions=loss_names_to_functions,
        loss_names_to_weights=loss_names_to_weights,
        train_dir=train_dir,
        summary_log_freq=summary_log_freq)

    # Keep the configuration on the instance.
    self.num_classes = num_classes
    self.embedding_dims = embedding_dims
    self.embedding_similarity_strategy = embedding_similarity_strategy
    self.embedding_similarity_threshold = embedding_similarity_threshold
    self.apply_nms = apply_nms
    self.nms_score_threshold = nms_score_threshold
    self.nms_iou_threshold = nms_iou_threshold
    self.num_furthest_voxel_samples = num_furthest_voxel_samples
    self.sampler_score_vs_distance_coef = sampler_score_vs_distance_coef

    result_fields = standard_fields.DetectionResultFields
    # Two heads: semantic logits and instance embeddings.
    output_channels = {
        result_fields.object_semantic_voxels: num_classes,
        result_fields.instance_embedding_voxels: embedding_dims,
    }
    # Neither head applies relu at its last convolution.
    relu_last_conv = {
        result_fields.object_semantic_voxels: False,
        result_fields.instance_embedding_voxels: False,
    }
    # Batch norm in the last layer is disabled for every task.
    batch_norm_last_layer = dict.fromkeys(output_channels, False)

    self.sparse_conv_unet = sparse_voxel_unet.SparseConvUNet(
        task_names_to_num_output_channels=output_channels,
        task_names_to_use_relu_last_conv=relu_last_conv,
        task_names_to_use_batch_norm_in_last_layer=batch_norm_last_layer)