def _add_convs(self, input_tensor, channels, tower_idx):
  """Adds the convolution layers.

  Adds a series of convolution layers with ReLU nonlinearity and pooling
  after each of them.

  :param input_tensor: a 4D tensor as the input to the first conv layer
  :param channels: a list of channel sizes for input_tensor and following
      conv layers. Number of channels in input tensor should be equal to
      channels[0]
  :param tower_idx: the index number for this tower. Each tower is named
      as tower_{tower_idx} and resides on gpu:{tower_idx}
  :return: a 4D tensor as the output of the last pooling layer
  """
  for i in range(1, len(channels)):
    with tf.variable_scope('conv{}'.format(i)) as scope:
      kernel = variables.weight_variable(
          shape=[5, 5, channels[i - 1], channels[i]],
          stddev=5e-2,
          verbose=self._hparams.verbose)
      conv = tf.nn.conv2d(
          input_tensor,
          kernel, [1, 1, 1, 1],
          padding=self._hparams.padding,
          data_format='NCHW')
      biases = variables.bias_variable([channels[i]],
                                       verbose=self._hparams.verbose)
      pre_activation = tf.nn.bias_add(
          conv, biases, data_format='NCHW', name='logits')
      relu = tf.nn.relu(pre_activation, name=scope.name)
      if self._hparams.verbose:
        tf.summary.histogram('activation', relu)
      input_tensor = tf.contrib.layers.max_pool2d(
          relu, kernel_size=2, stride=2, data_format='NCHW', padding='SAME')
  return input_tensor
def _add_convs(self, input_tensor, channels):
  """Adds the convolution layers.

  Adds a series of convolution layers with ReLU nonlinearity and pooling
  after each of them.

  Args:
    input_tensor: a 4D float tensor as the input to the first convolution.
    channels: A list of channel sizes for input_tensor and following
      convolution layers. Number of channels in input tensor should be
      equal to channels[0].

  Returns:
    A 4D tensor as the output of the last pooling layer.
  """
  for i in xrange(1, len(channels)):
    with tf.variable_scope('conv{}'.format(i)) as scope:
      kernel = variables.weight_variable(
          shape=[5, 5, channels[i - 1], channels[i]],
          stddev=5e-2,
          verbose=self._hparams.verbose)
      conv = tf.nn.conv2d(
          input_tensor,
          kernel, [1, 1, 1, 1],
          padding=self._hparams.padding,
          data_format='NCHW')
      biases = variables.bias_variable([channels[i]],
                                       verbose=self._hparams.verbose)
      pre_activation = tf.nn.bias_add(conv, biases, data_format='NCHW')
      relu = tf.nn.relu(pre_activation, name=scope.name)
      if self._hparams.verbose:
        tf.summary.histogram('activation', relu)
      input_tensor = tf.contrib.layers.max_pool2d(
          relu, kernel_size=2, stride=2, data_format='NCHW', padding='SAME')
  return input_tensor
def capsule(input_tensor,
            input_dim,
            output_dim,
            layer_name,
            input_atoms=8,
            output_atoms=8,
            **routing_args):
  """Builds a fully connected capsule layer.

  Given an input tensor of shape `[batch, input_dim, input_atoms]`, this op
  performs the following:

    1. For each input capsule, multiplies it with the weight variable to get
       votes of shape `[batch, input_dim, output_dim, output_atoms]`.
    2. Scales the votes for each output capsule by iterative routing.
    3. Squashes the output of each capsule to have norm less than one.

  Each capsule of this layer has one weight tensor for each capsule of the
  layer below. Therefore, this layer has the following number of trainable
  variables:
    w: [input_dim * num_in_atoms, output_dim * num_out_atoms]
    b: [output_dim * num_out_atoms]

  Args:
    input_tensor: tensor, activation output of the layer below.
    input_dim: scalar, number of capsules in the layer below.
    output_dim: scalar, number of capsules in this layer.
    layer_name: string, Name of this layer.
    input_atoms: scalar, number of units in each capsule of input layer.
    output_atoms: scalar, number of units in each capsule of output layer.
    **routing_args: dictionary {leaky, num_routing}, args for routing function.

  Returns:
    Tensor of activations for this layer of shape
      `[batch, output_dim, output_atoms]`.
  """
  with tf.variable_scope(layer_name):
    # weights variable will hold the state of the weights for the layer
    weights = variables.weight_variable(
        [input_dim, input_atoms, output_dim * output_atoms])
    biases = variables.bias_variable([output_dim, output_atoms])
    with tf.name_scope('Wx_plus_b'):
      # Depthwise matmul: [b, d, c] ** [d, c, o_c] = [b, d, o_c]
      # To do this: tile input, do element-wise multiplication and reduce
      # sum over the input_atoms dimension.
      input_tiled = tf.tile(
          tf.expand_dims(input_tensor, -1),
          [1, 1, 1, output_dim * output_atoms])
      votes = tf.reduce_sum(input_tiled * weights, axis=2)
      votes_reshaped = tf.reshape(votes,
                                  [-1, input_dim, output_dim, output_atoms])
    with tf.name_scope('routing'):
      input_shape = tf.shape(input_tensor)
      logit_shape = tf.stack([input_shape[0], input_dim, output_dim])
      activations = _update_routing(
          votes=votes_reshaped,
          biases=biases,
          logit_shape=logit_shape,
          num_dims=4,
          input_dim=input_dim,
          output_dim=output_dim,
          **routing_args)
    return activations
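# The "squash" step named above (step 3) is applied inside _update_routing
# and is not shown in this file. For reference, a minimal sketch of the
# standard squash nonlinearity from Sabour et al. (2017) is given below,
# assuming it is applied along the last (atom) dimension; the name
# _squash_sketch is illustrative and not the function used above.
def _squash_sketch(input_tensor):
  """Shrinks a vector to norm < 1 while preserving its direction."""
  norm = tf.norm(input_tensor, axis=-1, keep_dims=True)
  norm_squared = norm * norm
  # ||v||^2 / (1 + ||v||^2) scales the unit vector v / ||v||.
  return (input_tensor / norm) * (norm_squared / (1 + norm_squared))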
def capsule(tower_idx,
            in_tensor,
            in_dim,
            in_atoms,
            out_dim,
            out_atoms,
            layer_name,
            reassemble,
            **routing_args):
  """Builds a fully connected capsule layer.

  Given an input tensor of shape (batch, in_dim, in_atoms), this op performs
  the following:

    1. For each input capsule, multiplies it with the weight variables to get
       votes of shape (batch, in_dim, out_dim, out_atoms);
    2. Scales the votes for each output capsule by routing;
    3. Squashes the output of each capsule to have norm less than one.

  Each capsule of this layer has one weight tensor for each capsule of the
  layer below. Therefore, this layer has the following number of trainable
  variables:
    kernel: (in_dim, in_atoms, out_dim * out_atoms)
    biases: (out_dim, out_atoms)

  Args:
    tower_idx: the index number for this tower. Each tower is named as
      tower_{tower_idx} and resides on gpu:{tower_idx}.
    in_tensor: tensor, activation output of the layer below.
    in_dim: scalar, number of capsule types in the layer below.
    in_atoms: scalar, number of units of each input capsule.
    out_dim: scalar, number of capsule types in the output layer.
    out_atoms: scalar, number of units of each output capsule.
    layer_name: string, the name of this layer.
    reassemble: passed through to the routing procedure.
    **routing_args: dictionary {leaky, num_routing}, args for routing.

  Returns:
    Tensor of activations for this layer of shape (batch, out_dim, out_atoms).
  """
  with tf.variable_scope(layer_name):
    weights = variables.weight_variable(
        [in_dim, in_atoms, out_dim * out_atoms])
    biases = variables.bias_variable([out_dim, out_atoms])
    with tf.name_scope('Wx_plus_b'):
      # Depthwise matmul: [b, d, c] @ [d, c, o_c] = [b, d, o_c]
      # To do this: tile input, do element-wise multiplication and reduce
      # sum over the in_atoms dimension.
      in_tiled = tf.tile(tf.expand_dims(in_tensor, -1),
                         [1, 1, 1, out_dim * out_atoms])
      votes = tf.reduce_sum(in_tiled * weights, axis=2)
      votes_reshaped = tf.reshape(votes, [-1, in_dim, out_dim, out_atoms])
    with tf.name_scope('routing'):
      in_shape = tf.shape(in_tensor)
      logit_shape = tf.stack([in_shape[0], in_dim, out_dim])
      activations = _update_routing(tower_idx,
                                    votes=votes_reshaped,
                                    biases=biases,
                                    logit_shape=logit_shape,
                                    num_ranks=4,
                                    in_dim=in_dim,
                                    out_dim=out_dim,
                                    reassemble=reassemble,
                                    **routing_args)
    return activations
def inference(self, features):
  """Adds the inference graph ops.

  Builds the architecture of the neural net to derive logits from features.
  The inference graph includes a series of convolution and fully connected
  layers and outputs a [batch, 10] tensor as the logits.

  Args:
    features: Dictionary of batched feature tensors like images and labels.

  Returns:
    A model.Inferred named tuple of expected outputs of the model like
    'logits' and 'remakes' for the reconstructions (to be added).
  """
  image = features['images']
  image_dim = features['height']
  image_depth = features['depth']
  image_4d = tf.reshape(image, [-1, image_depth, image_dim, image_dim])
  conv = self._add_convs(image_4d, [image_depth, 512, 256])
  hidden1 = tf.contrib.layers.flatten(conv)

  with tf.variable_scope('fc1') as scope:
    dim = hidden1.get_shape()[1].value
    weights = variables.weight_variable(
        shape=[dim, 1024], stddev=0.1, verbose=self._hparams.verbose)
    biases = variables.bias_variable(
        shape=[1024], verbose=self._hparams.verbose)
    pre_activation = tf.matmul(hidden1, weights) + biases
    hidden2 = tf.nn.relu(pre_activation, name=scope.name)

  with tf.variable_scope('softmax_layer') as scope:
    weights = variables.weight_variable(
        shape=[1024, features['num_classes']],
        stddev=0.1,
        verbose=self._hparams.verbose)
    biases = variables.bias_variable(
        shape=[features['num_classes']], verbose=self._hparams.verbose)
    logits = tf.matmul(hidden2, weights) + biases

  return model.Inferred(logits, None)
def inference(self, features):
  """Adds the inference graph ops.

  Builds the architecture of the neural net to derive logits from features.
  The inference graph includes a convolution layer, a primary capsule layer
  and a 10-capsule final layer. Optionally, it also adds the reconstruction
  network on top of the 10-capsule final layer.

  Args:
    features: Dictionary of batched feature tensors like images and labels.

  Returns:
    A model.Inferred named tuple of expected outputs of the model like
    'logits' and 'recons' for the reconstructions.
  """
  image_dim = features['height']
  image_depth = features['depth']
  image = features['images']
  image_4d = tf.reshape(image, [-1, image_depth, image_dim, image_dim])

  # ReLU Convolution (conv1 layer start)
  with tf.variable_scope('conv1') as scope:
    kernel = variables.weight_variable(
        shape=[9, 9, image_depth, 256],
        stddev=5e-2,
        verbose=self._hparams.verbose)
    biases = variables.bias_variable([256], verbose=self._hparams.verbose)
    conv1 = tf.nn.conv2d(
        image_4d,
        kernel, [1, 1, 1, 1],
        padding=self._hparams.padding,
        data_format='NCHW')
    pre_activation = tf.nn.bias_add(conv1, biases, data_format='NCHW')
    relu1 = tf.nn.relu(pre_activation, name=scope.name)
    if self._hparams.verbose:
      tf.summary.histogram('activation', relu1)

  # conv1 layer end: returns [128, 256, 20, 20] in NCHW format.
  # Then expand dims to [128, 1, 256, 20, 20].
  hidden1 = tf.expand_dims(relu1, 1)

  # Capsules, including the primary capsule layer and the digit capsule
  # layer. The final output here is [batch_size, 10, 16].
  capsule_output = self._build_capsule(hidden1, features['num_classes'])

  # Calculate the length of v using the vector norm (2-norm) ||v||_2,
  # which equals sqrt(reduce_sum(square(v))). Returns [batch_size, 10].
  logits = tf.norm(capsule_output, axis=-1)

  # Reconstruction
  if self._hparams.remake:
    remake = self._remake(features, capsule_output)
  else:
    remake = None

  return model.Inferred(logits, remake)
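# For reference, the logits computed above are just the explicit 2-norm over
# the atom dimension, as the comment states. The snippet below is an
# illustrative, standalone check of that equivalence (not part of the model);
# it assumes `import tensorflow as tf` as in the surrounding code.
capsule_output_example = tf.random_normal([2, 10, 16])  # [batch, classes, atoms]
norm_logits = tf.norm(capsule_output_example, axis=-1)
explicit_logits = tf.sqrt(
    tf.reduce_sum(tf.square(capsule_output_example), axis=-1))
with tf.Session() as sess:
  a, b = sess.run([norm_logits, explicit_logits])
  print(abs(a - b).max())  # ~0 up to float precision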
def inference(self, features):
  """Adds the inference graph ops.

  Builds the architecture of the neural net to derive logits from features.
  The inference graph includes a convolution layer, a primary capsule layer
  and a 10-capsule final layer. Optionally, it also adds the reconstruction
  network on top of the 10-capsule final layer.

  Args:
    features: Dictionary of batched feature tensors like images and labels.

  Returns:
    A model.Inferred named tuple of expected outputs of the model like
    'logits' and 'recons' for the reconstructions.
  """
  image_height = features['height']
  image_width = features['width']
  image_depth = features['depth']
  image = features['images']
  image_4d = tf.reshape(image, [-1, image_depth, image_height, image_width])

  # ReLU Convolution
  with tf.variable_scope('conv1') as scope:
    kernel = variables.weight_variable(
        shape=[9, 9, image_depth, 256],
        stddev=5e-2,
        verbose=self._hparams.verbose)
    biases = variables.bias_variable([256], verbose=self._hparams.verbose)
    conv1 = tf.nn.conv2d(
        image_4d,
        kernel, [1, 1, 1, 1],
        padding=self._hparams.padding,
        data_format='NCHW')
    pre_activation = tf.nn.bias_add(conv1, biases, data_format='NCHW')
    relu1 = tf.nn.relu(pre_activation, name=scope.name)
    if self._hparams.verbose:
      tf.summary.histogram('activation', relu1)
  hidden1 = tf.expand_dims(relu1, 1)

  # Capsules
  capsule_output = self._build_capsule(hidden1, features['num_classes'])
  logits = tf.norm(capsule_output, axis=-1)

  # Reconstruction
  if self._hparams.remake:
    remake = self._remake(features, capsule_output)
  else:
    remake = None

  return model.Inferred(logits, remake)
def testVariableDeclaration(self):
  """Checks the initial values, shapes and names of the declared variables."""
  with tf.Graph().as_default():
    with self.test_session() as sess:
      weights = variables.weight_variable((1, 2), stddev=0.1)
      bias = variables.bias_variable((1))
      sess.run(tf.global_variables_initializer())
      w_value, b_value = sess.run([weights, bias])
      self.assertNear(w_value[0][0], 0.0, 0.2)
      self.assertNear(w_value[0][1], 0.0, 0.2)
      self.assertEqual(b_value, 0.1)
      trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
      self.assertEqual(len(trainable_vars), 2)
      self.assertStartsWith(trainable_vars[0].name, 'weights')
      self.assertStartsWith(trainable_vars[1].name, 'biases')
def conv_slim_capsule(input_tensor,
                      input_dim,
                      output_dim,
                      layer_name,
                      input_atoms=8,
                      output_atoms=8,
                      stride=2,
                      kernel_size=5,
                      padding='SAME',
                      **routing_args):
  """Builds a slim convolutional capsule layer.

  This layer performs 2D convolution given a 5D input tensor of shape
  `[batch, input_dim, input_atoms, input_height, input_width]`. Then refines
  the votes with routing and applies the Squash nonlinearity for each capsule.

  Each capsule in this layer is a convolutional unit and shares its kernel
  over the position grid and different capsules of the layer below.
  Therefore, the number of trainable variables in this layer is:
    kernel: [kernel_size, kernel_size, input_atoms, output_dim * output_atoms]
    bias: [output_dim, output_atoms]

  Output of a conv2d layer is a single capsule with channel number of atoms.
  Therefore conv_slim_capsule is suitable to be added on top of a conv2d layer
  with num_routing=1, input_dim=1 and input_atoms=conv_channels.

  Args:
    input_tensor: tensor, of rank 5. Last two dimensions representing height
      and width position grid.
    input_dim: scalar, number of capsules in the layer below.
    output_dim: scalar, number of capsules in this layer.
    layer_name: string, Name of this layer.
    input_atoms: scalar, number of units in each capsule of input layer.
    output_atoms: scalar, number of units in each capsule of output layer.
    stride: scalar, stride of the convolutional kernel.
    kernel_size: scalar, convolutional kernels are [kernel_size, kernel_size].
    padding: 'SAME' or 'VALID', padding mechanism for convolutional kernels.
    **routing_args: dictionary {leaky, num_routing}, args to be passed to the
      update_routing function.

  Returns:
    Tensor of activations for this layer of shape
      `[batch, output_dim, output_atoms, out_height, out_width]`. If padding
      is 'SAME', out_height = in_height and out_width = in_width. Otherwise,
      height and width is adjusted with same rules as 'VALID' in tf.nn.conv2d.
  """
  with tf.variable_scope(layer_name):
    # Convolution. Returns votes of shape [batch_size, 1, 32, 8, 6, 6].
    kernel = variables.weight_variable(shape=[
        kernel_size, kernel_size, input_atoms, output_dim * output_atoms
    ])
    biases = variables.bias_variable([output_dim, output_atoms, 1, 1])
    votes, votes_shape, input_shape = _depthwise_conv3d(
        input_tensor, kernel, input_dim, output_dim, input_atoms,
        output_atoms, stride, padding)
    # Convolution end.

    with tf.name_scope('routing'):
      logit_shape = tf.stack([
          input_shape[0], input_dim, output_dim, votes_shape[2],
          votes_shape[3]
      ])
      biases_replicated = tf.tile(biases,
                                  [1, 1, votes_shape[2], votes_shape[3]])
      # The routing algorithm is applied inside the PrimaryCaps layer as
      # well. Interestingly, the paper does not mention routing here: the
      # statements "One can see PrimaryCapsules as a Convolution layer with
      # Eq. 1 as its block non-linearity" and "no routing is used between
      # Conv1 and PrimaryCapsules" read as an ordinary convolution layer
      # followed by a squash operation. In short, this code is inconsistent
      # with the paper on this point.
      activations = _update_routing(
          votes=votes,
          biases=biases_replicated,
          logit_shape=logit_shape,
          num_dims=6,
          input_dim=input_dim,
          output_dim=output_dim,
          **routing_args)
  return activations
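# A usage sketch for conv_slim_capsule as the primary capsule layer,
# following the docstring's guidance (num_routing=1, input_dim=1,
# input_atoms equal to the conv1 channel count). The shapes and
# hyperparameters below mirror the MNIST setup annotated in the comments
# above (20x20 conv1 output, 32 primary capsule types, 8 atoms) but are
# illustrative and not taken verbatim from the calling code.
hidden1 = tf.random_normal([128, 1, 256, 20, 20])  # stand-in for conv1 output
primary_caps = conv_slim_capsule(
    hidden1,
    input_dim=1,            # a conv2d output is treated as one capsule type
    output_dim=32,          # 32 primary capsule types
    layer_name='conv_capsule1',
    input_atoms=256,        # conv1 channels become the input atoms
    output_atoms=8,
    stride=2,
    kernel_size=9,
    padding='VALID',        # 20x20 -> 6x6 with a 9x9 kernel and stride 2
    leaky=False,
    num_routing=1)          # no iterative routing into primary capsules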
def build_replica(self, tower_idx):
  """Adds the ops of a replica graph.

  Builds the architecture of the neural net to derive logits from
  batched_dataset. The inference graph defined here should involve
  trainable variables, otherwise the optimizer will raise a ValueError.

  Args:
    tower_idx: the index number for this tower. Each tower is named as
      tower_{tower_idx} and resides on gpu:{tower_idx}.

  Returns:
    Inferred namedtuple containing (logits, None).
  """
  # Image specs
  image_size = self._specs['image_size']
  image_depth = self._specs['depth']
  num_classes = self._specs['num_classes']

  # Define input_tensor for batched_images
  batched_images = tf.placeholder(
      tf.float32,
      shape=[None, image_depth, image_size, image_size],
      name='batched_images')
  """visual"""
  tf.add_to_collection('tower_%d_batched_images' % tower_idx, batched_images)

  # Add convolutional layers
  conv_out = self._add_convs(batched_images, [image_depth, 512, 256],
                             tower_idx)
  # Flatten neurons, shape (?, rest)
  hidden1 = tf.contrib.layers.flatten(conv_out)

  # Add fully connected layer 1, activation = relu
  with tf.variable_scope('fc1') as scope:
    dim = hidden1.get_shape()[1].value
    weights = variables.weight_variable(
        shape=[dim, 1024], stddev=0.1, verbose=self._hparams.verbose)
    biases = variables.bias_variable(
        shape=[1024], verbose=self._hparams.verbose)
    pre_activation = tf.add(tf.matmul(hidden1, weights), biases,
                            name='logits')
    """visual"""
    tf.add_to_collection('tower_%d_visual' % tower_idx, pre_activation)
    hidden2 = tf.nn.relu(pre_activation, name=scope.name)

  # Add fully connected layer 2, activation = None
  with tf.variable_scope('softmax_layer') as scope:
    weights = variables.weight_variable(
        shape=[1024, num_classes], stddev=0.1,
        verbose=self._hparams.verbose)
    biases = variables.bias_variable(
        shape=[num_classes], verbose=self._hparams.verbose)
    logits = tf.add(tf.matmul(hidden2, weights), biases, name='logits')
    """visual"""
    tf.add_to_collection('tower_%d_visual' % tower_idx, logits)

  # Declare one-hot format placeholder for batched_labels,
  # named 'tower_i/batched_labels:0'.
  batched_labels = tf.placeholder(
      tf.int32, shape=[None, num_classes], name='batched_labels')
  """visual"""
  tf.add_to_collection('tower_%d_batched_labels' % tower_idx, batched_labels)

  return model.Inferred(logits, None)
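# A minimal sketch of feeding the placeholders declared above, assuming
# build_replica was called inside a 'tower_0' name scope (so the full tensor
# names are 'tower_0/batched_images:0' and 'tower_0/batched_labels:0', as the
# comment above indicates). `sess` is an active tf.Session, `logits` is the
# tensor returned by build_replica, and the zero arrays are illustrative
# stand-ins for real data.
import numpy as np

images = np.zeros((32, image_depth, image_size, image_size), np.float32)
labels = np.zeros((32, num_classes), np.int32)  # one-hot labels
feed_dict = {
    'tower_0/batched_images:0': images,
    'tower_0/batched_labels:0': labels,
}
logits_val = sess.run(logits, feed_dict=feed_dict)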
def conv_slim_capsule(tower_idx,
                      in_tensor,
                      in_dim,
                      in_atoms,
                      out_dim,
                      out_atoms,
                      layer_name,
                      kernel_size=5,
                      stride=2,
                      padding='SAME',
                      reassemble=False,
                      **routing_args):
  """Builds a slim convolutional capsule layer.

  This layer performs 2D convolution given a 5D input tensor of shape
  (batch, in_dim, in_atoms, in_h, in_w). Then refines the votes with routing
  and applies the Squash nonlinearity for each capsule.

  Each capsule in this layer is a convolutional unit and shares its kernel
  over its positional grid (e.g. 9x9) and different capsules below.
  Therefore, the number of trainable variables in this layer is:
    kernel: (kernel_size, kernel_size, in_atoms, out_dim * out_atoms)
    bias: (out_dim, out_atoms)

  Output of a conv2d layer is a single capsule with channel number of atoms.
  Therefore conv_slim_capsule is suitable to be added on top of a conv2d
  layer with num_routing=1, in_dim=1 and in_atoms=conv_channels.

  :param tower_idx: the index number for this tower. Each tower is named as
      tower_{tower_idx} and resides on gpu:{tower_idx}
  :param in_tensor: 5D tensor, last two dimensions representing height and
      width
  :param in_dim: number of capsule types of input
  :param in_atoms: number of units of each input capsule
  :param out_dim: number of capsule types of output
  :param out_atoms: number of units of each output capsule
  :param layer_name: name of this layer
  :param kernel_size: convolutional kernel size [kernel_size, kernel_size]
  :param stride: stride of the convolutional kernel
  :param padding: 'SAME' or 'VALID', padding mechanism for convolutional
      kernels
  :param routing_args: dictionary {leaky, num_routing}, args to be passed to
      the routing procedure
  :return: tensor of activations for this layer of shape
      [batch, out_dim, out_atoms, out_h, out_w]
  """
  with tf.variable_scope(layer_name):
    kernel = variables.weight_variable(
        shape=[kernel_size, kernel_size, in_atoms, out_dim * out_atoms])
    biases = variables.bias_variable(shape=[out_dim, out_atoms, 1, 1])
    votes, votes_shape, in_shape = _depthwise_conv3d(
        tower_idx, in_tensor, in_dim, in_atoms, out_dim, out_atoms, kernel,
        stride, padding)

    with tf.name_scope('routing'):
      logit_shape = tf.stack(
          [in_shape[0], in_dim, out_dim, votes_shape[2], votes_shape[3]])
      biases_replicated = tf.tile(biases,
                                  [1, 1, votes_shape[2], votes_shape[3]])
      activations = _update_routing(tower_idx,
                                    votes=votes,
                                    biases=biases_replicated,
                                    logit_shape=logit_shape,
                                    num_ranks=6,
                                    in_dim=in_dim,
                                    out_dim=out_dim,
                                    reassemble=reassemble,
                                    **routing_args)
  return activations
def conv_slim_capsule(input_tensor,
                      input_dim,
                      output_dim,
                      layer_name,
                      input_atoms=8,
                      output_atoms=8,
                      stride=2,
                      kernel_size=5,
                      padding='SAME',
                      **routing_args):
  """Builds a slim convolutional capsule layer.

  This layer performs 2D convolution given a 5D input tensor of shape
  `[batch, input_dim, input_atoms, input_height, input_width]`. Then refines
  the votes with routing and applies the Squash nonlinearity for each capsule.

  Each capsule in this layer is a convolutional unit and shares its kernel
  over the position grid and different capsules of the layer below.
  Therefore, the number of trainable variables in this layer is:
    kernel: [kernel_size, kernel_size, input_atoms, output_dim * output_atoms]
    bias: [output_dim, output_atoms]

  Output of a conv2d layer is a single capsule with channel number of atoms.
  Therefore conv_slim_capsule is suitable to be added on top of a conv2d layer
  with num_routing=1, input_dim=1 and input_atoms=conv_channels.

  Args:
    input_tensor: tensor, of rank 5. Last two dimensions representing height
      and width position grid. Here (128, 1, 256, 20, 20), i.e.
      (batch, input_dim, channels, img_height, img_width).
    input_dim: scalar, number of capsules in the layer below.
    output_dim: scalar, number of capsules in this layer.
    layer_name: string, Name of this layer.
    input_atoms: scalar, number of units in each capsule of input layer.
    output_atoms: scalar, number of units in each capsule of output layer.
    stride: scalar, stride of the convolutional kernel.
    kernel_size: scalar, convolutional kernels are [kernel_size, kernel_size].
    padding: 'SAME' or 'VALID', padding mechanism for convolutional kernels.
    **routing_args: dictionary {leaky, num_routing}, args to be passed to the
      update_routing function.

  Returns:
    Tensor of activations for this layer of shape
      `[batch, output_dim, output_atoms, out_height, out_width]`. If padding
      is 'SAME', out_height = in_height and out_width = in_width. Otherwise,
      height and width is adjusted with same rules as 'VALID' in tf.nn.conv2d.
  """
  with tf.variable_scope(layer_name):  # layer_name = 'conv_capsule1'
    kernel = variables.weight_variable(shape=[
        kernel_size, kernel_size, input_atoms, output_dim * output_atoms
    ])
    biases = variables.bias_variable([output_dim, output_atoms, 1, 1])
    votes, votes_shape, input_shape = _depthwise_conv3d(
        input_tensor, kernel, input_dim, output_dim, input_atoms,
        output_atoms, stride, padding)
    # votes: second convolution result u_i, shape (128, 1, 32, 8, 6, 6);
    #   32: output_dim, 8: output_atoms.
    # votes_shape: (128, 256, 6, 6)
    # input_shape: (128, 1, 256, 20, 20), i.e.
    #   (batch, input_dim, channels, img_height, img_width)

    with tf.name_scope('routing'):
      logit_shape = tf.stack([
          input_shape[0], input_dim, output_dim, votes_shape[2],
          votes_shape[3]
      ], name="lc_stack")  # values (128, 1, 32, 6, 6)
      biases_replicated = tf.tile(
          biases, [1, 1, votes_shape[2], votes_shape[3]],
          name="lc_tile")  # (32, 8, 6, 6)
      activations = _update_routing(
          votes=votes,
          biases=biases_replicated,
          logit_shape=logit_shape,
          num_dims=6,
          input_dim=input_dim,
          output_dim=output_dim,
          **routing_args)
  return activations
def build_replica(self, tower_idx):
  """Adds the ops of a replica graph.

  Builds the architecture of the neural net to derive logits from
  batched_dataset. The inference graph defined here should involve
  trainable variables, otherwise the optimizer will raise a ValueError.

  Args:
    tower_idx: the index number for this tower. Each tower is named as
      tower_{tower_idx} and resides on gpu:{tower_idx}.

  Returns:
    Inferred namedtuple containing (logits, recons).
  """
  # Image specs
  image_size = self._specs['image_size']
  image_depth = self._specs['depth']
  num_classes = self._specs['num_classes']

  # Define input_tensor for batched_images, shape (?, 3, h, w)
  batched_images = tf.placeholder(
      tf.float32,
      shape=[None, image_depth, image_size, image_size],
      name='batched_images')
  """visual"""
  tf.add_to_collection('tower_%d_batched_images' % tower_idx, batched_images)

  # Declare the threshold placeholder for ensemble evaluation
  threshold = tf.placeholder(tf.float32, name='threshold')
  tf.add_to_collection('tower_%d_batched_threshold' % tower_idx, threshold)

  # ReLU Convolution
  with tf.variable_scope('conv1') as scope:
    kernel = variables.weight_variable(
        shape=[9, 9, image_depth, 256],
        stddev=5e-2,
        verbose=self._hparams.verbose)
    biases = variables.bias_variable([256], verbose=self._hparams.verbose)
    conv1 = tf.nn.conv2d(
        batched_images,
        kernel,
        strides=[1, 1, 1, 1],
        padding=self._hparams.padding,
        data_format='NCHW')
    pre_activation = tf.nn.bias_add(
        conv1, biases, data_format='NCHW', name='logits')
    """visual"""
    tf.add_to_collection('tower_%d_visual' % tower_idx, pre_activation)
    relu1 = tf.nn.relu(pre_activation, name=scope.name)
    if self._hparams.verbose:
      tf.summary.histogram(scope.name + '/activation', relu1)

  # (?, 1, 3, h, w); h and w here differ from the previous ones.
  hidden1 = tf.expand_dims(relu1, 1)

  # Capsules
  capsule_output = self._build_capsule(hidden1, num_classes, tower_idx)
  logits = tf.norm(capsule_output, axis=-1, name='logits')
  """visual"""
  tf.add_to_collection('tower_%d_visual' % tower_idx, logits)

  # Declare one-hot format placeholder for batched_labels
  batched_labels = tf.placeholder(
      tf.int32, shape=[None, num_classes], name='batched_labels')
  tf.add_to_collection('tower_%d_batched_labels' % tower_idx, batched_labels)

  # Reconstruction
  if self._hparams.remake:
    remake = self._remake(capsule_output, batched_images, batched_labels)
    tf.add_to_collection('tower_%d_recons' % tower_idx, remake)
  else:
    remake = None

  return model.Inferred(logits, remake)
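# The tower-indexed collections populated above can be read back later, for
# example from a separate evaluation script, without holding Python
# references to the tensors. A short sketch, assuming the graph for tower 0
# has already been built or imported into the default graph; note that the
# 'tower_0_recons' collection is only populated when remake is enabled.
batched_images = tf.get_collection('tower_0_batched_images')[0]
batched_labels = tf.get_collection('tower_0_batched_labels')[0]
recons = tf.get_collection('tower_0_recons')
visual_tensors = tf.get_collection('tower_0_visual')  # pre-activations, logits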