def routing(self, x, b_IJ, W, batch_size, routing_iter):
    """Route primary-capsule predictions to the output capsules.

    The prediction vectors are built once from ``x`` and ``W``.  Every
    iteration except the last only refines the routing logits ``b_IJ``
    (in place, outside autograd) from a detached copy of the predictions;
    the last iteration builds the returned capsules from the
    gradient-carrying predictions.

    The refinement step used to round-trip through NumPy, which fails for
    CUDA tensors; it now stays in torch and performs the same arithmetic.

    Args:
        x: Primary-capsule activations; must be viewable as
            ``(batch_size, 256, 1, 6, 6)``.
        b_IJ: Routing logits, updated in place through ``b_IJ.data``.
            Presumably shaped ``(batch_size, 1152, 10, 1, 1)`` -- TODO
            confirm against the caller.
        W: Transformation weights; after being repeated along dim 0 they
            must be viewable as ``(batch_size, 1152, 10, 16, 8)``.
        batch_size: Number of samples in ``x``.
        routing_iter: Number of routing iterations, at least 1.

    Returns:
        ``squash`` of the bias-shifted, coupling-weighted sum of the
        predictions over dim 1, computed in the final iteration.

    Raises:
        ValueError: If ``routing_iter`` is smaller than 1 (there would be
            no final iteration to produce an output).
    """
    if routing_iter < 1:
        raise ValueError("routing_iter must be >= 1, got %r" % (routing_iter,))

    # NOTE(review): all sizes below are hard-coded, and repeat-then-view
    # reinterprets memory instead of permuting axes -- confirm this is the
    # layout the weights were trained with before touching it.
    x_tiled = x.view(batch_size, 256, 1, 6, 6).repeat(1, 1, 10, 1, 1)
    x_view = x_tiled.view(batch_size, 1152, 10, 8, 1)
    weights = W.repeat(batch_size, 1, 1, 1, 1).view(batch_size, 1152, 10, 16, 8)

    dot_op = torch.matmul(weights, x_view)
    # Gradient-free copy used while the logits are still being refined.
    dot_op_stopped = dot_op.detach()
    dot_op_stopped_row = dot_op_stopped.reshape(batch_size, 1152, 10, 1, 16)

    last_iter = routing_iter - 1
    for r_iter in range(routing_iter):
        id_capsule = F.softmax(b_IJ, dim=2)
        if r_iter == last_iter:
            route_sum = torch.sum(id_capsule * dot_op, dim=1, keepdim=True) + self.bias
            V_J = squash(route_sum, self.epsilon)
        else:
            with torch.no_grad():
                route_sum = torch.sum(
                    id_capsule * dot_op_stopped, dim=1, keepdim=True) + self.bias
                v_tmp = squash(route_sum, self.epsilon)
                # matmul broadcasts v_tmp over the 1152 input capsules, so
                # the explicit tiling of the NumPy version is unnecessary.
                agreement = torch.matmul(dot_op_stopped_row, v_tmp)
                b_IJ.data += agreement
    return V_J
def forward(self, x):
    """Transform input capsules and combine them with dynamic routing.

    Shapes, as annotated by the original author:
        x:           [batch, in_num_caps, in_dim_caps]
        self.weight: [out_num_caps, in_num_caps, out_dim_caps, in_dim_caps]
        result:      [batch, out_num_caps, out_dim_caps]

    Only the last routing iteration uses the gradient-carrying
    predictions; earlier iterations work on a detached copy, so no
    gradient flows through the coupling-logit updates.
    """
    batch = x.size(0)

    # [batch, 1, in_num_caps, in_dim_caps, 1] against the weight gives
    # x_hat: [batch, out_num_caps, in_num_caps, out_dim_caps]
    expanded = x[:, None, :, :, None]
    x_hat = torch.matmul(self.weight, expanded).squeeze(dim=-1)
    x_hat_detached = x_hat.detach()

    # Coupling logits start at zero: [batch, out_num_caps, in_num_caps]
    b = Variable(torch.zeros(batch, self.out_num_caps, self.in_num_caps))
    if self.USE_CUDA:
        b = b.cuda()

    assert self.routings > 0, "The 'routings' should be > 0."

    final_step = self.routings - 1
    for step in range(self.routings):
        coupling = F.softmax(b, dim=1)[:, :, :, None]
        is_final = step == final_step
        votes = x_hat if is_final else x_hat_detached
        # outputs: [batch, out_num_caps, 1, out_dim_caps]
        outputs = utils.squash(torch.sum(coupling * votes, dim=-2, keepdim=True))
        if not is_final:
            # Agreement between each prediction and the current output.
            b = b + torch.sum(outputs * x_hat_detached, dim=-1)

    return outputs.squeeze(dim=-2)
def routing(self, x):
    """Compute routing logits by iterative agreement.

    Args:
        x: Candidate (prediction) capsules.  The broadcasting below needs
            the rank-4 layout ``[batch, in_caps, out_caps, out_dim]`` --
            TODO confirm against the caller.

    Returns:
        Logits ``b`` of shape ``[batch, in_caps, self.num_capsule, 1]``
        after ``self.routing_iter`` agreement updates.  The softmax over
        them is left to the caller.
    """
    input_shape = tf.shape(x)
    # Logits start at zero: [batch, in_caps, num_capsule, 1]
    b = tf.zeros((input_shape[0], input_shape[1], self.num_capsule, 1))

    for _ in range(self.routing_iter):
        # Each input capsule spreads a total weight of 1 over the outputs.
        c = tf.nn.softmax(b, axis=2)
        # keepdims=True keeps the summed axis, so s is
        # [batch, 1, out_caps, out_dim].
        s = tf.reduce_sum(tf.multiply(x, c), axis=1, keepdims=True)
        # Squash along the capsule vector (last axis).  The previous
        # axis=2 pointed at the out_caps axis because of the kept
        # dimension, normalising across capsules instead of within one.
        v = squash(s, axis=-1)
        # Agreement = dot product of every prediction with its output.
        b = b + tf.reduce_sum(tf.multiply(x, v), axis=-1, keepdims=True)
    return b
def update(self):
    """Advance the stored state by one step.

    Records the weighted input ``np.dot(self._weights, state)`` in
    ``self._I_syn``, moves the state by ``self.tau`` times the sum of
    bias, decay term and weighted input, and stores the squashed result
    back in ``self._state``.
    """
    current = self._state
    leak = -self.decay_rate * current
    self._I_syn = np.dot(self._weights, current)
    stepped = current + self.tau * (self._bias + leak + self._I_syn)
    self._state = squash(stepped)
def forward(self, x):
    """Compute digit capsules from primary capsules by dynamic routing.

    Args:
        x: Primary capsules, expected as
            ``[batch, primary_capsule_num, primary_vec_length, 1]``
            (per the original annotations -- TODO confirm).

    Returns:
        ``v_j`` of shape ``[batch, digit_channels, digit_vec_length, 1]``.

    Fixes relative to the previous version:
      * the softmax ran over dim 3 of the logits, which has size 1, so
        every coupling coefficient was exactly 1 and routing had no
        effect; it now normalises over the digit capsules (dim 2);
      * the logit update used the gradient-carrying ``u_hat`` although
        the non-final iterations are meant to be gradient-free;
      * the logits were forced onto CUDA; they now follow ``u_hat``.
    """
    batch_size = x.size(0)

    # [batch, primary_capsule_num, 1, primary_vec_length, 1]; matmul
    # broadcasts this against self.W
    # ([primary_capsule_num, digit_channels, digit_vec_length,
    # primary_vec_length]), so no per-sample or per-digit copies are made.
    x = x.unsqueeze(2)
    # u_hat: [batch, primary_capsule_num, digit_channels, digit_vec_length, 1]
    u_hat = torch.matmul(self.W, x)
    # Detached predictions keep the first iterations out of backprop.
    u_hat_stopped = u_hat.detach()

    # Logits are reset on every forward pass and share u_hat's device.
    b_ij = u_hat.new_zeros(
        batch_size, self.primary_capsule_num, self.digit_channels, 1, 1)

    num_iterations = 3
    for iteration in range(num_iterations):
        is_last = iteration == num_iterations - 1
        # c_ij: [batch, primary_capsule_num, digit_channels, 1, 1]
        c_ij = softmax(b_ij, dim=2)
        votes = u_hat if is_last else u_hat_stopped
        # s_j, v_j: [batch, digit_channels, digit_vec_length, 1]
        s_j = (c_ij * votes).sum(dim=1)
        v_j = squash(s_j, 2)
        if not is_last:
            # Agreement u_hat^T . v_j, broadcast over the primary capsules:
            # [batch, primary_capsule_num, digit_channels, 1, 1]
            update = torch.matmul(
                u_hat_stopped.transpose(-1, -2), v_j.unsqueeze(1))
            b_ij = b_ij + update
    return v_j
def _capsule(self, input, i_c, o_c, idx):
    """
    Build one capsule: a 9x9, stride-2, VALID convolution (plus an
    optional bias when cfg.USE_BIAS is set) followed by squash.
    :arg
        input: input tensor, shape: [None, w, h, c]
        i_c: input channels
        o_c: output channels
        idx: index of the capsule, used for the variable scope name
    :return
        capsule: squashed output with a new axis inserted at position 1,
                 e.g. [None, 1, 6, 6, 8] for mnist (per the original notes)
    """
    scope_name = 'cap_' + str(idx)
    with tf.variable_scope(scope_name):
        kernel = tf.get_variable('w', shape=[9, 9, i_c, o_c], dtype=tf.float32)
        conv_out = tf.nn.conv2d(input, kernel, [1, 2, 2, 1],
                                padding='VALID', name='cap_conv')
        if cfg.USE_BIAS:
            bias = tf.get_variable('b', shape=[o_c, ], dtype=tf.float32,
                                   initializer=self._b_initializer)
            conv_out = conv_out + bias

        # squash is the non-linearity; the extra axis prepares the
        # capsule for the concat done by the caller.
        return tf.expand_dims(squash(conv_out), axis=1)
def call(self, u, **kwargs):
    """Route input capsules ``u`` to this layer's output capsules.

    Index letters used in the einsum strings:
        W    -> [incaps(i), outcaps(o), outdim(m), indim(n)]
        u    -> [B(b), incaps(i), indim(n)]
        uhat -> [B(b), incaps(i), outcaps(o), outdim(m)]

    Returns the squashed output capsules ``v`` with shape
    ``[B, outcaps, outdim]`` from the final routing iteration.
    """
    # Prediction of every output capsule made by every input capsule.
    uhat = tf.einsum('iomn,bin->biom', self.W, u)

    # Routing logits, one per (input capsule, output capsule) pair.
    n_in_caps = u.get_shape().as_list()[1]
    n_batch = tf.shape(u)[0]
    logits = tf.zeros([n_batch, n_in_caps, self.n_caps], tf.float32)

    final_iter = self.routing_iters - 1
    for step in range(self.routing_iters):
        # Coupling coefficients: softmax across the output capsules.
        coupling = tf.nn.softmax(logits, axis=-1)
        # s(j) = sum_i c(i,j) * uhat(j|i); the bias is an extension that
        # the original author notes is not described in the paper.
        s = tf.einsum('biom,bio->bom', uhat, coupling) + self.bias
        v = U.squash(s)
        # The logits are not needed after the last iteration.
        if step != final_iter:
            logits = logits + tf.einsum('biom,bom->bio', uhat, v)
    return v
def no_routing(self, x):
    """
    Run every convolutional unit on ``x`` and squash the stacked result.

    Shapes from the original notes (batch 128, 8 units, 32 channels,
    6x6 grid): each unit yields [128, 32, 6, 6]; stacked [128, 8, 32, 6, 6];
    flattened [128, 8, 1152].

    :return: squashed tensor of shape [batch, num_unit, -1]
    """
    n_samples = x.size(0)

    # One output per convolutional unit, stacked on a new axis 1.
    stacked = torch.stack([conv_unit(x) for conv_unit in self.conv_units], dim=1)

    # Collapse channels and the spatial grid into one axis.
    flat = stacked.view(n_samples, self.num_unit, -1)

    # NOTE(review): dim=2 is the flattened (1152) axis, not the per-unit
    # axis -- kept as in the original; confirm this is intended.
    return utils.squash(flat, dim=2)
def __call__(self, inputs, kernel=None, strides=None):
    """
    Here the logic of capsule layers is applied.

    With both `kernel` and `strides` given, a PrimaryCaps layer is built:
    VALID convolution, ReLU, reshape to [batch, -1, primary_caps_vec_len]
    and squash.  With both omitted, `self.routing(inputs)` is returned
    (DigitCaps).

    :param inputs: 4-D input tensor following `NHWC` format
        [batch, height, width, out_channels]
    :param kernel: 4-D tensor having shape:
        [height, width, in_channels, out_channels]
    :param strides: An integer, replicated for both H and W; a sequence
        is handed to `tf.nn.conv2d` unchanged
    :return: squashed primary capsules, or the result of the routing
    :raises ValueError: if exactly one of `kernel` / `strides` is given
    """
    # Validate first, so a kernel without strides fails here with a clear
    # message instead of deep inside conv2d.
    if (kernel is None) != (strides is None):
        raise ValueError(
            'kernel_size and strides params should be either both None '
            '(for DigitCaps) or both not None (for PrimaryCaps)')

    if kernel is None:
        # DigitCaps layer: apply the routing mechanism.
        return self.routing(inputs)

    # PrimaryCaps layer (low-level capsules).
    if isinstance(strides, int):
        # Honour the documented integer form: same stride on H and W.
        strides = [1, strides, strides, 1]
    conv_out = tf.nn.conv2d(input=inputs, filter=kernel, strides=strides,
                            padding='VALID', name='conv_out')
    conv_out = tf.nn.relu(conv_out)
    capsules = tf.reshape(
        conv_out,
        shape=[tf.shape(inputs)[0], -1, self.primary_caps_vec_len],
        name='capsules')  # (B,1152,8) in the original author's setup
    return squash(capsules)
def forward(self, x):
    """Route input capsules to output capsules (3 routing iterations).

    Args:
        x: ``[batch, in_features, in_dim]`` (e.g. ``[batch, 1152, 8]``
            per the original notes).

    Returns:
        The last iteration's ``v`` after ``squeeze(1).transpose(2, 3)``;
        ``[batch, out_features, out_dim, 1]`` if the annotated shapes
        hold.

    The previous version tiled ``self.W`` and the coupling coefficients
    ``conf.batch_size`` times, so any batch of a different size (for
    example the last, partial batch of an epoch) failed with a shape
    mismatch.  The batch axis is now handled by broadcasting, which
    works for any batch size and avoids the copies.
    """
    # [batch, in_features, 1, 1, in_dim]; broadcasts over out_features.
    x = x.unsqueeze(2).unsqueeze(3)
    # Assumes self.W is [1, in_features, out_features, in_dim, out_dim]
    # (it used to be concatenated along dim 0) -- TODO confirm.
    # u_hat: [batch, in_features, out_features, 1, out_dim]
    u_hat = torch.matmul(x, self.W)

    # Routing logits shared by the whole batch: [1, in_features, out_features, 1]
    b = torch.zeros([1, self.in_features, self.out_features, 1]).double()
    # NOTE(review): if this object is an nn.Module and `cuda` was never
    # overwritten with a flag, `self.cuda` is the bound method and is
    # always truthy -- verify.
    if self.cuda:
        b = b.cuda()
    b = Variable(b)

    for _ in range(3):
        # c: [1, in_features, out_features, 1, 1], broadcast over batch
        c = F.softmax(b, dim=2).unsqueeze(dim=4)
        # s, v: [batch, 1, out_features, 1, out_dim]
        s = (u_hat * c).sum(dim=1, keepdim=True)
        v = utils.squash(s, dim=-1)
        # (batch, in, out, 1, out_dim) @ (batch, 1, out, out_dim, 1)
        #   -> (batch, in, out, 1, 1), broadcast over in_features
        agreement = torch.matmul(u_hat, v.transpose(3, 4))
        # Average the agreement over the batch: (1, in, out, 1)
        update_b = agreement.squeeze(dim=4).mean(dim=0, keepdim=True)
        b = b + update_b
    return v.squeeze(1).transpose(2, 3)
def get_primary_capsules(X):
    """Build the primary-capsule stage on top of the input images.

    Two 9x9 VALID convolutions with ReLU (256 filters at stride 1, then
    32 * 8 filters at stride 2) are followed by a reshape into
    [-1, 1152, 8] capsule vectors and a squash.

    Shapes noted by the original author for MNIST: X (?, 28, 28, 1),
    first conv (?, 20, 20, 256), second conv (?, 6, 6, 256).
    """
    n_maps, n_dims = 32, 8
    # The 256 feature maps of 6x6 scalars are reinterpreted as 32 maps of
    # 6x6 eight-dimensional vectors.  The next layer is fully connected,
    # so they are flattened into one list of 32 * 6 * 6 = 1152 capsules.
    n_caps = n_maps * 6 * 6

    features = tf.layers.conv2d(
        X, filters=256, kernel_size=9, strides=1,
        padding="valid", activation=tf.nn.relu)
    capsule_maps = tf.layers.conv2d(
        features, filters=n_maps * n_dims, kernel_size=9, strides=2,
        padding="valid", activation=tf.nn.relu)

    raw_capsules = tf.reshape(capsule_maps, [-1, n_caps, n_dims])
    # squash keeps the capsule vectors short.
    return squash(raw_capsules)
def _build_net(self): """ build the graph of the network arg: self return: none """ # reshape for conv ops with tf.name_scope('x_reshape'): x_image = tf.reshape(self._x, [-1, 28, 28, 1]) # initial conv1 op # 1). conv1 with kernel 9x9, stride 1, output channels 256 with tf.variable_scope('conv1'): # specially initialize it with xavier initializer with no good reason. w = tf.get_variable('w', shape=[9, 9, 1, 256], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer() ) # conv op conv1 = tf.nn.conv2d(x_image, w, [1, 1, 1, 1], padding='VALID', name='conv1') if cfg.USE_BIAS: b = tf.get_variable('b', shape=[256, ], dtype=tf.float32, initializer=self._b_initializer) conv1 = tf.nn.relu(conv1 + b) else: conv1 = tf.nn.relu(conv1) # update dimensions of feature map self._dim = (self._dim - 9) // 1 + 1 assert self._dim == 20, "after conv1, dimensions of feature map" \ "should be 20x20" # conv1 with shape [None, 20, 20, 256] # build up primary capsules with tf.variable_scope('PrimaryCaps'): # update dim of capsule grid self._dim = (self._dim - 9) // 2 + 1 # number of primary caps: 6x6x32 = 1152 self._num_caps.append(self._dim ** 2 * cfg.PRIMARY_CAPS_CHANNELS) assert self._dim == 6, "dims for primary caps grid should be 6x6." # build up PrimaryCaps with 32 channels and 8-D vector primary_caps = slim.conv2d(conv1, 32 * 8, 9, 2, padding='VALID', activation_fn=None) primary_caps = tf.reshape(primary_caps, [-1, 1, self._num_caps[1], 1, 8]) primary_caps = squash(primary_caps) # dynamic routing with tf.variable_scope("digit_caps"): self._digit_caps = self._dynamic_routing(primary_caps, 1) self._digit_caps_norm = tf.norm(self._digit_caps, ord=2, axis=2, name='digit_caps_norm')
def forward(self, x):
    """Produce squashed primary capsules from the input feature maps.

    Note from the original author: the convolution output could be passed
    through a ReLU before the next layer.  The CapsNet paper does not
    mention it, but the TensorFlow implementation by naturomics does, so
    it remains a possible option.

    Returns a tensor annotated by the original author as
    [batch, primary_capsule_num, primary_vec_length, 1].
    """
    n_samples = x.size(0)

    # One copy of the input per capsule-vector component.
    replicated = torch.stack([x] * self.primary_vec_length, dim=1)
    conv = self.distributed_conv_fcn(replicated)

    # Squash along dim 1, flatten everything after the vector axis, then
    # move the vector axis behind the capsule axis.
    flat = squash(conv, dim=1).view(n_samples, self.primary_vec_length, -1)
    return flat.transpose(1, 2).unsqueeze(3)
def call(self, inputs):
    """Run the parent convolution and regroup its output into capsules.

    The output is reshaped to ``[batch, dim1 * dim2 * n_caps, caps_dim]``,
    where dim1 and dim2 are the static sizes of axes 1 and 2 of the
    convolution output, and then squashed.
    """
    n_batch = tf.shape(inputs)[0]
    conv_out = super(PrimaryCapsConv2D, self).call(inputs)

    static_shape = conv_out.get_shape().as_list()
    n_capsules = static_shape[1] * static_shape[2] * self.n_caps

    capsules = tf.reshape(conv_out, [n_batch, n_capsules, self.caps_dim])
    return U.squash(capsules)
def forward(self, x):
    """Build squashed primary capsules from ``self.out_dim`` convolutions.

    Shapes noted by the original author: input
    [batch_size, in_features, height, width]; each convolution gives
    [batch_size, 32, 6, 6]; the result is [batch_size, 1152, 8].
    """
    # One feature map stack per capsule dimension.
    conv_outputs = [self.conv[idx](x) for idx in range(self.out_dim)]
    # [batch_size, out_dim, ...]
    stacked = torch.stack(conv_outputs, dim=1)
    # Flatten everything behind the capsule dimension, then put the
    # capsule dimension last.
    capsules = stacked.view(stacked.size(0), self.out_dim, -1).transpose(1, 2)
    return utils.squash(capsules, dim=2)
def body(i, prior, cap_out):
    """Perform one routing step.

    Returns ``[i - 1, updated prior, new capsule output]``.  Reads
    ``cap_predictions`` from the enclosing scope; the incoming
    ``cap_out`` is replaced, not read.
    """
    # Coupling coefficients, with a trailing axis for broadcasting.
    coupling = tf.expand_dims(tf.nn.softmax(prior, axis=1), axis=-1)
    # Weighted sum of the predictions over axis 2.
    weighted_sum = tf.reduce_sum(tf.multiply(cap_predictions, coupling), axis=[2])
    cap_out = squash(weighted_sum)
    # Agreement between the new output and every prediction.
    agreement = tf.reduce_sum(
        tf.multiply(tf.expand_dims(cap_out, axis=2), cap_predictions),
        axis=[-1])
    return [i - 1, prior + agreement, cap_out]
def __call__(self, input):
    '''
    Convolve the input, regroup it into capsule vectors and squash them.

    :param input: 4D tensor
    :return: squashed tensor of shape [-1, n_capsules, capsule_length]
    '''
    features = self._get_conv_output(input)
    # Flatten into one list of capsule vectors per sample.
    capsule_shape = [-1, self.n_capsules, self.capsule_length]
    capsules = tf.reshape(features, capsule_shape)
    # squash keeps the capsule vectors short.
    return squash(capsules)
def routing(self, x):
    """Turn input capsules into output capsules.

    The prediction tensor ``u_hat`` is built in one of three ways:
      * no ``high_cap_conv`` and no ``noTM``: batched matmul with
        ``self.weight``;
      * ``high_cap_conv`` and no ``noTM``: one convolution
        (``single_conv``) or four concatenated convolutions;
      * ``noTM``: the transposed input is used as it is.

    Unless ``self.fc`` is set, ``self.num_routing`` routing iterations
    follow (logits shared across the batch, agreement averaged over the
    batch) and ``v_j.squeeze(1)`` is returned.  With ``self.fc`` the
    flattened predictions go through ``self.fully_connected`` and two
    trailing singleton axes are appended.

    The routing logits used to be moved to CUDA unconditionally, which
    broke CPU runs and could select the wrong GPU.  They are now placed
    on the same device as ``u_hat``.
    """
    batch_size = x.size(0)
    x = x.transpose(1, 2)

    if (not self.high_cap_conv) & (not self.noTM):
        x = torch.stack([x] * self.num_unit, dim=2).unsqueeze(4)
        batch_weight = torch.cat([self.weight] * batch_size, dim=0)
        u_hat = torch.matmul(batch_weight, x)
    elif self.high_cap_conv & (not self.noTM):
        # Both convolutional variants share the same input layout.
        x = torch.stack([x] * self.num_unit, dim=1).unsqueeze(4)
        x = x.view(batch_size, -1, self.in_unit, 1)
        if self.single_conv:
            u_hat = self.weight_conv(x)
        else:
            u_hat = torch.cat(
                (self.weight_conv_1(x), self.weight_conv_2(x),
                 self.weight_conv_3(x), self.weight_conv_4(x)), 2)
        u_hat = u_hat.view(
            batch_size, self.num_unit, self.in_channel, u_hat.size(2), 1
        ).transpose(1, 2).contiguous()
    elif self.noTM:
        u_hat = x

    if self.fc:
        v = self.fully_connected(u_hat.view(batch_size, -1))
        return v.unsqueeze(2).unsqueeze(3)

    # Logits start at zero and live wherever the predictions live.
    b_ij = Variable(torch.zeros(1, self.in_channel, self.num_unit, 1))
    if u_hat.is_cuda:
        b_ij = b_ij.cuda(u_hat.get_device())

    if self.penalty_attention:
        penalty = torch.cat([self.penalty] * batch_size, dim=0)

    for iteration in range(self.num_routing):
        # Coupling coefficients, replicated for every sample.
        c_ij = F.softmax(b_ij, dim=2)
        c_ij = torch.cat([c_ij] * batch_size, dim=0).unsqueeze(4)

        s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
        v_j = utils.squash(s_j, dim=3)
        if self.penalty_attention:
            v_j = v_j * penalty

        # Agreement of every prediction with its output, batch-averaged.
        v_j1 = torch.cat([v_j] * self.in_channel, dim=1)
        u_vj1 = torch.matmul(
            u_hat.transpose(3, 4), v_j1).squeeze(4).mean(dim=0, keepdim=True)
        b_ij = b_ij + u_vj1

    return v_j.squeeze(1)
def forward(self, x):
    """Build squashed primary capsules from the per-capsule convolutions.

    Shapes noted by the original author: x [128 x 256 x 20 x 20], each
    capsule output [128 x 8 x 6 x 6 x 1], result [128 x 1152 x 8].
    """
    # Run every capsule module and join the results on the last axis.
    per_capsule = [capsule(x) for capsule in self.capsules]
    joined = torch.cat(per_capsule, dim=-1)

    # Flatten everything behind axis 1.
    flat = joined.view(joined.shape[0], joined.shape[1], -1)

    # Swap axis 1 with the last axis so each row is one capsule vector.
    last_axis = len(flat.shape) - 1
    return squash(flat.transpose(1, last_axis))
def routing(self, x, b_IJ, W, batch_size, routing_iter):
    """Route primary-capsule predictions to the output capsules.

    The prediction vectors are built once from ``x`` and ``W``.  Every
    iteration except the last only refines the routing logits ``b_IJ``
    (in place, outside autograd) from a detached copy of the predictions;
    the last iteration builds the returned capsules from the
    gradient-carrying predictions.

    The refinement step used to round-trip through NumPy, which fails for
    CUDA tensors; it now stays in torch and performs the same arithmetic.

    Args:
        x: Primary-capsule activations; must be viewable as
            ``(batch_size, 256, 1, 6, 6)``.
        b_IJ: Routing logits, updated in place through ``b_IJ.data``.
            Presumably shaped ``(batch_size, 1152, 10, 1, 1)`` -- TODO
            confirm against the caller.
        W: Transformation weights; after being repeated along dim 0 they
            must be viewable as ``(batch_size, 1152, 10, 16, 8)``.
        batch_size: Number of samples in ``x``.
        routing_iter: Number of routing iterations, at least 1.

    Returns:
        ``squash`` of the bias-shifted, coupling-weighted sum of the
        predictions over dim 1, computed in the final iteration.

    Raises:
        ValueError: If ``routing_iter`` is smaller than 1 (there would be
            no final iteration to produce an output).
    """
    if routing_iter < 1:
        raise ValueError("routing_iter must be >= 1, got %r" % (routing_iter,))

    # NOTE(review): all sizes below are hard-coded, and repeat-then-view
    # reinterprets memory instead of permuting axes -- confirm this is the
    # layout the weights were trained with before touching it.
    x_tiled = x.view(batch_size, 256, 1, 6, 6).repeat(1, 1, 10, 1, 1)
    x_view = x_tiled.view(batch_size, 1152, 10, 8, 1)
    weights = W.repeat(batch_size, 1, 1, 1, 1).view(batch_size, 1152, 10, 16, 8)

    dot_op = torch.matmul(weights, x_view)
    # Gradient-free copy used while the logits are still being refined.
    dot_op_stopped = dot_op.detach()
    dot_op_stopped_row = dot_op_stopped.reshape(batch_size, 1152, 10, 1, 16)

    last_iter = routing_iter - 1
    for r_iter in range(routing_iter):
        id_capsule = F.softmax(b_IJ, dim=2)
        if r_iter == last_iter:
            route_sum = torch.sum(id_capsule * dot_op, dim=1, keepdim=True) + self.bias
            V_J = squash(route_sum, self.epsilon)
        else:
            with torch.no_grad():
                route_sum = torch.sum(
                    id_capsule * dot_op_stopped, dim=1, keepdim=True) + self.bias
                v_tmp = squash(route_sum, self.epsilon)
                # matmul broadcasts v_tmp over the 1152 input capsules, so
                # the explicit tiling of the NumPy version is unnecessary.
                agreement = torch.matmul(dot_op_stopped_row, v_tmp)
                b_IJ.data += agreement
    return V_J
def forward(self, x):
    """
    Forward pass

    Args:
        x (FloatTensor): Input image of shape
            [batch_size, in_channels, height_input, width_input]

    Returns:
        caps_raw (FloatTensor): Squashed primary capsules laid out as
            [batch_size, out_channels, h, w, vector_length], where h and
            w are the last two sizes of the convolution output.
    """
    features = self.conv(x)
    _batch, _channels, h, w = features.shape

    # Split the channel axis into (capsule channel, vector component) ...
    grouped_shape = (-1, self.out_channels, self.vector_length, h, w)
    grouped = features.contiguous().view(*grouped_shape)  # [b, c, vec, h, w]
    # ... and move the vector component to the end.
    capsules = grouped.permute(0, 1, 3, 4, 2)  # [b, c, h, w, vec]
    return squash(capsules)
def routing(self, x):
    """
    Routing algorithm for capsule.

    The transposed input is replicated for every output unit and
    transformed by ``self.weight``.  ``self.num_routing`` iterations then
    refine the routing logits, which are shared across the batch (the
    agreement is averaged over the batch dimension).

    The softmax now names its axis.  Without ``dim`` PyTorch falls back
    to a deprecated implicit choice, which is dim 1 for this 4-D tensor
    (the input capsules).  The coupling coefficients of one input capsule
    have to sum to 1 across the output capsules, which is dim 2 of
    ``b_ij``.

    :return: vector output of capsule j
    """
    batch_size = x.size(0)

    x = x.transpose(1, 2)
    x = torch.stack([x] * self.num_unit, dim=2).unsqueeze(4)
    weight = torch.cat([self.weight] * batch_size, dim=0)

    # Transform inputs by weight matrix.
    u_hat = torch.matmul(weight, x)

    # All the routing logits (b_ij in the paper) are initialized to zero.
    b_ij = Variable(torch.zeros(1, self.in_channel, self.num_unit, 1))
    if self.cuda_enabled:
        b_ij = b_ij.cuda()

    for _ in range(self.num_routing):
        # Coupling coefficients c_ij: softmax over the output capsules.
        c_ij = F.softmax(b_ij, dim=2)
        c_ij = torch.cat([c_ij] * batch_size, dim=0).unsqueeze(4)

        # Equation 2 in the paper: weighted sum of the predictions.
        s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
        v_j = utils.squash(s_j)

        # Agreement of every prediction with its output, batch-averaged.
        v_j1 = torch.cat([v_j] * self.in_channel, dim=1)
        u_vj1 = torch.matmul(
            u_hat.transpose(3, 4), v_j1).squeeze(4).mean(dim=0, keepdim=True)

        # Update routing (b_ij)
        b_ij = b_ij + u_vj1

    return v_j.squeeze(1)
def no_routing(self, x):
    """
    Run every convolutional unit on ``x``, stack the outputs on a new
    axis 1, flatten each unit's output and squash the result.

    :return: squashed tensor of shape [batch, num_unit, -1]
    """
    n_samples = x.size(0)
    # One output per unit, stacked on axis 1.
    stacked = torch.stack([conv_unit(x) for conv_unit in self.conv_units], dim=1)
    # Flatten everything behind the unit axis.
    flat = stacked.view(n_samples, self.num_unit, -1)
    return utils.squash(flat)
def _routing_round(previous_weights, digit_caps_prediction):
    """Run one routing round and return the squashed round output.

    Shapes noted by the original author: routing weights
    (?, 1152, 10, 1, 1), predictions (?, 1152, 10, 16, 1), output
    (?, 1, 10, 16, 1).  No bias is added to the weighted predictions
    (left as an open question by the original author).
    """
    # Routing weights: softmax of the previous weights along axis 2.
    coupling = tf.nn.softmax(previous_weights, dim=2)
    # Weight every prediction, then collapse the input-capsule axis.
    pooled = tf.reduce_sum(
        tf.multiply(coupling, digit_caps_prediction),
        axis=1, keep_dims=True)
    # squash keeps the output vectors short.
    return squash(pooled, axis=-2)
def forward(self, x):
    """Route lower-layer capsules to this layer's capsules.

    Shapes from the original author's example: x [2 x 23936 x 8],
    weights [40 x 23936 x 8 x 32], x_hat [40 x 2 x 23936 x 1 x 32],
    returned outputs [40 x 2 x 1 x 1 x 32].
    """
    inputs = x[None, :, :, None, :]             # [1 x 2 x 23936 x 1 x 8]
    weights = self.weights[:, None, :, :, :]    # [40 x 1 x 23936 x 8 x 32]
    x_hat = torch.matmul(inputs, weights)

    # Routing logits, refined on every iteration but the last.
    logits = torch.zeros(*x_hat.shape).to(device)

    final_step = self.routing_iterations - 1
    for step in range(self.routing_iterations):
        # Routing weights across the lower-layer capsules (dim 2).
        coupling = softmax(logits, dim=2)
        # Weighted sum over the lower-layer capsules, then squash.
        outputs = squash((x_hat * coupling).sum(dim=2, keepdim=True))
        if step == final_step:
            break
        # Dot-product similarity between predictions and outputs:
        # [40 x 2 x 23936 x 1 x 1], broadcast onto the logits.
        logits = logits + (x_hat * outputs).sum(dim=-1, keepdim=True)

    return outputs
def forward(self, input):
    """Compute a squashed capsule output from one input and update ``self.b``.

    The input's channel axis must equal ``self.num_shared * self.in_dim``.
    ``self.b`` is replaced on every call, so repeated calls are stateful.
    """
    # TODO: make it work for batch sizes > 1
    _, in_channels, h, w = input.size()
    assert in_channels == self.num_shared * self.in_dim
    # squeeze() drops all singleton axes -- presumably the batch axis of
    # size 1; the data is then regrouped per shared transform.
    input = input.squeeze().view(self.num_shared, -1, self.in_dim)
    groups = input.chunk(self.num_shared)
    # Split every group into h * w chunks.
    u = [group.squeeze().chunk(h * w) for group in groups]
    # Apply the i-th transform in self.W to every vector of the i-th group.
    pred = [
        self.W[i](in_vec.squeeze()) for i, group in enumerate(u)
        for in_vec in group
    ]
    # NOTE(review): torch.stack(p) iterates over the first axis of each
    # prediction tensor and stacks it back -- looks like a no-op; verify.
    pred = torch.stack([torch.stack(p) for p in pred]).view(self.num_shared *
                                                           h * w, -1)
    # NOTE(review): F.softmax is called without `dim`, so the axis depends
    # on the rank of self.b (deprecated implicit behaviour) -- confirm the
    # intended axis.
    c = F.softmax(self.b)
    s = torch.matmul(c, pred)
    v = squash(s.t())
    # Add the product of predictions and output to the stored logits.
    self.b = torch.add(self.b, torch.matmul(pred, v))
    return v
def forward(self, pose):
    """Convolutional capsule routing over unfolded patches.

    Args:
        pose: Input poses, ``[b, A*C, h, w]`` (per the original notes).

    Returns:
        Output poses reshaped to ``[b, B*D, oh, ow]``, where ``oh`` and
        ``ow`` are the spatial sizes of the unfolded grid.

    The output grid used to be assumed square (``floor(sqrt(l))``), which
    fails or scrambles the layout for non-square inputs.  It is now
    derived from ``h``, ``w``, kernel, stride and padding.  Square inputs
    give the same result as before.
    """
    def as_pair(value):
        # F.unfold accepts an int or a per-axis pair for these settings.
        if isinstance(value, (tuple, list)):
            return tuple(value)
        return (value, value)

    batch, _, h, w = pose.shape
    # [b, ACkk, l]
    pose = F.unfold(pose, self.k, stride=self.stride, padding=self.pad)
    l = pose.shape[-1]
    # [b, A, C, kk, l]
    pose = pose.view(batch, self.A, self.C, self.kk, l)
    # [b, l, kk, A, C]
    pose = pose.permute(0, 4, 3, 1, 2).contiguous()
    # [b, l, kkA, C, 1]
    pose = pose.view(batch, l, self.kkA, self.C, 1)
    # [b, l, kkA, BD]
    pose_out = torch.matmul(self.W, pose).squeeze(-1)
    # [b, l, kkA, B, D]
    pose_out = pose_out.view(batch, l, self.kkA, self.B, self.D)

    # Routing logits: [b, l, kkA, B, 1]
    logits = pose.new_zeros(batch, l, self.kkA, self.B, 1)
    for _ in range(self.iters):
        c = torch.softmax(logits, dim=3)
        # [b, l, 1, B, D]
        s = (c * pose_out).sum(dim=2, keepdim=True)
        v = squash(s)
        logits = logits + (v * pose_out).sum(dim=-1, keepdim=True)

    # [b, l, B, D] -> [b, l, BD] -> [b, BD, l]
    v = v.squeeze(2)
    v = v.view(v.shape[0], l, -1)
    v = v.transpose(1, 2).contiguous()

    # Sliding-window count per axis (dilation 1, the unfold default).
    (k_h, k_w) = as_pair(self.k)
    (s_h, s_w) = as_pair(self.stride)
    (p_h, p_w) = as_pair(self.pad)
    oh = (h + 2 * p_h - k_h) // s_h + 1
    ow = (w + 2 * p_w - k_w) // s_w + 1
    if oh * ow != l:
        # Defensive fallback to the previous square-grid assumption.
        oh = ow = math.floor(l ** (1 / 2))

    # [b, BD, oh, ow]
    return v.view(v.shape[0], -1, oh, ow)
def no_routing(self, x):
    """Convolve, optionally regroup and re-weight, then squash.

    ``self.regrouping_type`` selects one of the 'local', 'adjacent' or
    'shuffle' regrouping helpers; any other value leaves the convolution
    output untouched.  With ``self.group_attention`` the output is
    multiplied by attention weights replicated for every unit.

    Returns the squashed tensor of shape ``[batch, num_unit, -1]``.
    """
    n_samples = x.size(0)
    unit = self.conv_units(x)
    side = unit.size(2)

    mode = self.regrouping_type
    if mode == 'local':
        unit = self.regrouping_local(unit)
    elif mode == 'adjacent':
        unit = self.regrouping_adjacent(unit)
    elif mode == 'shuffle':
        unit = self.regrouping_shuffle(unit, n_samples, side)

    if self.group_attention:
        weights = self.attention(unit)
        weights = torch.stack([weights] * self.num_unit, dim=1)
        # Bring the unit axis in front of the group axis before weighting.
        grouped = unit.view(n_samples, self.group_num, self.num_unit, side, side)
        unit = weights * grouped.transpose(1, 2).contiguous()

    flat = unit.view(n_samples, self.num_unit, -1)
    return utils.squash(flat, dim=2)
def _dynamic_routingV1(self, prior, cap_predictions):
    """
    doing dynamic routing with a Python for loop (static, unrolled graph)

    Uses `range` instead of the Python-2-only `xrange`, and the `axis`
    keyword of `tf.nn.softmax` instead of the deprecated `dim`.
    :arg
        prior: log prior for scaling with shape [10, num_caps]
        cap_predictions: predictions from layer below with shape
                         [None, 10, num_caps, 16]
    :return
        digit_caps: digit capsules with shape [None, 10, 16]
    """
    # prior shape: [1, 10, num_caps]
    prior = tf.expand_dims(prior, 0)

    for idx in range(cfg.ROUTING_ITERS):
        with tf.name_scope('routing_%s' % idx):
            # c shape: [1, 10, num_caps] on the first pass
            c = tf.nn.softmax(prior, axis=1)
            # c_t shape: [1, 10, num_caps, 1]
            c_t = tf.expand_dims(c, axis=-1)
            # s_t shape: [None, 10, num_caps, 16]
            s_t = tf.multiply(cap_predictions, c_t)

            # for each capsule in the layer after, add all the weighted
            # capsules to get the capsule input for it.
            # s_j = Sum_i (c_ij u_hat_j|i); s shape: [None, 10, 16]
            s = tf.reduce_sum(s_t, axis=[2])
            # digit_caps shape: [None, 10, 16]
            digit_caps = squash(s)

            # agreement u_hat_j|i * v_j;
            # delta_prior shape: [None, 10, num_caps]
            delta_prior = tf.reduce_sum(
                tf.multiply(tf.expand_dims(digit_caps, axis=2),
                            cap_predictions),
                axis=[-1])
            prior = prior + delta_prior

    # shape [None, 10, 16]
    return digit_caps
def forward(self, obs, compute_pi=True, compute_log_pi=True):
    """Compute the policy outputs for ``obs``.

    Args:
        obs: Observation batch fed to ``self.trunk``.
        compute_pi: Whether to sample an action ``pi = mu + noise * std``.
        compute_log_pi: Whether to evaluate ``gaussian_logprob`` for the
            sampled noise.  Requires ``compute_pi=True``, because the
            noise only exists when an action is sampled.

    Returns:
        ``(mu, pi, log_pi, log_std)``.  ``mu``, ``pi`` and ``log_pi`` are
        passed through ``squash`` first; ``pi`` and ``log_pi`` enter it
        as ``None`` when not requested.

    Raises:
        ValueError: If ``compute_log_pi`` is set without ``compute_pi``.
            This combination used to fail with an ``UnboundLocalError``
            on ``noise``.
    """
    if compute_log_pi and not compute_pi:
        raise ValueError(
            "compute_log_pi=True requires compute_pi=True: the "
            "log-probability is evaluated for the sampled noise")

    mu, log_std = self.trunk(obs).chunk(2, dim=-1)

    # constrain log_std inside [log_std_min, log_std_max]: tanh maps to
    # [-1, 1], which is then rescaled linearly.
    log_std = torch.tanh(log_std)
    log_std = self.log_std_min + 0.5 * (
        self.log_std_max - self.log_std_min) * (log_std + 1)

    pi = None
    log_pi = None
    if compute_pi:
        std = log_std.exp()
        noise = torch.randn_like(mu)
        pi = mu + noise * std
        if compute_log_pi:
            log_pi = gaussian_logprob(noise, log_std)

    mu, pi, log_pi = squash(mu, pi, log_pi)
    return mu, pi, log_pi, log_std
def call(self, x):
    """Compute digit capsules from primary capsules.

    Shapes from the original notes: x [None, 1152, 8], weight
    [1, 1152, 10, 16, 8], result [None, 10, 16].  The routing logits are
    computed on gradient-stopped predictions; the final weighted sum uses
    the gradient-carrying ones.
    """
    # [None, 1152, 8] -> [None, 1152, 1, 8, 1]
    x = tf.expand_dims(tf.expand_dims(x, axis=2), axis=-1)

    # Candidate capsule for every (primary, digit) pair; the trailing
    # singleton axis of the matmul result is squeezed away.
    u = tf.squeeze(tf.matmul(self.weight, x), axis=-1)

    # Routing must not back-propagate into the predictions.
    logits = self.routing(tf.stop_gradient(u))

    # Coupling coefficients sum to 1 for every primary capsule.
    coupling = tf.nn.softmax(logits, axis=2)

    # Weighted sum over the primary capsules, squashed so every capsule
    # has length < 1.
    weighted_sum = tf.reduce_sum(tf.multiply(u, coupling), axis=1)
    return squash(weighted_sum, axis=-1)