def call(self, inputs, training=None):
    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        # One step of power iteration, approximating the leading
        # singular vectors of W.
        _u = u
        _v = _l2normalize(K.dot(_u, K.transpose(W)))
        _u = _l2normalize(K.dot(_v, W))
        return _u, _v

    if self.spectral_normalization:
        W_shape = self.kernel.shape.as_list()
        # Flatten the tensor
        W_reshaped = K.reshape(self.kernel, [-1, W_shape[-1]])
        _u, _v = power_iteration(W_reshaped, self.u)
        # Calculate sigma (the approximate largest singular value)
        sigma = K.dot(_v, W_reshaped)
        sigma = K.dot(sigma, K.transpose(_u))
        # Normalize the kernel by sigma
        W_bar = W_reshaped / sigma
        # Reshape the weight tensor
        if training in {0, False}:
            W_bar = K.reshape(W_bar, W_shape)
        else:
            with tf.control_dependencies([self.u.assign(_u)]):
                W_bar = K.reshape(W_bar, W_shape)
        # Update the weight
        self.kernel = W_bar
    if self.rank == 1:
        outputs = K.conv1d(
            inputs,
            self.kernel,
            strides=self.strides[0],
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate[0])
    if self.rank == 2:
        outputs = K.conv2d(
            inputs,
            self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.rank == 3:
        outputs = K.conv3d(
            inputs,
            self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.use_bias:
        outputs = K.bias_add(
            outputs,
            self.bias,
            data_format=self.data_format)
    if self.activation is not None:
        return self.activation(outputs)
    return outputs
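
# A self-contained NumPy sketch (illustrative names, not part of the layer)
# showing that the power iteration above approaches the largest singular
# value of the flattened kernel, which is what `sigma` estimates in `call`:
import numpy as np

def _np_l2normalize(v, eps=1e-12):
    return v / (np.sum(v ** 2) ** 0.5 + eps)

rng = np.random.default_rng(0)
W = rng.standard_normal((27, 64))   # flattened kernel: (k*k*c_in, filters)
u = rng.standard_normal((1, 64))    # persistent estimate, like `self.u`
for _ in range(100):                # the layer performs one step per call
    v = _np_l2normalize(u @ W.T)
    u = _np_l2normalize(v @ W)
sigma = (v @ W @ u.T).item()        # same contraction as in `call`
print(sigma, np.linalg.svd(W, compute_uv=False)[0])  # nearly identical
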
def call(self, u_vecs, **kwargs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

    # routing logits, shape = [None, num_capsule, input_num_capsule]
    b = K.zeros_like(u_hat_vecs[:, :, :, 0])
    for i in range(self.routings):
        c = softmax(b, 1)
        o = K.batch_dot(c, u_hat_vecs, [2, 2])
        if K.backend() == 'theano':
            o = K.sum(o, axis=1)
        if i < self.routings - 1:
            o = K.l2_normalize(o, -1)
            b = K.batch_dot(o, u_hat_vecs, [2, 3])
            if K.backend() == 'theano':
                b = K.sum(b, axis=1)
    return self.activation(o)
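
# The routing loop above relies on two helpers that the snippet does not
# define. Common definitions, assumed here, are an axis-aware softmax and
# the capsule "squash" nonlinearity (often passed as `self.activation`):
from keras import backend as K

def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)

def squash(x, axis=-1):
    # Shrinks short vectors toward zero and long vectors toward unit length.
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (1.0 + s_squared_norm)
    return scale * x
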
def call(self, inputs):
    # Scale the stored kernel by the runtime coefficient
    scaled_kernel = self.kernel * self.runtime_coeff
    if self.rank == 1:
        # Average the 2 shifted copies of the padded kernel,
        # blurring it along the spatial axis
        kernel = Ke.pad(scaled_kernel, [[1, 1], [0, 0], [0, 0]])
        fused_kernel = Ke.add_n([kernel[1:], kernel[:-1]]) / 2.0
        outputs = K.conv1d(
            inputs,
            fused_kernel,
            strides=self.strides[0],
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate[0])
    if self.rank == 2:
        # Average the 4 shifted copies of the padded kernel,
        # blurring it along both spatial axes
        kernel = Ke.pad(scaled_kernel, [[1, 1], [1, 1], [0, 0], [0, 0]])
        fused_kernel = Ke.add_n([kernel[1:, 1:],
                                 kernel[:-1, 1:],
                                 kernel[1:, :-1],
                                 kernel[:-1, :-1]]) / 4.0
        outputs = K.conv2d(
            inputs,
            fused_kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.rank == 3:
        # Average the 8 shifted copies of the padded kernel,
        # blurring it along all three spatial axes
        kernel = Ke.pad(scaled_kernel,
                        [[1, 1], [1, 1], [1, 1], [0, 0], [0, 0]])
        fused_kernel = Ke.add_n([kernel[1:, 1:, 1:],
                                 kernel[1:, 1:, :-1],
                                 kernel[1:, :-1, 1:],
                                 kernel[1:, :-1, :-1],
                                 kernel[:-1, 1:, 1:],
                                 kernel[:-1, 1:, :-1],
                                 kernel[:-1, :-1, 1:],
                                 kernel[:-1, :-1, :-1]]) / 8.0
        outputs = K.conv3d(
            inputs,
            fused_kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.use_bias:
        outputs = K.bias_add(
            outputs,
            self.bias,
            data_format=self.data_format)
    if self.activation is not None:
        return self.activation(outputs)
    return outputs
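
# `Ke` is not part of keras.backend: it appears to be an extension module
# supplying `pad` and `add_n`, which the plain backend lacks. A plausible
# shim (an assumption, valid only for the TensorFlow backend):
import tensorflow as tf

class Ke:
    pad = staticmethod(tf.pad)      # Ke.pad(kernel, paddings)
    add_n = staticmethod(tf.add_n)  # Ke.add_n(list_of_tensors)
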
def call(self, inputs):
    # Mask the kernel with the connection matrix
    masked_kernel = self.kernel * self.connections
    # Apply the convolution
    if self.rank == 1:
        outputs = K.conv1d(
            inputs,
            masked_kernel,
            strides=self.strides[0],
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate[0])
    if self.rank == 2:
        outputs = K.conv2d(
            inputs,
            masked_kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.rank == 3:
        outputs = K.conv3d(
            inputs,
            masked_kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.use_bias:
        outputs = K.bias_add(
            outputs,
            self.bias,
            data_format=self.data_format)
    if self.activation is not None:
        return self.activation(outputs)
    return outputs
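
# A minimal sketch of where the binary `connections` mask could come from;
# the method name and its argument are assumptions, since the snippet only
# shows `call`:
def set_connections(self, connections_array):
    # 1 keeps the corresponding kernel weight, 0 prunes that connection;
    # the mask has the same shape as the kernel and is not trainable.
    self.connections = K.constant(connections_array, dtype=K.floatx())
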
def call(self, inputs, training=None):
    # Scale the stored kernel by the runtime coefficient
    scaled_kernel = self.kernel * self.runtime_coeff
    if self.rank == 1:
        outputs = K.conv1d(
            inputs,
            scaled_kernel,
            strides=self.strides[0],
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate[0])
    if self.rank == 2:
        outputs = K.conv2d(
            inputs,
            scaled_kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.rank == 3:
        outputs = K.conv3d(
            inputs,
            scaled_kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    if self.use_bias:
        outputs = K.bias_add(
            outputs,
            self.bias,
            data_format=self.data_format)
    if self.activation is not None:
        outputs = self.activation(outputs)
    return outputs
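
# `runtime_coeff` is characteristic of equalized-learning-rate layers
# (as in ProGAN/StyleGAN): weights are stored at unit scale and multiplied
# by a He-init constant on every forward pass. A sketch of how the
# coefficient might be computed (an assumption; the snippet does not show it):
import numpy as np

def he_runtime_coeff(kernel_shape, gain=np.sqrt(2.0)):
    fan_in = int(np.prod(kernel_shape[:-1]))  # spatial dims * input channels
    return gain / np.sqrt(fan_in)
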
def attention(self, pre_q, pre_v, pre_k, out_seq_len: int, d_model: int,
              training=None):
    """
    Calculates the output of the attention once the affine transformations
    of the inputs are done. Here are the shapes of the arguments:
    :param pre_q: (batch_size, q_seq_len, num_heads, d_model // num_heads)
    :param pre_v: (batch_size, v_seq_len, num_heads, d_model // num_heads)
    :param pre_k: (batch_size, k_seq_len, num_heads, d_model // num_heads)
    :param out_seq_len: the length of the output sequence
    :param d_model: dimensionality of the model (as in the paper)
    :param training: Passed by Keras. Should not be set manually.
        Optional scalar tensor indicating whether we're in the training
        or the inference phase.
    """
    # shaping Q and V into (batch_size, num_heads, seq_len, d_model // heads)
    q = K.permute_dimensions(pre_q, [0, 2, 1, 3])
    v = K.permute_dimensions(pre_v, [0, 2, 1, 3])

    if self.compression_window_size is None:
        k_transposed = K.permute_dimensions(pre_k, [0, 2, 3, 1])
    else:
        # Memory-compressed attention described in the paper
        # "Generating Wikipedia by Summarizing Long Sequences"
        # (https://arxiv.org/pdf/1801.10198.pdf)
        # It compresses keys and values using a 1D convolution, which
        # reduces the size of Q * K_transposed from roughly seq_len^2
        # to convoluted_seq_len^2. If we use a strided convolution with
        # window size = 3 and stride = 3, the memory requirements of such
        # memory-compressed attention will be 9 times smaller than
        # those of the original version.
        if self.use_masking:
            raise NotImplementedError(
                "Masked memory-compressed attention has not "
                "been implemented yet")
        k = K.permute_dimensions(pre_k, [0, 2, 1, 3])
        k, v = [
            K.reshape(
                # Step 4: return the result to its original dimensions
                # (batch_size, num_heads, seq_len, d_model // heads)
                K.bias_add(
                    # Step 3: ... then add bias
                    K.conv1d(
                        # Step 2: we "compress" K and V using a strided conv
                        K.reshape(
                            # Step 1: we reshape K and V to
                            # (batch * num_heads, seq_len, d_model // heads)
                            item,
                            (-1,
                             K.int_shape(item)[-2],
                             d_model // self.num_heads)),
                        kernel,
                        strides=self.compression_window_size,
                        padding='valid',
                        data_format='channels_last'),
                    bias,
                    data_format='channels_last'),
                # new shape
                K.concatenate([
                    K.shape(item)[:2],
                    [-1, d_model // self.num_heads]]))
            for item, kernel, bias in (
                (k, self.k_conv_kernel, self.k_conv_bias),
                (v, self.v_conv_kernel, self.v_conv_bias))]
        k_transposed = K.permute_dimensions(k, [0, 1, 3, 2])

    # shaping K into (batch_size, num_heads, d_model // heads, seq_len)
    # for further matrix multiplication
    sqrt_d = K.constant(np.sqrt(d_model // self.num_heads),
                        dtype=K.floatx())
    q_shape = K.int_shape(q)
    k_t_shape = K.int_shape(k_transposed)
    v_shape = K.int_shape(v)
    # before performing batch_dot, all tensors are converted to the 3D
    # shape (batch_size * num_heads, rows, cols) to make sure batch_dot
    # behaves identically on all backends
    attention_heads = K.reshape(
        K.batch_dot(
            self.apply_dropout_if_needed(
                K.softmax(
                    self.mask_attention_if_needed(
                        K.batch_dot(
                            K.reshape(q, (-1,) + q_shape[-2:]),
                            K.reshape(k_transposed,
                                      (-1,) + k_t_shape[-2:]))
                        / sqrt_d)),
                training=training),
            K.reshape(v, (-1,) + v_shape[-2:])),
        (-1, self.num_heads, q_shape[-2], v_shape[-1]))
    attention_heads_merged = K.reshape(
        K.permute_dimensions(attention_heads, [0, 2, 1, 3]),
        (-1, d_model))
    attention_out = K.reshape(
        K.dot(attention_heads_merged, self.output_weights),
        (-1, out_seq_len, d_model))
    return attention_out
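
# A self-contained NumPy sketch of the core computation above, stripped of
# masking, dropout, and memory compression, to make the shape bookkeeping
# explicit (all names here are illustrative):
import numpy as np

def scaled_dot_product_attention(q, k, v):
    # q, k, v: (batch_size * num_heads, seq_len, d_head)
    d_head = q.shape[-1]
    logits = q @ k.transpose(0, 2, 1) / np.sqrt(d_head)
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v  # (batch_size * num_heads, seq_len, d_head)
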