def forward( self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, key_mask: Optional[torch.Tensor] = None, number_of_keys: int = -1, number_of_queries: int = -1, ) -> Tuple[torch.Tensor, torch.Tensor]: b = -1 # the batch size # This is to avoid using .size() when possible as Barracuda does not support n_q = number_of_queries if number_of_queries != -1 else query.size(1) n_k = number_of_keys if number_of_keys != -1 else key.size(1) query = self.fc_q(query) # (b, n_q, h*d) key = self.fc_k(key) # (b, n_k, h*d) value = self.fc_v(value) # (b, n_k, h*d) query = query.reshape(b, n_q, self.n_heads, self.embedding_size) key = key.reshape(b, n_k, self.n_heads, self.embedding_size) value = value.reshape(b, n_k, self.n_heads, self.embedding_size) query = query.permute([0, 2, 1, 3]) # (b, h, n_q, emb) # The next few lines are equivalent to : key.permute([0, 2, 3, 1]) # This is a hack, ONNX will compress two permute operations and # Barracuda will not like seeing `permute([0,2,3,1])` key = key.permute([0, 2, 1, 3]) # (b, h, emb, n_k) key -= 1 key += 1 key = key.permute([0, 1, 3, 2]) # (b, h, emb, n_k) qk = torch.matmul(query, key) # (b, h, n_q, n_k) if key_mask is None: qk = qk / (self.embedding_size**0.5) else: key_mask = key_mask.reshape(b, 1, 1, n_k) qk = (1 - key_mask) * qk / (self.embedding_size** 0.5) + key_mask * self.NEG_INF att = torch.softmax(qk, dim=3) # (b, h, n_q, n_k) value = value.permute([0, 2, 1, 3]) # (b, h, n_k, emb) value_attention = torch.matmul(att, value) # (b, h, n_q, emb) value_attention = value_attention.permute([0, 2, 1, 3]) # (b, n_q, h, emb) value_attention = value_attention.reshape( b, n_q, self.n_heads * self.embedding_size) # (b, n_q, h*emb) out = self.fc_out(value_attention) # (b, n_q, emb) return out, att
def update(self, vector_input: torch.Tensor) -> None:
    """Fold a batch of observations into the running mean/variance.

    Uses a parallel (Chan et al.) merge of batch statistics into the running
    statistics, then writes the results back in place so any registered
    buffers keep their tensor identity.

    Args:
        vector_input: Batch of observations; dim 0 is the batch dimension,
            remaining dims must broadcast with ``self.running_mean``.
    """
    # no_grad: this is pure statistics bookkeeping. Without it, a
    # grad-requiring input would make every intermediate op build an
    # autograd graph that is never backpropagated (wasted memory). The
    # sibling reference-rebinding variant of update() already does this.
    with torch.no_grad():
        steps_increment = vector_input.size()[0]
        total_new_steps = self.normalization_steps + steps_increment
        # Delta against the old mean, reused in the variance update below.
        input_to_old_mean = vector_input - self.running_mean
        new_mean = self.running_mean + (input_to_old_mean / total_new_steps).sum(0)
        input_to_new_mean = vector_input - new_mean
        # M2-style accumulation: sum of (x - new_mean) * (x - old_mean).
        new_variance = self.running_variance + (input_to_new_mean *
                                                input_to_old_mean).sum(0)
        # Update in-place
        self.running_mean.data.copy_(new_mean.data)
        self.running_variance.data.copy_(new_variance.data)
        self.normalization_steps.data.copy_(total_new_steps.data)
def update(self, vector_input: torch.Tensor) -> None:
    """Merge a batch of observations into the running normalization stats.

    Applies a parallel-variance (Chan et al.) merge of the batch statistics
    with the accumulated ones, under ``no_grad`` since this is bookkeeping,
    not a differentiable computation.

    Args:
        vector_input: Observation batch; dim 0 is the batch dimension and
            the remaining dims must broadcast with ``self.running_mean``.
    """
    with torch.no_grad():
        batch_size = vector_input.size()[0]
        updated_steps = self.normalization_steps + batch_size
        # Deviation from the old mean is reused for the variance merge.
        delta_old = vector_input - self.running_mean
        updated_mean = self.running_mean + (delta_old / updated_steps).sum(0)
        delta_new = vector_input - updated_mean
        # M2-style accumulation: sum of (x - new_mean) * (x - old_mean).
        updated_variance = self.running_variance + (delta_new * delta_old).sum(0)
        # Rebind the attribute references instead of copying element-wise;
        # this is much faster than an in-place data update.
        self.running_mean = updated_mean
        self.running_variance = updated_variance
        self.normalization_steps = updated_steps