def __call__(self, start_logits: torch.Tensor, end_logits: torch.Tensor,
             match_logits: torch.Tensor, mask: torch.BoolTensor) -> torch.LongTensor:
    mask = mask.bool()
    batch_size, seq_len = start_logits.size()
    # match label pred, [batch_size, seq_len, seq_len]
    match_preds = match_logits > 0
    # For masking, keeping either match_preds or the start/end predictions would suffice;
    # here we mask match_preds on both of its sequence dimensions.
    match_preds = match_preds \
        & mask.unsqueeze(-1).expand(-1, -1, seq_len) \
        & mask.unsqueeze(1).expand(-1, seq_len, -1)
    # [batch_size, seq_len]
    start_preds = start_logits > 0
    start_preds = start_preds & mask
    # [batch_size, seq_len]
    end_preds = end_logits > 0
    end_preds = end_preds & mask
    # Final match labels: a span (i, j) is predicted only if i is a predicted start,
    # j is a predicted end, and the pairwise match logit is positive.
    match_preds = (match_preds
                   & start_preds.unsqueeze(-1).expand(-1, -1, seq_len)
                   & end_preds.unsqueeze(1).expand(-1, seq_len, -1))
    return match_preds
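# Toy illustration of the start/end/match combination above (hypothetical standalone
# tensors, batch_size = 1, seq_len = 3; a minimal sketch, not part of the class):
start_preds = torch.tensor([[True, False, False]])    # token 0 may start a span
end_preds = torch.tensor([[False, False, True]])      # token 2 may end a span
match_preds = torch.ones(1, 3, 3, dtype=torch.bool)   # all pairwise match logits positive
span = match_preds & start_preds.unsqueeze(-1) & end_preds.unsqueeze(1)
# span[0, i, j] is True only at (i=0, j=2): the span from token 0 through token 2.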
def _construct_loss(
    self,
    arc_scores: torch.Tensor,
    arc_tag_logits: torch.Tensor,
    arc_tags: torch.Tensor,
    mask: torch.BoolTensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Computes the arc and tag loss for an adjacency matrix.

    # Parameters

    arc_scores : `torch.Tensor`, required.
        A tensor of shape (batch_size, sequence_length, sequence_length) used to generate a
        binary classification decision for whether an edge is present between two words.
    arc_tag_logits : `torch.Tensor`, required.
        A tensor of shape (batch_size, sequence_length, sequence_length, num_tags) used to
        generate a distribution over edge tags for a given edge.
    arc_tags : `torch.Tensor`, required.
        A tensor of shape (batch_size, sequence_length, sequence_length).
        The labels for every arc.
    mask : `torch.BoolTensor`, required.
        A mask of shape (batch_size, sequence_length), denoting unpadded
        elements in the sequence.

    # Returns

    arc_nll : `torch.Tensor`, required.
        The negative log likelihood from the arc loss.
    tag_nll : `torch.Tensor`, required.
        The negative log likelihood from the arc tag loss.
    """
    arc_indices = (arc_tags != -1).float()
    # Make the arc tags not have negative values anywhere
    # (by default, no edge is indicated with -1).
    arc_tags = arc_tags * arc_indices
    arc_nll = self._arc_loss(arc_scores, arc_indices) * mask.unsqueeze(1) * mask.unsqueeze(2)
    # We want the mask for the tags to only include the unmasked words
    # and we only care about the loss with respect to the gold arcs.
    tag_mask = mask.unsqueeze(1) * mask.unsqueeze(2) * arc_indices

    batch_size, sequence_length, _, num_tags = arc_tag_logits.size()
    original_shape = [batch_size, sequence_length, sequence_length]
    reshaped_logits = arc_tag_logits.view(-1, num_tags)
    reshaped_tags = arc_tags.view(-1)
    tag_nll = (
        self._tag_loss(reshaped_logits, reshaped_tags.long()).view(original_shape) * tag_mask
    )

    valid_positions = tag_mask.sum()

    arc_nll = arc_nll.sum() / valid_positions.float()
    tag_nll = tag_nll.sum() / valid_positions.float()
    return arc_nll, tag_nll
def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor = None):
    if mask is not None:
        tokens = tokens * mask.unsqueeze(-1)

    # Our input has shape `(batch_size, num_tokens, embedding_dim)`, so we sum out the
    # `num_tokens` dimension.
    summed = tokens.sum(1)

    if self._averaged:
        if mask is not None:
            lengths = get_lengths_from_binary_sequence_mask(mask)
            length_mask = lengths > 0
            # Set any length 0 to 1, to avoid dividing by zero.
            lengths = torch.max(lengths, lengths.new_ones(1))
        else:
            lengths = tokens.new_full((1,), fill_value=tokens.size(1))
            length_mask = None

        summed = summed / lengths.unsqueeze(-1).float()

        if length_mask is not None:
            summed = summed * (length_mask > 0).unsqueeze(-1)

    return summed
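# Toy check of the masked average above (hypothetical standalone tensors, not the module
# itself): two tokens are real, one is padding, so the mean divides by 2, not 3.
tokens = torch.tensor([[[2.0], [4.0], [100.0]]])           # (batch=1, num_tokens=3, dim=1)
mask = torch.tensor([[True, True, False]])
summed = (tokens * mask.unsqueeze(-1)).sum(1)              # padded 100.0 is zeroed out
mean = summed / mask.sum(-1, keepdim=True).clamp(min=1)    # clamp guards against empty rows
# mean == tensor([[3.0]]); the padding value never leaks into the average.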
def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor):
    if mask is not None:
        tokens = tokens * mask.unsqueeze(-1)

    # Our input is expected to have shape `(batch_size, num_tokens, embedding_dim)`.  The
    # convolution layers expect input of shape `(batch_size, in_channels, sequence_length)`,
    # where the conv layer `in_channels` is our `embedding_dim`.  We thus need to transpose the
    # tensor first.
    tokens = torch.transpose(tokens, 1, 2)
    # Each convolution layer returns output of size `(batch_size, num_filters, pool_length)`,
    # where `pool_length = num_tokens - ngram_size + 1`.  We then apply an activation function,
    # then do max pooling over each filter for the whole input sequence.  Because our max
    # pooling is simple, we just use `torch.max`.  The resultant tensor has shape
    # `(batch_size, num_conv_layers * num_filters)`, which then gets projected using the
    # projection layer, if requested.

    filter_outputs = []
    for i in range(len(self._convolution_layers)):
        convolution_layer = getattr(self, "conv_layer_{}".format(i))
        filter_outputs.append(self._activation(convolution_layer(tokens)).max(dim=2)[0])

    # Now we have a list of `num_conv_layers` tensors of shape `(batch_size, num_filters)`.
    # Concatenating them gives us a tensor of shape `(batch_size, num_filters * num_conv_layers)`.
    maxpool_output = (
        torch.cat(filter_outputs, dim=1) if len(filter_outputs) > 1 else filter_outputs[0]
    )

    if self.projection_layer:
        result = self.projection_layer(maxpool_output)
    else:
        result = maxpool_output
    return result
def forward(
    self,
    previous_state: Dict[str, torch.Tensor],
    encoder_outputs: torch.Tensor,
    source_mask: torch.BoolTensor,
    previous_steps_predictions: torch.Tensor,
    previous_steps_mask: Optional[torch.BoolTensor] = None,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
    source_mask = source_mask.unsqueeze(-2)
    future_mask = Variable(
        subsequent_mask(previous_steps_predictions.size(-2),
                        device=source_mask.device).type_as(source_mask.data)
    )
    if previous_steps_mask is None:
        previous_steps_mask = future_mask
    else:
        previous_steps_mask = previous_steps_mask.unsqueeze(-2) & future_mask
    previous_steps_predictions = previous_steps_predictions * self._embed_scale
    if self._positional_embedder:
        previous_steps_predictions = self._positional_embedder(previous_steps_predictions)
    previous_steps_predictions = self._dropout(previous_steps_predictions)
    decoded = self._self_attention(previous_steps_predictions, encoder_outputs,
                                   source_mask, previous_steps_mask)
    return {}, decoded
def forward(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.BoolTensor = None,
) -> torch.Tensor:
    if mask is not None:
        # Same mask applied to all h heads.
        # Shape (batch_size, num_heads, timesteps, timesteps)
        mask = mask.unsqueeze(1).expand([-1, self.num_heads, -1, -1])

    nbatches = query.size(0)

    # 1) Do all the linear projections in batch from d_model => h x d_k
    query, key, value = [
        layer(x).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        for layer, x in zip(self.linears, (query, key, value))
    ]

    # 2) Apply attention on all the projected vectors in batch.
    x, _ = attention(query, key, value, mask=mask, dropout=self.dropout)

    # 3) "Concat" using a view and apply a final linear.
    x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.num_heads * self.d_k)
    return self.linears[-1](x)
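# The `attention` helper used above is assumed to be the scaled dot-product attention from
# "The Annotated Transformer"; a minimal stand-in consistent with that usage (mask
# convention assumed: True = keep, matching the BoolTensor padding masks in this file):
import math

def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(~mask, -1e9)
    p_attn = torch.nn.functional.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn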
def masked_log_softmax(vector: torch.Tensor, mask: torch.BoolTensor, dim: int = -1) -> torch.Tensor:
    """
    `torch.nn.functional.log_softmax(vector)` does not work if some elements of `vector` should be
    masked.  This performs a log_softmax on just the non-masked portions of `vector`.  Passing
    `None` in for the mask is also acceptable; you'll just get a regular log_softmax.

    `vector` can have an arbitrary number of dimensions; the only requirement is that `mask` is
    broadcastable to `vector's` shape.  If `mask` has fewer dimensions than `vector`, we will
    unsqueeze on dimension 1 until they match.  If you need a different unsqueezing of your mask,
    do it yourself before passing the mask into this function.

    In the case that the input vector is completely masked, the return value of this function is
    arbitrary, but not `nan`.  You should be masking the result of whatever computation comes out
    of this in that case, anyway, so the specific values returned shouldn't matter.  Also, the way
    that we deal with this case relies on having single-precision floats; mixing half-precision
    floats with fully-masked vectors will likely give you `nans`.

    If your logits are all extremely negative (i.e., the max value in your logit vector is -50 or
    lower), the way we handle masking here could mess you up.  But if you've got logit values that
    extreme, you've got bigger problems than this.
    """
    if mask is not None:
        while mask.dim() < vector.dim():
            mask = mask.unsqueeze(1)
        # vector + mask.log() is an easy way to zero out masked elements in logspace, but it
        # results in nans when the whole vector is masked.  We need a very small value instead of
        # a zero in the mask for these cases.
        vector = vector + (mask + 1e-30).log()
    return torch.nn.functional.log_softmax(vector, dim=dim)
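# Minimal usage sketch for masked_log_softmax (assumes the function above is in scope):
logits = torch.tensor([[1.0, 2.0, 3.0]])
mask = torch.tensor([[True, True, False]])
log_probs = masked_log_softmax(logits, mask)
# exp(log_probs) is approximately [[0.2689, 0.7311, 0.0]]: the masked position gets
# (numerically) zero probability and the unmasked positions renormalize among themselves.
print(log_probs.exp())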
def forward(self, tensors: List[torch.Tensor], mask: torch.BoolTensor = None) -> torch.Tensor:
    """
    Compute a weighted average of the `tensors`.  The input tensors can be any shape
    with at least two dimensions, but must all be the same shape.

    When `do_layer_norm=True`, the `mask` is required input.  If the `tensors` are
    dimensioned  `(dim_0, ..., dim_{n-1}, dim_n)`, then the `mask` is dimensioned
    `(dim_0, ..., dim_{n-1})`, as in the typical case with `tensors` of shape
    `(batch_size, timesteps, dim)` and `mask` of shape `(batch_size, timesteps)`.

    When `do_layer_norm=False` the `mask` is ignored.
    """
    assert len(tensors) == self.mixture_size

    def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
        tensor_masked = tensor * broadcast_mask
        mean = torch.sum(tensor_masked) / num_elements_not_masked
        sum_value = torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2)
        variance = sum_value / num_elements_not_masked
        return (tensor - mean) / torch.sqrt(variance + tiny_value_of_dtype(variance.dtype))

    normed_weights = torch.nn.functional.softmax(
        torch.cat([parameter for parameter in self.scalar_parameters]), dim=0
    )
    if torch.cuda.is_available():
        normed_weights = normed_weights.cuda()
        self.gamma = self.gamma.to('cuda')
    normed_weights = torch.split(normed_weights, split_size_or_sections=1)

    if not self.do_layer_norm:
        pieces = []
        for weight, tensor in zip(normed_weights, tensors):
            pieces.append(weight * tensor)
        return self.gamma * sum(pieces)
    else:
        broadcast_mask = mask.unsqueeze(-1)
        input_dim = tensors[0].size(-1)
        # Padded positions are 0 in the mask and real tokens are 1, so sum(mask) is the total
        # number of tokens; multiplying by input_dim (e.g. 768) gives the number of unmasked elements.
        num_elements_not_masked = torch.sum(mask) * input_dim

        pieces = []
        for weight, tensor in zip(normed_weights, tensors):
            pieces.append(weight * _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked))
        return self.gamma * sum(pieces)
def batched_index_or(batched_set: torch.BoolTensor, mapping: torch.BoolTensor) -> torch.BoolTensor:
    """
    batched_set : shape (batch_size, set capacity)
    mapping : shape (batch_size, set capacity, "constants")

    returns a bool tensor R of shape (batch_size, "constants")
    R[b,c] = True iff \exists l, batched_set[b,l] AND mapping[b,l,c]
    """
    # torch.bmm is not implemented for bool tensors, so cast to float for the batched matmul.
    # shape (batch_size, "constants")
    result = torch.bmm(batched_set.float().unsqueeze(1), mapping.float()).squeeze(1)
    return result > 0
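# Hedged usage sketch for batched_index_or: one batch item, a set with 3 slots and a mapping
# onto 4 "constants".  Slots 0 and 2 are active; slot 0 maps to constant 0 and slot 2 maps to
# constants 2 and 3, so the result is [True, False, True, True].
batched_set = torch.tensor([[True, False, True]])                  # (1, 3)
mapping = torch.tensor([[[True, False, False, False],
                         [False, True, False, False],
                         [False, False, True, True]]])             # (1, 3, 4)
print(batched_index_or(batched_set, mapping))                      # tensor([[ True, False,  True,  True]])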
def _greedy_decode(
    self,
    head_tag_representation: torch.Tensor,
    child_tag_representation: torch.Tensor,
    attended_arcs: torch.Tensor,
    mask: torch.BoolTensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Decodes the head and head tag predictions by decoding the unlabeled arcs
    independently for each word and then again, predicting the head tags of
    these greedily chosen arcs independently.  Note that this method of decoding
    is not guaranteed to produce trees (i.e. there may be multiple roots,
    or cycles when children are attached to their parents).

    # Parameters

    head_tag_representation : `torch.Tensor`, required.
        A tensor of shape (batch_size, sequence_length, tag_representation_dim),
        which will be used to generate predictions for the dependency tags
        for the given arcs.
    child_tag_representation : `torch.Tensor`, required
        A tensor of shape (batch_size, sequence_length, tag_representation_dim),
        which will be used to generate predictions for the dependency tags
        for the given arcs.
    attended_arcs : `torch.Tensor`, required.
        A tensor of shape (batch_size, sequence_length, sequence_length) used to generate
        a distribution over attachments of a given word to all other words.

    # Returns

    heads : `torch.Tensor`
        A tensor of shape (batch_size, sequence_length) representing the
        greedily decoded heads of each word.
    head_tags : `torch.Tensor`
        A tensor of shape (batch_size, sequence_length) representing the
        dependency tags of the greedily decoded heads of each word.
    """
    # Mask the diagonal, because the head of a word can't be itself.
    attended_arcs = attended_arcs + torch.diag(
        attended_arcs.new(mask.size(1)).fill_(-numpy.inf)
    )
    # Mask padded tokens, because we only want to consider actual words as heads.
    if mask is not None:
        minus_mask = ~mask.unsqueeze(2)
        attended_arcs.masked_fill_(minus_mask, -numpy.inf)

    # Compute the heads greedily.
    # shape (batch_size, sequence_length)
    _, heads = attended_arcs.max(dim=2)

    # Given the greedily predicted heads, decode their dependency tags.
    # shape (batch_size, sequence_length, num_head_tags)
    head_tag_logits = self._get_head_tags(
        head_tag_representation, child_tag_representation, heads
    )
    _, head_tags = head_tag_logits.max(dim=2)
    return heads, head_tags
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
            mask: torch.BoolTensor):
    """Multi-head scaled dot-product attention.

    Parameters
    ----------
    query : torch.Tensor
        [shape : (batch_size, query_len, hidden_dim)]
    key : torch.Tensor
        [shape : (batch_size, key_len, hidden_dim)]
    value : torch.Tensor
        [shape : (batch_size, key_len, hidden_dim)]
    mask : torch.BoolTensor
        [shape : (batch_size, query_len, key_len)]
    """
    batch_size = query.shape[0]

    Q, K, V = self.fc_query(query), self.fc_key(key), self.fc_value(value)
    # query = [batch size, query len, hid dim]
    # key = [batch size, key len, hid dim]
    # value = [batch size, key len, hid dim]

    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    # Q = [batch size, n heads, query len, head dim]
    # K = [batch size, n heads, key len, head dim]
    # V = [batch size, n heads, value len, head dim]

    energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
    # energy = [batch size, n heads, query len, key len]

    if mask is not None:
        # mask (batch_size, query_len, key_len); broadcast over the heads dimension.
        mask = mask.unsqueeze(1).expand(batch_size, self.n_heads, -1, -1)
        energy = energy.masked_fill(mask, float("-inf"))

    score = torch.softmax(energy, dim=-1)
    # score = [batch size, n heads, query len, key len]

    res = torch.matmul(self.dropout(score), V)
    # res = [batch size, n heads, query len, head dim]

    res = res.permute(0, 2, 1, 3).contiguous()
    # res = [batch size, query len, n heads, head dim]

    res = res.view(batch_size, -1, self.hidden_dim)
    # res = [batch size, query len, hidden dim]

    res = self.fc_output(res)
    # res = [batch size, query len, hidden dim]

    return res, score.mean(dim=1)  # score [batch_size, query len, key len]
def forward(self, tensor: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
    broadcast_mask = mask.unsqueeze(-1)
    num_elements = broadcast_mask.sum() * self.size
    mean = (tensor * broadcast_mask).sum() / num_elements
    masked_centered = (tensor - mean) * broadcast_mask
    std = torch.sqrt((masked_centered * masked_centered).sum() / num_elements + self.eps)
    return self.gamma * (tensor - mean) / (std + self.eps) + self.beta
def forward(self, features: torch.FloatTensor, mask: torch.BoolTensor,
            _lengths: torch.LongTensor):
    """
    Args:
        features: Input features, shape (batch_size, seq_len, feat_dim)
        mask: Input mask, shape (batch_size, seq_len)
        _lengths: Input lengths, unused, shape (batch_size)

    Returns:
        Pooled features and the per-head attention weights.
    """
    _batch_size, seq_len, input_dim = features.shape

    # apply first FCs, one for each head
    # features (batch, seq_len, d_input)
    # weight1 (num_heads, d_input, d_head)
    b1 = torch.matmul(features.unsqueeze(1), self.genpool_w1_head.unsqueeze(0))
    b1 += self.genpool_b1_head.unsqueeze(1).unsqueeze(0)
    # output (batch, num_heads, seq_len, d_head)

    # dropout + nonlinear activation
    b1 = self.activation(self.dropout1(b1))

    # apply second FCs, one for each head
    # weight2 (num_heads, d_head, d_head_output)
    b1 = torch.matmul(b1, self.genpool_w2_head.unsqueeze(0))
    b1 += self.genpool_b2_head.unsqueeze(1).unsqueeze(0)
    # output (batch, num_heads, seq_len, d_head_output)

    # dropout
    b1 = self.dropout2(b1)

    # set pre-softmax activations for masked sequence elements to -inf
    # mask shape (batch, seq_len)
    b1.masked_fill_(mask.unsqueeze(1).unsqueeze(-1), -INF)

    # now softmax individually per head over the sequence
    smweights = self.softmax(b1 / self.softmax_temp)
    # shape (batch, num_heads, seq_len, d_head_output)

    # drop attentions
    smweights = self.dropout3(smweights)

    # multiply input features with softmax weights for all heads
    smweights = smweights.transpose(1, 2).reshape(-1, seq_len, input_dim)
    # shape (batch, seq_len, input_dim)

    # use the attention weights to pool over the sequence and we are done
    pooled = (features * smweights).sum(dim=1)
    return pooled, smweights
def forward(self, tensor: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
    broadcast_mask = mask.unsqueeze(-1)
    num_elements = broadcast_mask.sum() * self.size
    mean = (tensor * broadcast_mask).sum() / num_elements
    masked_centered = (tensor - mean) * broadcast_mask
    std = torch.sqrt(
        (masked_centered * masked_centered).sum() / num_elements
        + util.tiny_value_of_dtype(tensor.dtype)
    )
    return (
        self.gamma * (tensor - mean) / (std + util.tiny_value_of_dtype(tensor.dtype))
        + self.beta
    )
def forward(self, tensors: List[torch.Tensor], mask: torch.BoolTensor = None) -> torch.Tensor:
    """
    Compute a weighted average of the `tensors`.  The input tensors can be any shape
    with at least two dimensions, but must all be the same shape.

    When `do_layer_norm=True`, the `mask` is required input.  If the `tensors` are
    dimensioned  `(dim_0, ..., dim_{n-1}, dim_n)`, then the `mask` is dimensioned
    `(dim_0, ..., dim_{n-1})`, as in the typical case with `tensors` of shape
    `(batch_size, timesteps, dim)` and `mask` of shape `(batch_size, timesteps)`.

    When `do_layer_norm=False` the `mask` is ignored.
    """
    if len(tensors) != self.mixture_size:
        raise ConfigurationError(
            "{} tensors were passed, but the module was initialized to "
            "mix {} tensors.".format(len(tensors), self.mixture_size)
        )

    def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
        tensor_masked = tensor * broadcast_mask
        mean = torch.sum(tensor_masked) / num_elements_not_masked
        variance = (
            torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / num_elements_not_masked
        )
        return (tensor - mean) / torch.sqrt(variance + util.tiny_value_of_dtype(variance.dtype))

    normed_weights = torch.nn.functional.softmax(
        torch.cat([parameter for parameter in self.scalar_parameters]), dim=0
    )
    normed_weights = torch.split(normed_weights, split_size_or_sections=1)

    if not self.do_layer_norm:
        pieces = []
        for weight, tensor in zip(normed_weights, tensors):
            pieces.append(weight * tensor)
        return self.gamma * sum(pieces)
    else:
        assert mask is not None
        broadcast_mask = mask.unsqueeze(-1)
        input_dim = tensors[0].size(-1)
        num_elements_not_masked = torch.sum(mask) * input_dim

        pieces = []
        for weight, tensor in zip(normed_weights, tensors):
            pieces.append(weight * _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked))
        return self.gamma * sum(pieces)
def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor = None):
    if self._dropout:
        tokens = self._dropout(tokens)

    # Our input has shape `(batch_size, num_tokens, embedding_dim)`; we pool out the
    # `num_tokens` dimension according to the configured pooling operation.
    if self._pool == 'max':
        if mask is not None:
            # Fill padded positions with a very negative value so they never win the max.
            tokens.masked_fill_(~mask.unsqueeze(-1), -19260817)
        ret = torch.max(tokens, dim=1)[0]
        return ret
    elif self._pool == 'sum':
        if mask is not None:
            tokens = tokens * mask.unsqueeze(-1)
        ret = tokens.sum(1)
        return ret
    elif self._pool == 'mean':
        if mask is not None:
            tokens = tokens * mask.unsqueeze(-1)
        summed = tokens.sum(1)
        if mask is not None:
            lengths = get_lengths_from_binary_sequence_mask(mask)
            length_mask = lengths > 0
            # Set any length 0 to 1, to avoid dividing by zero.
            lengths = torch.max(lengths, lengths.new_ones(1))
        else:
            lengths = tokens.new_full((1,), fill_value=tokens.size(1))
            length_mask = None
        summed = summed / lengths.unsqueeze(-1).float()
        if length_mask is not None:
            summed = summed * (length_mask > 0).unsqueeze(-1)
        return summed
    else:
        raise NotImplementedError
def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor:
    batch_size, seq_len, _ = inputs.size()
    state = torch.zeros(batch_size, self.hidden_dim, device=inputs.device)
    # L2 norm over all parameters, used to rescale the pre-activation.
    norm = torch.cat([param.flatten() for param in self.parameters()]).norm(p=2)
    states = []
    for time in range(seq_len):
        inp = inputs[:, time, :]
        preact = self.in_proj(inp) + self.hid_proj(state)
        state = torch.tanh(self.scale / norm * preact)
        states.append(state)
    output = torch.stack(states, dim=1)
    # The mask is optional; only apply it when given.
    if mask is not None:
        output = output * mask.unsqueeze(dim=-1)
    return output
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
            mask: torch.BoolTensor = None):
    """Single-head hard (argmax) attention.

    Parameters
    ----------
    query : torch.Tensor
        [shape : (batch_size, query_len, hidden_dim)]
    key : torch.Tensor
        [shape : (batch_size, key_len, hidden_dim)]
    value : torch.Tensor
        [shape : (batch_size, key_len, hidden_dim)]
    mask : torch.BoolTensor
        [shape : (batch_size, query_len, key_len)]
    """
    batch_size = query.shape[0]

    Q, K, V = self.fc_query(query), self.fc_key(key), self.fc_value(value)
    # query = [batch size, query len, hid dim]
    # key = [batch size, key len, hid dim]
    # value = [batch size, key len, hid dim]

    K = K.permute(0, 2, 1)
    # Q = [batch size, query len, hidden dim]
    # K = [batch size, hidden dim, key len]
    # V = [batch size, key len, hidden dim]

    energy = torch.matmul(Q, K) / self.scale
    # energy = [batch size, query len, key len]

    if mask is not None:
        # mask already has shape (batch_size, query_len, key_len), matching `energy`,
        # so it can be applied directly.
        energy = energy.masked_fill(mask, float("-inf"))

    score = _hardmax(energy)
    # [batch size, query len, key len]

    res = torch.matmul(score, V)
    # res = [batch size, query len, hidden dim]

    res = self.fc_output(res)
    # res = [batch size, query len, hidden dim]

    return res, score
def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor:
    """
    # Parameters

    inputs : `torch.Tensor`, required.
        A tensor of shape (batch_size, timesteps, input_dim)
    mask : `torch.BoolTensor`, optional (default = None).
        A tensor of shape (batch_size, timesteps).

    # Returns

    A tensor of shape (batch_size, timesteps, output_dim).
    """
    if mask is None:
        return self._feedforward(inputs)
    else:
        outputs = self._feedforward(inputs)
        return outputs * mask.unsqueeze(dim=-1)
def masked_softmax(
    vector: torch.Tensor,
    mask: torch.BoolTensor,
    dim: int = -1,
    memory_efficient: bool = False,
) -> torch.Tensor:
    if mask is None:
        result = torch.nn.functional.softmax(vector, dim=dim)
    else:
        while mask.dim() < vector.dim():
            mask = mask.unsqueeze(1)
        if not memory_efficient:
            result = torch.nn.functional.softmax(vector * mask, dim=dim)
            result = result * mask
            result = result / (
                result.sum(dim=dim, keepdim=True) + tiny_value_of_dtype(result.dtype)
            )
        else:
            masked_vector = vector.masked_fill(~mask, min_value_of_dtype(vector.dtype))
            result = torch.nn.functional.softmax(masked_vector, dim=dim)
    return result
def _greedy_decode(
    arc_scores: torch.Tensor,
    arc_tag_logits: torch.Tensor,
    mask: torch.BoolTensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Decodes the head and head tag predictions by decoding the unlabeled arcs
    independently for each word and then again, predicting the head tags of
    these greedily chosen arcs independently.

    # Parameters

    arc_scores : `torch.Tensor`, required.
        A tensor of shape (batch_size, sequence_length, sequence_length) used to generate
        a distribution over attachments of a given word to all other words.
    arc_tag_logits : `torch.Tensor`, required.
        A tensor of shape (batch_size, sequence_length, sequence_length, num_tags) used to
        generate a distribution over tags for each arc.
    mask : `torch.BoolTensor`, required.
        A mask of shape (batch_size, sequence_length).

    # Returns

    arc_probs : `torch.Tensor`
        A tensor of shape (batch_size, sequence_length, sequence_length) representing the
        probability of an arc being present for this edge.
    arc_tag_probs : `torch.Tensor`
        A tensor of shape (batch_size, sequence_length, sequence_length, num_tags)
        representing the distribution over edge tags for a given edge.
    """
    # Mask the diagonal, because we don't want self edges.
    inf_diagonal_mask = torch.diag(arc_scores.new(mask.size(1)).fill_(-numpy.inf))
    arc_scores = arc_scores + inf_diagonal_mask
    # shape (batch_size, sequence_length, sequence_length, num_tags)
    arc_tag_logits = arc_tag_logits + inf_diagonal_mask.unsqueeze(0).unsqueeze(-1)
    # Mask padded tokens, because we only want to consider actual word -> word edges.
    minus_mask = ~mask.unsqueeze(2)
    arc_scores.masked_fill_(minus_mask, -numpy.inf)
    arc_tag_logits.masked_fill_(minus_mask.unsqueeze(-1), -numpy.inf)
    # shape (batch_size, sequence_length, sequence_length)
    arc_probs = arc_scores.sigmoid()
    # shape (batch_size, sequence_length, sequence_length, num_tags)
    arc_tag_probs = torch.nn.functional.softmax(arc_tag_logits, dim=-1)
    return arc_probs, arc_tag_probs
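# Hypothetical usage sketch for _greedy_decode above (assumes it is callable as a
# module-level / static function and that `torch` and `numpy` are imported; the values
# are random, only the shapes matter):
batch_size, seq_len, num_tags = 2, 5, 4
arc_scores = torch.randn(batch_size, seq_len, seq_len)
arc_tag_logits = torch.randn(batch_size, seq_len, seq_len, num_tags)
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)
arc_probs, arc_tag_probs = _greedy_decode(arc_scores, arc_tag_logits, mask)
# arc_probs: (2, 5, 5) independent edge probabilities; the -inf diagonal makes
# self-edges come out with probability 0 after the sigmoid.
# arc_tag_probs: (2, 5, 5, 4) per-edge distribution over tags.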
def forward(
    self,
    sequence_tensor: torch.FloatTensor,
    span_indices: torch.LongTensor,
    span_indices_mask: torch.BoolTensor = None,
) -> torch.FloatTensor:
    # shape (batch_size, sequence_length, 1)
    global_attention_logits = torch.matmul(
        sequence_tensor,
        torch.zeros(self.input_dim, 1).to(sequence_tensor.device),
    )

    # shape (batch_size, sequence_length, embedding_dim + 1)
    concat_tensor = torch.cat([sequence_tensor, global_attention_logits], -1)

    concat_output, span_mask = util.batched_span_select(concat_tensor, span_indices)

    # Shape: (batch_size, num_spans, max_batch_span_width, embedding_dim)
    span_embeddings = concat_output[:, :, :, :-1]
    # Shape: (batch_size, num_spans, max_batch_span_width)
    span_attention_logits = concat_output[:, :, :, -1]

    # Shape: (batch_size, num_spans, max_batch_span_width)
    span_attention_weights = util.masked_softmax(span_attention_logits, span_mask)

    # Do a weighted sum of the embedded spans with
    # respect to the normalised attention distributions.
    # Shape: (batch_size, num_spans, embedding_dim)
    attended_text_embeddings = util.weighted_sum(span_embeddings, span_attention_weights)

    if span_indices_mask is not None:
        # Above we were masking the widths of spans with respect to the max
        # span width in the batch. Here we are masking the spans which were
        # originally passed in as padding.
        return attended_text_embeddings * span_indices_mask.unsqueeze(-1)

    return attended_text_embeddings
def forward(
    self,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    mask: torch.BoolTensor = None,
) -> torch.Tensor:
    """
    Args:
        q (torch.Tensor): [batch size, len_q, d_model]
        k (torch.Tensor): [batch size, len_k, d_model]
        v (torch.Tensor): [batch size, len_v, d_model]
        mask (torch.BoolTensor, optional): [batch size, len_q, len_k]. Defaults to None.

    Returns:
        torch.Tensor: [batch size, len_q, d_v * n_head]
    """
    batch_size, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
    residual = q

    qs = self.w_qs(q).view(batch_size, len_q, self.n_head, self.d_k)
    ks = self.w_ks(k).view(batch_size, len_k, self.n_head, self.d_k)
    vs = self.w_vs(v).view(batch_size, len_v, self.n_head, self.d_v)

    if mask is not None:
        # Broadcast the mask over the heads dimension.
        mask = mask.unsqueeze(1).repeat(1, self.n_head, 1, 1)

    # output = [batch size, n_head, len_q, d_v]
    output, _ = self.attn(
        qs.transpose(1, 2),
        ks.transpose(1, 2),
        vs.transpose(1, 2),
        mask=mask,
    )

    # [batch size, len_q, n_head * d_v]
    output = output.transpose(1, 2).reshape(batch_size, len_q, -1)
    output = self.layer_norm(output + residual)
    return output
def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor:
    """
    # Parameters

    inputs : `torch.Tensor`, required.
        A tensor of shape (batch_size, timesteps, input_dim)
    mask : `torch.BoolTensor`, optional (default = `None`).
        A tensor of shape (batch_size, timesteps).

    # Returns

    A tensor of shape (batch_size, timesteps, output_dim),
    where output_dim = input_dim.
    """
    if mask is None:
        return inputs
    else:
        # We should mask out the output instead of the input.
        # But here, output = input, so we directly mask out the input.
        return inputs * mask.unsqueeze(dim=-1)
def forward(self, token_embeddings: torch.Tensor, mask: torch.BoolTensor):
    # Convolutions need transposed input: (batch_size, embedding_dim, timesteps).
    transposed_embeddings = torch.transpose(token_embeddings, 1, 2)
    # Broadcast the inverted mask to the feature dimension for masked_fill.
    mask_for_fill = ~mask.unsqueeze(1)

    if self._return_all_layers:
        # outputs will be [[all forward layers], [all backward layers]]
        layer_outputs: List[List[torch.Tensor]] = [[], []]
    else:
        # outputs will be [forward final layer, backward final layer]
        outputs: List[torch.Tensor] = []

    for k, blocks in enumerate([self._forward_residual_blocks, self._backward_residual_blocks]):
        out = transposed_embeddings
        for block in blocks:
            out = block(out.masked_fill(mask_for_fill, 0.0))
            if self._return_all_layers:
                layer_outputs[k].append(out)
        if not self._return_all_layers:
            outputs.append(out)

    if self._return_all_layers:
        return [
            torch.cat([fwd, bwd], dim=1).transpose(1, 2)
            for fwd, bwd in zip(*layer_outputs)
        ]
    # Concatenate forward and backward, then transpose back to (batch, timesteps, dim).
    return torch.cat(outputs, dim=1).transpose(1, 2)
def get_attention_masks(self, mask: torch.BoolTensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Returns 2 masks of shape (batch_size, timesteps, timesteps) representing
    1) non-padded elements, and
    2) elements of the sequence which are permitted to be involved in attention
       at a given timestep.
    """
    device = mask.device
    # Forward case:
    timesteps = mask.size(1)
    # Shape (1, timesteps, timesteps)
    subsequent = subsequent_mask(timesteps, device)
    # Broadcasted logical and - we want zero
    # elements where either we have padding from the mask,
    # or we aren't allowed to use the timesteps.
    # Shape (batch_size, timesteps, timesteps)
    forward_mask = mask.unsqueeze(-1) & subsequent
    # Backward case - exactly the same, but transposed.
    backward_mask = forward_mask.transpose(1, 2)
    return forward_mask, backward_mask
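# `subsequent_mask` above is assumed to come from elsewhere in the codebase.  A minimal
# stand-in consistent with how it is used here (each timestep may attend to itself and to
# earlier timesteps), returning shape (1, timesteps, timesteps):
def subsequent_mask(timesteps: int, device: str = "cpu") -> torch.BoolTensor:
    return torch.tril(
        torch.ones(timesteps, timesteps, dtype=torch.bool, device=device)
    ).unsqueeze(0)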
def masked_softmax(
    vector: torch.Tensor,
    mask: torch.BoolTensor,
    dim: int = -1,
    memory_efficient: bool = False,
) -> torch.Tensor:
    """
    `torch.nn.functional.softmax(vector)` does not work if some elements of `vector` should be
    masked.  This performs a softmax on just the non-masked portions of `vector`.  Passing `None`
    in for the mask is also acceptable; you'll just get a regular softmax.

    `vector` can have an arbitrary number of dimensions; the only requirement is that `mask` is
    broadcastable to `vector's` shape.  If `mask` has fewer dimensions than `vector`, we will
    unsqueeze on dimension 1 until they match.  If you need a different unsqueezing of your mask,
    do it yourself before passing the mask into this function.

    If `memory_efficient` is set to true, we will simply use a very large negative number for those
    masked positions so that the probabilities of those positions would be approximately 0.
    This is not accurate in math, but works for most cases and consumes less memory.

    In the case that the input vector is completely masked and `memory_efficient` is false, this
    function returns an array of `0.0`. This behavior may cause `NaN` if this is used as the last
    layer of a model that uses categorical cross-entropy loss. Instead, if `memory_efficient` is
    true, this function will treat every element as equal, and do softmax over equal numbers.
    """
    if mask is None:
        result = torch.nn.functional.softmax(vector, dim=dim)
    else:
        while mask.dim() < vector.dim():
            mask = mask.unsqueeze(1)
        if not memory_efficient:
            # To limit numerical errors from large vector elements outside the mask, we zero these out.
            result = torch.nn.functional.softmax(vector * mask, dim=dim)
            result = result * mask
            result = result / (
                result.sum(dim=dim, keepdim=True) + tiny_value_of_dtype(result.dtype)
            )
        else:
            masked_vector = vector.masked_fill(~mask, min_value_of_dtype(vector.dtype))
            result = torch.nn.functional.softmax(masked_vector, dim=dim)
    return result
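# Minimal sketch of the two fully-masked behaviours described in the docstring (assumes
# masked_softmax above plus its tiny_value_of_dtype / min_value_of_dtype helpers are in scope):
logits = torch.tensor([[1.0, 2.0, 3.0]])
empty_mask = torch.tensor([[False, False, False]])
masked_softmax(logits, empty_mask)                           # -> tensor([[0., 0., 0.]])
masked_softmax(logits, empty_mask, memory_efficient=True)    # -> roughly uniform [1/3, 1/3, 1/3]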
def forward(self, key: torch.Tensor, mask: torch.BoolTensor = None):
    """Multi-head attention scores over the keys, using a single learned query.

    Parameters
    ----------
    key : torch.Tensor
        [shape : (batch_size, key_len, hidden_dim)]
    mask : torch.BoolTensor
        [shape : (batch_size, query_len, key_len)]
    """
    batch_size = key.shape[0]
    # The query is a learned parameter, expanded to the batch size (query_len == 1).
    query = self.query.expand((batch_size, 1, self.hidden_dim))

    Q, K = self.fc_query(query), self.fc_key(key)
    # query = [batch size, query len, hid dim]
    # key = [batch size, key len, hid dim]

    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    # Q = [batch size, n heads, query len, head dim]
    K = K.view(batch_size, -1, self.n_heads, self.head_dim)
    # K = [batch size, key len, n heads, head dim]

    energy = torch.matmul(Q, K.permute(0, 2, 3, 1)) / self.scale
    # energy = [batch size, n heads, query len, key len]

    if mask is not None:
        # mask (batch_size, query_len, key_len); broadcast over the heads dimension.
        mask = mask.unsqueeze(1).expand(batch_size, self.n_heads, -1, -1)
        energy = energy.masked_fill(mask, float("-inf"))

    score = torch.softmax(energy, dim=-1)
    # score = [batch size, n heads, query len, key len]

    score = score.mean(dim=1)
    return score
def forward(self, token_embeddings: torch.Tensor, mask: torch.BoolTensor):
    # Convolutions need transposed input
    transposed_embeddings = torch.transpose(token_embeddings, 1, 2)
    # We need to broadcast the mask to feature dimension,
    # and to use masked_fill_ we need the inverse of the mask.
    mask_for_fill = ~mask.unsqueeze(1)

    if self._return_all_layers:
        # outputs will be [[all forward layers], [all backward layers]]
        layer_outputs: List[List[torch.Tensor]] = [[], []]
    else:
        # outputs will be [forward final layer, backward final layer]
        outputs: List[torch.Tensor] = []

    for k, blocks in enumerate([self._forward_residual_blocks, self._backward_residual_blocks]):
        out = transposed_embeddings
        # Due to zero padding for backward sequences, we need
        # to ensure that the input has zeros everywhere where
        # there isn't a mask.
        for block in blocks:
            out = block(out.masked_fill(mask_for_fill, 0.0))
            if self._return_all_layers:
                layer_outputs[k].append(out)
        if not self._return_all_layers:
            outputs.append(out)

    if self._return_all_layers:
        return [
            torch.cat([fwd, bwd], dim=1).transpose(1, 2)
            for fwd, bwd in zip(*layer_outputs)
        ]
    else:
        # Concatenate forward and backward, then transpose back
        return torch.cat(outputs, dim=1).transpose(1, 2)
def _calculate_edit_distance(self, output_symbols: torch.LongTensor,
                             targets: torch.LongTensor,
                             mask: torch.BoolTensor) -> torch.FloatTensor:
    batch_size, max_pred_len = output_symbols.size()
    _, max_len = targets.size()

    # Batched Levenshtein-style dynamic program with configurable substitution,
    # insertion and deletion costs (self.dsub, self.dins, self.ddel).
    distances = output_symbols.new_zeros(batch_size, max_pred_len, max_len)
    distances[:, :, 0] = torch.arange(max_pred_len)
    distances[:, 0, :] = torch.arange(max_len)
    distances = distances.float()

    for i in range(1, max_pred_len):
        for j in range(1, max_len):
            diagonal = distances[:, i - 1, j - 1] + \
                self.dsub * (output_symbols[:, i - 1] != targets[:, j - 1]).float()
            comp = torch.stack(
                (diagonal,
                 distances[:, i - 1, j] + self.dins,
                 distances[:, i, j - 1] + self.ddel),
                dim=-1)
            distances[:, i, j], _ = torch.min(comp, dim=-1)

    # Target positions beyond the mask are set to infinity so they are never selected.
    distances = distances.masked_fill(~mask.unsqueeze(1), float('inf'))
    return distances