def forward(self,
            sequence_tensor: torch.FloatTensor,
            span_indices: torch.LongTensor,
            sequence_mask: torch.LongTensor = None,
            span_indices_mask: torch.LongTensor = None):
    """
    Represent each span by the vectors found at its two endpoints.

    The last dimension of ``span_indices`` is split into a start and an end
    index. The end index is treated as exclusive, so the right endpoint is
    read from ``end - 1`` (floored at 0). When a width embedding is
    configured, an embedding of ``end - start`` (optionally bucketed) is
    appended to the endpoint vectors.

    Parameters
    ----------
    sequence_tensor : ``torch.FloatTensor``
        The sequence the spans index into.
    span_indices : ``torch.LongTensor``
        Start/end pairs stored along the last dimension.
    sequence_mask : ``torch.LongTensor``, optional
        Accepted for interface compatibility; not read by this method.
    span_indices_mask : ``torch.LongTensor``, optional
        When given, masked spans have their indices zeroed before the lookup
        and their output rows zeroed afterwards.

    Returns
    -------
    The concatenation ``[start ; end ; (width embedding)]`` along the last
    dimension.
    """
    starts_column, ends_column = span_indices.split(1, dim=-1)
    span_starts = starts_column.squeeze(-1)
    span_ends = ends_column.squeeze(-1)

    has_span_mask = span_indices_mask is not None
    if has_span_mask:
        # Point padded spans at position 0 so the lookups below stay in range.
        integer_mask = span_indices_mask.long()
        span_starts = span_starts * integer_mask
        span_ends = span_ends * integer_mask

    # Spans are exclusive on the right, so step back one position; the relu
    # keeps the index at 0 for spans whose end is already 0.
    last_positions = torch.relu((span_ends - 1).float()).long()
    pieces = [
        utils.batched_index_select(sequence_tensor, span_starts),
        utils.batched_index_select(sequence_tensor, last_positions),
    ]

    if self._span_width_embedding is not None:
        # Embed the span widths and append them to the endpoint vectors.
        widths = span_ends - span_starts
        if self._bucket_widths:
            widths = utils.bucket_values(widths,
                                         num_total_buckets=self._num_width_embeddings)
        pieces.append(self._span_width_embedding(widths))

    combined = torch.cat(pieces, dim=-1)
    if not has_span_mask:
        return combined
    return combined * span_indices_mask.unsqueeze(-1).float()
def forward(  # type: ignore
        self,
        embedded_text: "TextFieldTensors",
        gold_labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    """
    Classify already-embedded text and optionally compute the loss.

    # Parameters

    embedded_text : `TextFieldTensors`
        Passed unchanged to `self._classification_layer`.
        NOTE(review): despite the annotation, this is fed straight into the
        classification layer, so it is presumably a tensor - confirm.
    gold_labels : `torch.LongTensor`, optional (default = `None`)
        Gold labels. They are flattened with `view(-1)` for the loss and
        passed unflattened to every metric. When omitted, neither the loss
        nor the metrics are touched.

    # Returns

    An output dictionary consisting of:

        - `logits` (`torch.FloatTensor`) :
            The output of the classification layer.
        - `class_probabilities` (`torch.FloatTensor`) :
            Softmax of `logits` over the last dimension.
        - `loss` : (`torch.FloatTensor`, optional) :
            `self._loss(logits, labels) * self.loss_weight`; present only
            when `gold_labels` is given.
    """
    logits = self._classification_layer(embedded_text)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    output_dict = {"logits": logits, "class_probabilities": probs}

    # `is not None` (rather than `!= None`) together with a `None` default:
    # the previous `[]` default passed the `!=` test and then crashed on
    # `[].long()` whenever the caller supplied no labels.
    if gold_labels is not None:
        flat_labels = gold_labels.long().view(-1)
        output_dict["loss"] = self._loss(logits, flat_labels) * self.loss_weight
        for metric in self.metrics.values():
            metric(logits, gold_labels)
    return output_dict
def _prepare_decode_step_input( self, input_indices: torch.LongTensor, decoder_hidden_state: torch.LongTensor = None, encoder_outputs: torch.LongTensor = None, encoder_outputs_mask: torch.LongTensor = None, ) -> torch.LongTensor: """ Given the input indices for the current timestep of the decoder, and all the encoder outputs, compute the input at the current timestep. Note: This method is agnostic to whether the indices are gold indices or the predictions made by the decoder at the last timestep. If we're not using attention, the output of this method is just an embedding of the input indices. If we are, the output will be a concatentation of the embedding and an attended average of the encoder inputs. Parameters ---------- input_indices : torch.LongTensor Indices of either the gold inputs to the decoder or the predicted labels from the previous timestep. decoder_hidden_state : torch.LongTensor, optional (not needed if no attention) Output of from the decoder at the last time step. Needed only if using attention. encoder_outputs : torch.LongTensor, optional (not needed if no attention) Encoder outputs from all time steps. Needed only if using attention. encoder_outputs_mask : torch.LongTensor, optional (not needed if no attention) Masks on encoder outputs. Needed only if using attention. """ input_indices = input_indices.long() # input_indices : (batch_size,) since we are processing these one timestep at a time. # (batch_size, target_embedding_dim) embedded_input = self._target_embedder(input_indices) if self._decoder_attention is not None: # encoder_outputs : (batch_size, input_sequence_length, encoder_output_dim) # Ensuring mask is also a FloatTensor. Or else the multiplication within attention will # complain. 
# important - need to use zero-masking instead of -inf for attention # I've checked that doing this doesn't significantly increase time # per batch, but should consider only doing once encoder_outputs.data.masked_fill_( 1 - encoder_outputs_mask.byte().data, 0.0) encoder_outputs = 0.5 * encoder_outputs encoder_outputs_mask = encoder_outputs_mask.float() encoder_outputs_mask = encoder_outputs_mask[:, :, 0] # (batch_size, input_sequence_length) attention_input = torch.cat((decoder_hidden_state, embedded_input), 1) input_weights = self._decoder_attention(attention_input, encoder_outputs, encoder_outputs_mask) # (batch_size, input_dim) attended_input = weighted_sum(encoder_outputs, input_weights) # (batch_size, input_dim + target_embedding_dim) return torch.cat((attended_input, embedded_input), -1) else: return embedded_input
def calculate_instance_loss(self,
                            predictions: torch.FloatTensor,
                            targets: torch.LongTensor,
                            mode: str,
                            as_numpy: bool = False) -> dict:
    """Calculate the loss for a batch using the criterion configured for `mode`

    The criterion is created from ``self.model_config['loss'][mode]`` (its
    ``name`` and ``params`` entries) on every call. Whether the result holds
    one value per instance depends on how that criterion is configured
    (presumably without reduction - confirm against the config).

    :param predictions: Predictions (Predicted)
    :type predictions: torch.FloatTensor
    :param targets: Targets (Ground Truth)
    :type targets: torch.LongTensor
    :param mode: train/val/test
    :type mode: str
    :param as_numpy: flag to decide whether to return losses as np.ndarray
    :type as_numpy: bool
    :return: dict with a single key ``'loss'`` holding the criterion output
    """
    loss_config = self.model_config.get('loss')[mode]
    criterion = loss_factory.create(loss_config['name'],
                                    **loss_config['params'])

    # correct data type to handle mismatch between
    # CrossEntropyLoss and BCEWithLogitsLoss
    if loss_config['name'] == 'cross-entropy':
        targets = targets.long()

    loss = criterion(predictions, targets)

    if as_numpy:
        # Detach first: Tensor.numpy() raises on tensors that require grad,
        # which is the case whenever `predictions` is part of a graph.
        loss = loss.detach().cpu().numpy()

    return {'loss': loss}
def forward(self,
            images: torch.Tensor,
            objects: torch.LongTensor,
            segms: torch.Tensor,
            boxes: torch.Tensor,
            box_mask: torch.LongTensor,
            question: Dict[str, torch.Tensor],
            question_tags: torch.LongTensor,
            question_mask: torch.LongTensor,
            answers: Dict[str, torch.Tensor],
            answer_tags: torch.LongTensor,
            answer_mask: torch.LongTensor,
            metadata: List[Dict[str, Any]] = None,
            label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    """
    Run the trunk, score its pooled representation and optionally compute the loss.

    :param images: [batch_size, 3, im_height, im_width]
    :param objects: [batch_size, max_num_objects] Padded objects
    :param segms: forwarded to the trunk unchanged
    :param boxes: [batch_size, max_num_objects, 4] Padded boxes
    :param box_mask: [batch_size, max_num_objects] Mask for whether or not each box is OK
    :param question: AllenNLP representation of the question. [batch_size, num_answers, seq_length]
    :param question_tags: A detection label for each item in the Q [batch_size, num_answers, seq_length]
    :param question_mask: Mask for the Q [batch_size, num_answers, seq_length]
    :param answers: AllenNLP representation of the answer. [batch_size, num_answers, seq_length]
    :param answer_tags: A detection label for each item in the A [batch_size, num_answers, seq_length]
    :param answer_mask: Mask for the As [batch_size, num_answers, seq_length]
    :param metadata: Ignored here; it is about which dataset item we're on
    :param label: Optional, which item is valid
    :return: dict with ``label_logits``, ``label_probs`` and
        ``cnn_regularization_loss``, plus ``loss`` (with a leading dimension of
        size 1) when ``label`` is given
    """
    trunk_inputs = (images, objects, segms, boxes, box_mask,
                    question, question_tags, question_mask,
                    answers, answer_tags, answer_mask)
    features = self.trunk.forward(*trunk_inputs)

    # The MLP output carries a trailing singleton at dim 2, removed here.
    logits = self.final_mlp(features['pooled_rep']).squeeze(2)

    output_dict = {
        'label_logits': logits,
        'label_probs': F.softmax(logits, dim=-1),
        'cnn_regularization_loss': features['cnn_regularization_loss'],
        # Uncomment to visualize attention, if you want
        # 'qa_attention_weights': features['qa_attention_weights'],
        # 'atoo_attention_weights': features['atoo_attention_weights'],
    }
    if label is None:
        return output_dict

    loss = self._loss(logits, label.long().view(-1))
    self._accuracy(logits, label)
    # Give the scalar loss a leading dimension.
    output_dict["loss"] = loss[None]
    return output_dict
def sequence_ctc_loss_with_logits(
        logits: torch.FloatTensor,
        logit_mask: Union[torch.FloatTensor, torch.BoolTensor],
        targets: torch.LongTensor,
        target_mask: Union[torch.FloatTensor, torch.BoolTensor],
        blank_index: torch.LongTensor
) -> torch.FloatTensor:
    """
    Compute the mean CTC loss of ``logits`` against ``targets``.

    Sequence lengths are obtained by counting the non-zero entries of each
    mask along dimension 1. ``logits`` are log-softmaxed over the last
    dimension and transposed to time-major order before being handed to
    ``F.ctc_loss``.

    Parameters
    ----------
    logits : unnormalised scores, batch-major (dims 0 and 1 are swapped
        internally to get the ``(T, batch_size, n_class)`` layout).
    logit_mask : mask over the positions of ``logits``.
    targets : target label indices (cast to long).
    target_mask : mask over the positions of ``targets``.
    blank_index : forwarded as the ``blank`` argument of ``F.ctc_loss``.

    Returns
    -------
    The CTC loss with ``reduction='mean'``.

    Raises
    ------
    ValueError
        If any sequence has fewer unmasked logit positions than unmasked
        target positions, since no alignment exists in that case.
    """
    # lengths : (batch_size, )
    # calculated by counting the unmasked positions
    logit_lengths = logit_mask.bool().long().sum(1)
    target_lengths = target_mask.bool().long().sum(1)

    # Validate before spending time on the loss, and say what went wrong in
    # the exception itself instead of printing and raising a bare Exception.
    if (logit_lengths < target_lengths).any():
        raise ValueError(
            "The length of predicted alignment is shorter than target length, "
            "increase upsample factor.")

    # log_logits : (T, batch_size, n_class), this kind of shape is required for ctc_loss
    log_logits = logits.log_softmax(-1).transpose(0, 1)
    targets = targets.long()

    return F.ctc_loss(log_logits,
                      targets,
                      logit_lengths,
                      target_lengths,
                      blank=blank_index,
                      reduction='mean')
def forward(
        self,
        tokens: Dict[str, torch.LongTensor],
        input_mask: torch.LongTensor,
        tags: torch.LongTensor = None,
        labels: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None,
        # pylint: disable=unused-argument
        **kwargs) -> Dict[str, torch.Tensor]:
    """
    Jointly predict slot tags (CRF over all positions but the first) and an
    intent (from the first position).

    The vector at position 0 of the embedder output feeds the intent head;
    positions 1 onwards feed the tag head, with ``input_mask`` and ``tags``
    sliced the same way.

    Returns a dict with ``tag_logits``, ``mask`` (the unsliced input mask),
    ``tags`` (Viterbi paths) and ``intent_probs``. Depending on which gold
    inputs are given it also holds ``slot_loss``, ``intents_loss`` and their
    combination ``loss``; ``words`` is copied from ``metadata`` when present.
    """
    transformed_tokens = self._text_field_embedder(tokens)
    leading_vector = transformed_tokens[:, 0, :]
    remaining_vectors = transformed_tokens[:, 1:, :]

    pooled_output = self.dropout(torch.tanh(self._feedforward(leading_vector)))
    tag_logits = self._tag_feedforward(remaining_vectors)
    mask = input_mask[:, 1:].long()

    # Keep only the tag sequences; the Viterbi scores are not needed.
    predicted_tags = [path for path, _score in self.crf.viterbi_tags(tag_logits, mask)]

    intent_logits = self._intent_feedforward(pooled_output)
    output = {
        'tag_logits': tag_logits,
        'mask': input_mask,
        'tags': predicted_tags,
        'intent_probs': torch.nn.functional.softmax(intent_logits, dim=-1),
    }

    if tags is not None:
        # The CRF returns a log-likelihood; its negation is the loss.
        tags = tags[:, 1:]
        output["slot_loss"] = -self.crf(tag_logits, tags, mask)

        # Turn the Viterbi tags into one-hot "class probabilities" so that
        # they can be fed into the span metric.
        one_hot_tags = tag_logits * 0.
        for row, tag_sequence in enumerate(predicted_tags):
            for column, tag_id in enumerate(tag_sequence):
                one_hot_tags[row, column, tag_id] = 1
        if self.calculate_span_f1:
            self._f1_metric(one_hot_tags, tags, mask.float())

    if labels is not None:
        output["intents_loss"] = self._intent_loss(intent_logits,
                                                   labels.long().view(-1))
        self._intent_accuracy(intent_logits, labels)
        self._intent_accuracy_3(intent_logits, labels)

    if metadata is not None:
        output["words"] = [entry["words"] for entry in metadata]

    # The total loss is whatever subset of the two partial losses exists.
    has_slot_loss = 'slot_loss' in output
    has_intent_loss = 'intents_loss' in output
    if has_slot_loss and has_intent_loss:
        output["loss"] = output["slot_loss"] + output["intents_loss"]
    elif has_slot_loss:
        output["loss"] = output["slot_loss"]
    elif has_intent_loss:
        output["loss"] = output["intents_loss"]
    return output
def column_gather(y_out: torch.FloatTensor, x_lengths: torch.LongTensor) -> torch.FloatTensor:
    """
    Pick, for every batch row, the entry of ``y_out`` at position ``length - 1``.

    Parameters
    ----------
    y_out : tensor indexed as ``y_out[batch_index, position, ...]``.
    x_lengths : one length per batch row. A length of 0 yields index -1,
        i.e. the last position, following normal indexing rules.

    Returns
    -------
    ``y_out[b, x_lengths[b] - 1]`` for every ``b``, stacked along a new
    leading dimension. An empty ``x_lengths`` yields an empty result rather
    than an error.
    """
    # One advanced-indexing call replaces the per-row Python loop and the
    # round trip through NumPy on the host.
    last_positions = (x_lengths.long().detach() - 1).to(y_out.device)
    num_rows = last_positions.size(0)
    # Shape the row indices as (num_rows, 1, ..., 1) so they broadcast
    # against `last_positions` whatever its rank.
    rows = torch.arange(num_rows, device=y_out.device).reshape(
        [num_rows] + [1] * (last_positions.dim() - 1))
    return y_out[rows, last_positions]
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
    """
    Look up embeddings for positions derived from `attention_mask`.

    Positions are the running count of attended tokens along dim 1, minus
    one; masked entries therefore come out as -1 before `self.offset` is
    added. The first `past_key_values_length` columns are dropped.
    `attention_mask` is presumably [bsz x seqlen] - confirm with the caller.
    """
    mask = attention_mask.long()

    # create positions depending on attention_mask
    running_count = torch.cumsum(mask, dim=1).type_as(mask)
    positions = (running_count * mask).long() - 1

    # cut positions if `past_key_values_length` is > 0
    positions = positions[:, past_key_values_length:]

    shifted_positions = positions + self.offset
    return super().forward(shifted_positions)
def forward(self,
            text: Dict[str, torch.LongTensor],
            predicate_indicator: torch.LongTensor,
            labeled_spans: torch.LongTensor,
            **kwargs):
    """
    Score every question slot for each labeled span and, when gold slot
    labels are supplied, accumulate the per-slot losses.

    Gold labels are read from keyword arguments named ``span_slot_<name>``
    for each name in ``self.slot_labels``; arguments that are absent or
    ``None`` are skipped.

    Returns a dict with ``slot_logits_<name>`` for each slot, ``span_mask``,
    and ``loss`` when at least one gold slot label was given.
    """
    # A span is real (not padding) when its first index is non-negative.
    span_mask = (labeled_spans[:, :, 0] >= 0).long()

    span_slot_labels = []
    for slot_name in self.slot_labels:
        gold = kwargs.get('span_slot_%s' % slot_name)
        if gold is not None:
            span_slot_labels.append(gold * span_mask)
    if not span_slot_labels:
        span_slot_labels = None

    embedded_text_input = self.embedding_dropout(self.text_field_embedder(text))
    mask = get_text_field_mask(text)
    indicator_embedding = self.predicate_feature_embedding(predicate_indicator.long())
    encoder_input = torch.cat([embedded_text_input, indicator_embedding], -1)

    # Unpacking also checks that the encoder input has exactly three dimensions.
    _, _, encoder_input_dim = encoder_input.size()
    if self.stacked_encoder.get_input_dim() != encoder_input_dim:
        raise ConfigurationError("The SRL model uses an indicator feature, which makes "
                                 "the embedding dimension one larger than the value "
                                 "specified. Therefore, the 'input_dim' of the stacked_encoder "
                                 "must be equal to total_embedding_dim + 1.")

    encoded_text = self.stacked_encoder(encoder_input, mask)
    span_reps = self.span_extractor(encoded_text, labeled_spans,
                                    sequence_mask=mask, span_indices_mask=span_mask)
    slot_logits = self.question_generator(span_reps, slot_labels=span_slot_labels)

    output_dict = {}
    for position, slot_name in enumerate(self.slot_labels):
        # Push down the scores of the first two classes (padding and unk).
        slot_logits[position][:, :, 0:2] -= 9999999
        output_dict["slot_logits_%s" % slot_name] = slot_logits[position]

    if span_slot_labels is not None:
        loss = None
        for position, _ in enumerate(self.slot_labels):
            slot_loss = sequence_cross_entropy_with_logits(slot_logits[position],
                                                           span_slot_labels[position],
                                                           span_mask.float())
            if loss is None:
                loss = slot_loss
            else:
                loss += slot_loss
        self.question_metric(slot_logits, span_slot_labels, labeled_spans,
                             mask=span_mask, sequence_mask=mask)
        output_dict["loss"] = loss

    output_dict['span_mask'] = span_mask
    return output_dict
def forward(
        self,  # type: ignore
        tokens: Dict[str, torch.LongTensor],
        label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Embed, encode and project ``tokens``, then normalise the scores into class
    probabilities; compute loss and accuracy when ``label`` is given.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    label : torch.LongTensor, optional (default = None)
        Gold class labels. They are flattened with ``view(-1)`` for the loss and squeezed on
        the last dimension for the accuracy metric.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        The output of ``self.projection_layer``, i.e. unnormalised scores.
        NOTE(review): the exact shape depends on ``self.encoder`` - confirm.
    class_probabilities : torch.FloatTensor
        Softmax of ``logits`` over the last dimension.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    # The sizes themselves are unused; the unpacking is kept because it doubles
    # as a check that the embedder returned a 3-dimensional tensor.
    _batch_size, _sequence_length, _ = embedded_text_input.size()
    mask = get_text_field_mask(tokens)
    encoded_text = self.encoder(embedded_text_input, mask)

    logits = self.projection_layer(encoded_text)
    # Name the class dimension explicitly: calling softmax without `dim` is
    # deprecated and picks the axis from the input rank.
    class_probabilities = F.softmax(logits, dim=-1)

    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities
    }

    if label is not None:
        loss = self._loss(logits, label.long().view(-1))
        output_dict["loss"] = loss
        self._accuracy(logits, label.squeeze(-1))

    return output_dict
def forward(
        self,
        title: Dict[str, torch.LongTensor],
        abstract: Dict[str, torch.LongTensor],
        md: MetadataField,
        label: torch.LongTensor = None,
        label_true: torch.FloatTensor = None) -> Dict[str, torch.Tensor]:
    """
    Classify a paper from its title and abstract.

    Title and abstract are embedded with the shared embedder, encoded by
    their own encoders, concatenated and passed to the feed-forward
    classifier, which yields one score per instance.

    * Without PU loss, the score ``s`` is expanded to the two-column logits
      ``[-s, s]`` (or ``[s, -s]`` when ``positive_class`` is not 1) and
      softmaxed; the loss is computed on those two-column logits.
    * With PU loss, ``self.normalize(s)`` is the positive probability and
      the loss is computed on the raw scores.

    ``md`` and ``label_true`` are accepted but not read here.

    Returns a dict with ``class_probabilities`` and, when ``label`` is
    given, ``loss``.
    """
    embedded_abstract = self.text_field_embedder(abstract)
    embedded_title = self.text_field_embedder(title)
    title_mask = util.get_text_field_mask(title)
    abstract_mask = util.get_text_field_mask(abstract)

    encoded_title = self.title_encoder(embedded_title, title_mask)
    encoded_abstract = self.abstract_encoder(embedded_abstract, abstract_mask)
    logits = self.classifier_feedforward(
        torch.cat([encoded_title, encoded_abstract], dim=-1))

    # Work out the "positive" and "negative" quantities for the active mode ...
    if self.pu_loss:
        positive = self.normalize(logits)
        negative = 1 - positive
    else:
        positive = logits
        negative = -logits

    # ... and place the positive one in the column named by `positive_class`.
    if self.positive_class == 1:
        ordered = (negative, positive)
    else:
        ordered = (positive, negative)
    two_columns = torch.cat((ordered[0].view(-1, 1), ordered[1].view(-1, 1)), dim=1)

    if self.pu_loss:
        class_probabilities = two_columns
    else:
        logits = two_columns
        class_probabilities = F.softmax(logits, dim=1)

    output_dict = {"class_probabilities": class_probabilities}
    if label is None:
        return output_dict

    loss = self.loss(logits, label)
    for metric in self.metrics.values():
        metric(class_probabilities, label.long())
    output_dict["loss"] = loss
    return output_dict
def _encode(self,
            source_tokens: Dict[str, torch.Tensor],
            verb_indicator: torch.LongTensor,
            lang_indicator: torch.LongTensor) -> Dict[str, torch.Tensor]:
    """
    Encode source input sentences.

    The token embeddings are extended, along the last dimension, with the
    verb-indicator embedding and the language embedding whenever the
    corresponding embedding module is set, and the result is run through the
    encoder.

    Returns a dict with ``source_mask`` and ``encoder_outputs``.
    """
    # shape: (batch_size, max_input_sequence_length, encoder_input_dim)
    feature_blocks = [self._source_embedder(source_tokens)]

    if self._binary_feature_embedding:
        feature_blocks.append(
            self._binary_feature_embedding(verb_indicator.long()))
    if self._language_embedding:
        feature_blocks.append(
            self._language_embedding(lang_indicator.long()))

    if len(feature_blocks) == 1:
        embedded_input = feature_blocks[0]
    else:
        embedded_input = torch.cat(feature_blocks, -1)

    # shape: (batch_size, max_input_sequence_length)
    source_mask = util.get_text_field_mask(source_tokens)
    # shape: (batch_size, max_input_sequence_length, encoder_output_dim)
    encoder_outputs = self._encoder(embedded_input, source_mask)

    encoded = {}
    encoded["source_mask"] = source_mask
    encoded["encoder_outputs"] = encoder_outputs
    return encoded
def forward(
        self,
        tokens: Dict[str, torch.LongTensor],
        input_mask: torch.LongTensor,
        segment_ids: torch.LongTensor,
        next_sentence_labels: torch.FloatTensor,
        masked_lm_positions: torch.LongTensor,
        masked_lm_weights: torch.LongTensor,
        masked_lm_labels: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]:
    """
    Run the transformer and, when labels are given, the two pre-training heads.

    The returned dict always holds ``encoded_layer`` (transformer output),
    ``pooled_output`` (tanh of the feed-forward applied to the vector at
    position 0) and ``loss``. The masked-LM and next-sentence entries
    (``*_loss``, ``*_example_loss``, ``*_log_probs``) are added for each head
    whose labels are not ``None``, and the matching accuracy metric is updated.

    NOTE: ``loss`` is the masked-LM loss only (``None`` without masked-LM
    labels); the next-sentence loss is reported but not added to it.
    """
    embedded_tokens = self._text_field_embedder(tokens)
    transformed_tokens = self._transformer(embedded_tokens, input_mask, segment_ids)
    pooled_output = torch.tanh(self._feedforward(transformed_tokens[:, 0, :]))

    output_dict = {
        'encoded_layer': transformed_tokens,
        'pooled_output': pooled_output,
    }

    embedding_table = self._text_field_embedder.get_embedding_by_name('tokens')
    masked_lm_loss = None

    if masked_lm_labels is not None:
        lm_outputs = get_masked_lm_output(
            self._use_fp16, transformed_tokens, self._norm_layer,
            self._vocab_bias, self._masked_lm_feedforward, embedding_table,
            masked_lm_positions.long(), masked_lm_labels['tokens'],
            masked_lm_weights)
        masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = lm_outputs
        output_dict['masked_lm_loss'] = masked_lm_loss
        output_dict['masked_lm_example_loss'] = masked_lm_example_loss
        output_dict['masked_lm_log_probs'] = masked_lm_log_probs
        self._masked_lm_accuracy(masked_lm_log_probs.float(),
                                 masked_lm_labels["tokens"].view(-1))

    if next_sentence_labels is not None:
        ns_outputs = get_next_sentence_output(
            self._use_fp16, pooled_output, self._next_sentence_feedforward,
            next_sentence_labels)
        next_sentence_loss, next_sentence_example_loss, next_sentence_log_probs = ns_outputs
        output_dict['next_sentence_loss'] = next_sentence_loss
        output_dict['next_sentence_example_loss'] = next_sentence_example_loss
        output_dict['next_sentence_log_probs'] = next_sentence_log_probs
        self._next_sentence_accuracy(next_sentence_log_probs.float(),
                                     next_sentence_labels)

    output_dict["loss"] = masked_lm_loss
    return output_dict
def forward(
        self,  # type: ignore
        left: Dict[str, torch.LongTensor],
        right: Dict[str, torch.Tensor],
        label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    """
    Encode two texts and compute the loss between their encodings.

    Parameters
    ----------
    left : Dict[str, torch.LongTensor], required
        The first text; embedded with ``self.text_field_embedder`` and encoded
        with ``self.left_encoder``.
    right : Dict[str, torch.Tensor], required
        The second text; embedded with the same embedder and encoded with
        ``self.right_encoder``.
    label : torch.LongTensor, optional (default = None)
        Gold label per instance. It is handed to ``self.loss`` exactly as
        given; metrics are only updated when it is not ``None``.

    Returns
    -------
    An output dictionary consisting of:
    loss : torch.FloatTensor
        The value of ``self.loss(v_l, v_r, label)``.
    """
    left_embedded = self.text_field_embedder(left)
    left_mask = util.get_text_field_mask(left)
    right_embedded = self.text_field_embedder(right)
    right_mask = util.get_text_field_mask(right)

    v_l = self.left_encoder(left_embedded, left_mask)
    v_r = self.right_encoder(right_embedded, right_mask)

    loss = self.loss(v_l, v_r, label)
    # A pair is predicted positive when its cosine similarity exceeds the threshold.
    sim = F.cosine_similarity(v_l, v_r) > self.prediction_threshold

    output_dict = {'loss': loss}

    # `label` is declared optional, so do not dereference it unconditionally:
    # without this guard the metric update crashed on `None.long()`.
    if label is not None:
        for metric in self.metrics.values():
            metric(sim.long(), label.long())

    return output_dict
def forward(self,
            images: torch.Tensor,
            objects: torch.LongTensor,
            segms: torch.Tensor,
            boxes: torch.Tensor,
            box_mask: torch.LongTensor,
            question: Dict[str, torch.Tensor],
            question_tags: torch.LongTensor,
            question_mask: torch.LongTensor,
            answers: Dict[str, torch.Tensor],
            answer_tags: torch.LongTensor,
            answer_mask: torch.LongTensor,
            metadata: List[Dict[str, Any]] = None,
            label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    """
    Run the trunk, which already returns probabilities, and optionally
    compute accuracy and loss.

    All inputs except ``metadata`` and ``label`` are forwarded to the trunk
    in order; ``metadata`` is not read. The result holds ``label_probs`` and
    ``cnn_regularization_loss``; with a ``label`` it also holds ``loss``
    (given a leading dimension of size 1).
    """
    trunk_inputs = (images, objects, segms, boxes, box_mask,
                    question, question_tags, question_mask,
                    answers, answer_tags, answer_mask)
    features = self.trunk.forward(*trunk_inputs)
    probs = features['probs']

    output_dict = {
        'label_probs': features['probs'],
        'cnn_regularization_loss': features['cnn_regularization_loss'],
        # Uncomment to visualize attention, if you want
        # 'qa_attention_weights': features['qa_attention_weights'],
        # 'atoo_attention_weights': features['atoo_attention_weights'],
    }
    if label is None:
        return output_dict

    self._accuracy(probs.argmax(dim=1), label)
    # Only probabilities are available (no logits), so the loss is fed
    # log(probs); per the original note, self._loss is an NLLLoss.
    log_probs = torch.log(probs)
    loss = self._loss(log_probs, label.long().view(-1))
    output_dict["loss"] = loss[None]
    return output_dict
def forward(self,
            embedded_tokens: torch.FloatTensor,
            input_mask: torch.LongTensor,
            segment_ids: torch.LongTensor = None):  # pylint: disable=arguments-differ
    """
    Apply the stack of transformer encoder layers to ``embedded_tokens``.

    The embeddings are scaled by ``self.embed_scale`` and post-processed by
    ``common_attention.embedding_postprocessor``. Each layer then applies
    self-attention, an output projection with dropout and a residual layer
    norm, followed by the intermediate/output feed-forward pair with a second
    residual layer norm.

    Returns the output of the last layer.
    """
    scaled_tokens = embedded_tokens * self.embed_scale
    hidden = common_attention.embedding_postprocessor(
        scaled_tokens,
        input_mask.long(),
        self._use_fp16,
        token_type_ids=segment_ids,
        use_token_type=self._use_token_type,
        token_type_embedding=self._token_type_embedding,
        use_position_embeddings=self._use_position_embeddings,
        position_embedding=self._position_embedding,
        norm_layer=self._norm_layer,
        dropout=self._dropout)

    attention_bias = common_attention.create_attention_mask_from_input_mask(
        hidden, input_mask, self._use_fp16)

    # Pass a key-padding mask only when the batch actually contains padding.
    padding_mask = input_mask.eq(0)
    if not padding_mask.any():
        padding_mask = None

    layer_stack = zip(self._attention_layers,
                      self._feedforward_output_layers,
                      self._feedforward_layers,
                      self._feedforward_intermediate_layers,
                      self._layer_norm_output_layers,
                      self._layer_norm_layers)
    for (self_attention, attention_projection, output_projection,
         intermediate_projection, attention_norm, output_norm) in layer_stack:
        residual = hidden

        # Attention sub-layer: attend, project, dropout, add & norm.
        attended = self_attention(residual,
                                  attention_bias,
                                  key_padding_mask=padding_mask)
        attended = self._dropout(attention_projection(attended))
        attended = attention_norm(attended + residual)

        # Feed-forward sub-layer: expand, activate, project, dropout, add & norm.
        expanded = self._activation(intermediate_projection(attended))
        projected = self._dropout(output_projection(expanded))
        hidden = output_norm(projected + attended)

    return hidden
def forward(self,
            premise_img: torch.Tensor,
            hypothesis: Dict[str, torch.Tensor],
            label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    """
    Classify an (image premise, text hypothesis) pair.

    :param premise_img: image input, passed to ``self.detector``
    :param hypothesis: text field tensors of the hypothesis
    :param label: optional gold labels; when given, loss and accuracy are computed
    :return: dict with ``label_logits`` and ``label_probs``, plus ``loss``
        when ``label`` is given
    """
    hypothesis_vectors = self._text_field_embedder(hypothesis)
    hypothesis_mask = get_text_field_mask(hypothesis).float()

    # Input dropout is optional.
    if self.rnn_input_dropout:
        hypothesis_vectors = self.rnn_input_dropout(hypothesis_vectors)

    encoded_hypothesis = self._encoder(hypothesis_vectors, hypothesis_mask)
    # Summarise the hypothesis by the encoder's final state(s).
    hypothesis_summary = get_final_encoder_states(
        encoded_hypothesis,
        hypothesis_mask,
        self._encoder.is_bidirectional())

    image_features = self.detector(premise_img)
    fused = torch.cat((image_features, hypothesis_summary), dim=-1)

    label_logits = self._output_feedforward(fused)
    output_dict = {
        "label_logits": label_logits,
        "label_probs": nn.functional.softmax(label_logits, dim=-1),
    }
    if label is None:
        return output_dict

    loss = self._loss(label_logits, label.long().view(-1))
    self._accuracy(label_logits, label)
    output_dict["loss"] = loss
    return output_dict
def _get_permutation_indices(mask: torch.LongTensor): """ Get the index for sorting with the length of the sequences. Empty sequences will be removed, but later restored with `restoration_idx`. """ seq_lens = mask.long().sum(-1) sorted_seq_lens, perm_idx = seq_lens.sort(descending=True) # remove empty sequences num_non_zero_seqs = len(seq_lens.nonzero()) truncated_sorted_seq_lens = sorted_seq_lens[:num_non_zero_seqs] truncated_perm_idx = perm_idx[:num_non_zero_seqs] # compute restoration index to sort tensors into the original order later. _, restoration_idx = perm_idx.sort() return truncated_perm_idx, truncated_sorted_seq_lens, restoration_idx
def _embed_source(self, source_tokens: Dict[str, torch.Tensor], source_entity_length: torch.LongTensor): """ :param source_tokens :param source_entity_length: (batch_size, max_token_num) :return (batch_size, max_token_num, embedding_dim) """ token_ids = source_tokens['tokens'] embedded = self._source_embedding(token_ids) batched_embedded = list() embedding_dim = embedded.size(-1) batch_size, max_token_num = source_entity_length.size() for _embedded, _length in zip(embedded, source_entity_length.long()): merged_embedded_input = list() idx = 0 for length in _length: if length > 0: embedding = torch.mean(_embedded[idx:idx + length, :], dim=0) merged_embedded_input.append(embedding) idx += length else: break merged_embedded_input = torch.stack(merged_embedded_input, dim=0) pad_num = max_token_num - merged_embedded_input.size(0) if pad_num > 0: merged_embedded_input = torch.cat( (merged_embedded_input, merged_embedded_input.new_zeros([pad_num, embedding_dim ])), dim=0) batched_embedded.append(merged_embedded_input) # shape: (batch_size, max_token_num, embedding_dim) batched_embedded = torch.stack(batched_embedded, dim=0) assert batched_embedded.size(0) == embedded.size( 0) and batched_embedded.size(1) == source_entity_length.size(1) # TODO: Dropout return batched_embedded
def _decoder_step(
        self, last_predictions: torch.Tensor,
        selective_weights: torch.Tensor, lang_indicator: torch.LongTensor,
        state: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """
    Advance the decoder cell by one step, updating ``state`` in place.

    The decoder input is the concatenation of the embedded previous
    prediction (plus the language embedding, if configured), an attentive
    read over all encoder outputs, and a selective read over the encoder
    outputs with the first and last positions removed. It is projected and
    fed to the decoder cell, whose new hidden and context states overwrite
    ``state["decoder_hidden"]`` and ``state["decoder_context"]``.

    Returns the same ``state`` dict.
    """
    encoder_outputs = state["encoder_outputs"]
    # shape: (group_size, max_input_sequence_length)
    encoder_outputs_mask = state["source_mask"].float()

    # shape: (group_size, target_embedding_dim)
    embedded_input = self._target_embedder(last_predictions)

    if self._language_embedding:
        group_size = embedded_input.size(0)
        language_vectors = self._language_embedding(lang_indicator.long())
        # A 3-d language embedding is brought down to two dimensions: take
        # the first slice when dim 1 already matches the group size,
        # otherwise flatten everything after the group dimension.
        if len(language_vectors.size()) == 3:
            if language_vectors.size(1) == group_size:
                language_vectors = language_vectors[0]
            else:
                language_vectors = language_vectors.view(group_size, -1)
        embedded_input = torch.cat([embedded_input, language_vectors], -1)

    # shape: (group_size, max_input_sequence_length)
    attentive_weights = self._attention(state["decoder_hidden"],
                                        encoder_outputs,
                                        encoder_outputs_mask)
    # shape: (group_size, encoder_output_dim)
    attentive_read = util.weighted_sum(encoder_outputs, attentive_weights)
    # shape: (group_size, encoder_output_dim)
    selective_read = util.weighted_sum(encoder_outputs[:, 1:-1],
                                       selective_weights)

    # shape: (group_size, target_embedding_dim + encoder_output_dim * 2)
    decoder_input = torch.cat(
        (embedded_input, attentive_read, selective_read), -1)
    # shape: (group_size, decoder_input_dim)
    projected_decoder_input = self._input_projection_layer(decoder_input)

    previous_cell_state = (state["decoder_hidden"], state["decoder_context"])
    new_hidden, new_context = self._decoder_cell(projected_decoder_input,
                                                 previous_cell_state)
    state["decoder_hidden"] = new_hidden
    state["decoder_context"] = new_context
    return state
def forward(self,  # type: ignore
            text: Dict[str, torch.LongTensor],
            predicate_indicator: torch.LongTensor,
            labeled_spans: torch.LongTensor = None,
            annotations: Dict = None,
            **kwargs):
    """
    Score every candidate span of the sentence for the indicated predicate.

    The text embedding is concatenated with the predicate-indicator
    embedding and encoded; ``self.span_hidden`` builds span representations
    and a mask, and ``self.pred`` scores each span.

    Returns a dict with ``logits``, ``probs`` (sigmoid of the logits, zeroed
    where ``span_mask`` is zero), ``span_mask`` and ``mask``. When
    ``labeled_spans`` is given it also holds ``loss`` (summed binary
    cross-entropy weighted by the span mask), and outside training the
    threshold metric is updated.

    Raises ``ConfigurationError`` when the encoder's input dimension does not
    match the concatenated embedding size.
    """
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(text))
    mask = get_text_field_mask(text)
    embedded_predicate_indicator = self.predicate_feature_embedding(predicate_indicator.long())
    embedded_text_with_predicate_indicator = torch.cat(
        [embedded_text_input, embedded_predicate_indicator], -1)
    _, sequence_length, embedding_dim_with_predicate_feature = \
        embedded_text_with_predicate_indicator.size()

    if self.stacked_encoder.get_input_dim() != embedding_dim_with_predicate_feature:
        raise ConfigurationError("The SRL model uses an indicator feature, which makes "
                                 "the embedding dimension one larger than the value "
                                 "specified. Therefore, the 'input_dim' of the stacked_encoder "
                                 "must be equal to total_embedding_dim + 1.")

    encoded_text = self.stacked_encoder(embedded_text_with_predicate_indicator, mask)
    span_hidden, span_mask = self.span_hidden(encoded_text, encoded_text, mask, mask)

    # Drop only the trailing score dimension (assumes self.pred emits one
    # score per span - confirm). A bare squeeze() would also remove the
    # batch dimension for a batch of one and break the loss below.
    logits = self.pred(F.relu(span_hidden)).squeeze(-1)
    probs = torch.sigmoid(logits) * span_mask.float()

    output_dict = {"logits": logits, "probs": probs, 'span_mask': span_mask}
    if labeled_spans is not None:
        # A labeled span is real (not padding) when its first index is non-negative.
        span_label_mask = (labeled_spans[:, :, 0] >= 0).squeeze(-1).long()
        prediction_mask = self.get_prediction_map(labeled_spans, span_label_mask,
                                                  sequence_length, annotations=annotations)
        # reduction='sum' is the current spelling of the deprecated size_average=False.
        loss = F.binary_cross_entropy_with_logits(logits, prediction_mask,
                                                  weight=span_mask.float(),
                                                  reduction='sum')
        output_dict["loss"] = loss
        if not self.training:
            spans = self.to_scored_spans(probs, span_mask)
            self.threshold_metric(spans, annotations)

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.decode.
    output_dict["mask"] = mask

    return output_dict
def forward(self, text: Dict[str, torch.LongTensor],
            predicate_indicator: torch.LongTensor):
    """
    Embed ``text``, optionally append the predicate-indicator embedding, and
    optionally encode the result.

    The predicate feature is appended only when ``self._predicate_feature_dim``
    is positive, and the stacked encoder is applied only when one is set.

    Returns ``(encoded_text, text_mask)``.
    """
    # Shape: batch_size, num_tokens, embedding_dim
    token_vectors = self._embedding_dropout(self._text_field_embedder(text))

    # Shape: batch_size, num_tokens ?
    text_mask = get_text_field_mask(text)

    encoder_input = token_vectors
    if self._predicate_feature_dim > 0:
        # Shape: batch_size, num_tokens, predicate_feature_dim ?
        predicate_vectors = self._predicate_feature_embedding(
            predicate_indicator.long())
        # Shape: batch_size, num_tokens, embedding_dim + predicate_feature_dim
        encoder_input = torch.cat([token_vectors, predicate_vectors], -1)

    if self._stacked_encoder is None:
        return encoder_input, text_mask

    # Shape: batch_size, num_tokens, encoder_output_dim
    return self._stacked_encoder(encoder_input, text_mask), text_mask
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Tag every token with a semantic role label for the indicated verb.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed
        directly to a ``TextFieldEmbedder``. At its most basic, using a
        ``SingleIdTokenIndexer`` this is: ``{"tokens": Tensor(batch_size, num_tokens)}``.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of the
        verb in the sentence. This should have shape (batch_size, num_tokens) and
        can be all zeros, in the case that the sentence has no verbal predicate.
    tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_tokens)``
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        metadata containing the original words in the sentence and the verb to
        compute the frame for, under 'words' and 'verb' keys, respectively.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        a distribution of the tag classes per word.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``tags`` is given.
    mask : the token mask derived from ``tokens``.
    words, verb : lists taken from ``metadata``; only present when ``metadata``
        is given.
    """
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
    mask = get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()

    encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_classes])
    output_dict = {"logits": logits, "class_probabilities": class_probabilities}

    if tags is not None:
        loss = sequence_cross_entropy_with_logits(
            logits, tags, mask, label_smoothing=self._label_smoothing)
        if not self.ignore_span_metric:
            self.span_metric(class_probabilities, tags, mask)
        output_dict["loss"] = loss

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.decode.
    output_dict["mask"] = mask

    # Only touch the metadata once we know it was supplied: it defaults to
    # None, and iterating it unconditionally would raise a TypeError.
    if metadata is not None:
        output_dict["words"] = [entry["words"] for entry in metadata]
        output_dict["verb"] = [entry["verb"] for entry in metadata]
    return output_dict
def paste(background: Tensor, patch: Tensor, x: LongTensor, y: LongTensor,
          mask: Optional[Tensor] = None):
    """
    Pastes the given patch into the background image tensor at the specified location.
    Optionally a mask of the same size as the patch can be passed in to blend the
    pasted contents with the background.

    :param background: A batch of image tensors of shape (B, C, H, W) that represent the background
    :param patch: A batch of image tensors of shape (B, C, h, w) which values get pasted into the background
    :param x: The horizontal integer coordinates relative to the top left corner of the background image.
        This tensor must be a one-dimensional tensor of shape (B, ).
    :param y: The vertical integer coordinates relative to the top left corner of the background image.
        This tensor must be a one-dimensional tensor of shape (B, ).
    :param mask: A mask of the same size as the patch that is used to blend foreground and background values.
        It is optional and defaults to ones (all is foreground).
    :return: The composite tensor of background and foreground values of shape (B, C, H, W).

    Note:
        1. The X- and Y-coordinates can exceed the range of the background image (negative and positive).
           The background is padded on every side that a patch overhangs and cropped back to
           (H, W) after pasting, so contents can go over the borders of the background image.
        2. Currently it only supports integer locations.
        3. All tensors must be on the same device.
    """
    # background: (B, C, H, W)
    # patch, mask: (B, C, h, w)
    # x, y: (B, )
    b, c, H, W = background.shape
    _, _, h, w = patch.shape
    mask = torch.ones_like(patch) if mask is None else mask
    device = background.device
    assert b == patch.size(0) == mask.size(0)
    assert b == x.size(0) == y.size(0)
    assert c == patch.size(1) == mask.size(1)
    assert h == mask.size(-2)
    assert w == mask.size(-1)
    assert 1 == x.ndimension() == y.ndimension()
    assert device == patch.device == x.device == y.device == mask.device

    x = x.long()
    y = y.long()

    # Dynamically pad the background for patches that go over its borders.
    # The left/top padding must equal the largest negative offset; the former
    # ``min(abs(...), 0)`` was always 0, so negative offsets wrapped around
    # to the opposite side and could collide with legitimate target pixels.
    left = max(-x.min().item(), 0)
    top = max(-y.min().item(), 0)
    right = max(x.max().item() + w - W, 0)
    bottom = max(y.max().item() + h - H, 0)
    background = nn.functional.pad(background, pad=[left, right, top, bottom])

    # Build broadcastable index tensors; together they address a
    # (B, C, h, w) window inside the padded background.
    batch_idx = torch.arange(b, device=device).view(b, 1, 1, 1)
    chan_idx = torch.arange(c, device=device).view(1, c, 1, 1)
    rows = y.view(b, 1, 1, 1) + torch.arange(h, device=device).view(1, 1, h, 1) + top
    cols = x.view(b, 1, 1, 1) + torch.arange(w, device=device).view(1, 1, 1, w) + left
    # After shifting by the padding every index is non-negative, so no
    # wrap-around masking is needed any more.
    target = (batch_idx, chan_idx, rows, cols)

    # paste (alpha-blend the patch over what is currently in the window)
    background[target] = mask * patch + (1 - mask) * background[target]

    # crop away the padded regions
    background = background[..., top:(top + H), left:(left + W)]
    return background
def forward(self,  # type: ignore
            label_indices: torch.LongTensor,
            token_representations: torch.FloatTensor = None,
            raw_tokens: List[List[str]] = None,
            labels: torch.LongTensor = None,
            **kwargs) -> Dict[str, torch.Tensor]:
    """
    Predict a label for each indexed token of each sentence in the batch.

    Either ``token_representations`` or ``raw_tokens`` (together with a
    contextualizer configured on this model) must be supplied.

    Parameters
    ----------
    label_indices : torch.LongTensor
        A LongTensor of shape (batch_size, max_num_adpositions) with the
        positions of the tokens to predict a label for, padded with -1.
    token_representations : torch.FloatTensor, optional (default = None)
        A tensor of shape (batch_size, sequence_length, representation_dim).
        If None, the model's own contextualizer produces it from ``raw_tokens``.
    raw_tokens : List[List[str]], optional (default = None)
        A batch of lists with the raw token strings. Only used when
        ``token_representations`` is None.
    labels : torch.LongTensor, optional (default = None)
        Integer gold class labels of shape ``(batch_size, num_label_indices)``.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_label_indices, num_classes)``
        representing unnormalized log probabilities of the classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_label_indices, num_classes)``
        representing a distribution of the tag classes.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimized. Only present when ``labels`` is given.

    Raises
    ------
    ConfigurationError
        If no representations are passed in and no contextualizer is set.
    ValueError
        If representations must be computed but ``raw_tokens`` is None.
    """
    # Convert to LongTensor
    # TODO: add PR to ArrayField to preserve array types.
    label_indices = label_indices.long()

    if token_representations is None:
        if self._contextualizer is None:
            raise ConfigurationError(
                "token_representation not provided as input to the model, and no "
                "contextualizer was specified. Either add a contextualizer to your "
                "dataset reader (preferred if your contextualizer is frozen) or to "
                "this model (if you wish to train your contextualizer).")
        if raw_tokens is None:
            raise ValueError(
                "Input raw_tokens is ``None`` --- make sure to set "
                "include_raw_tokens in the DatasetReader to True.")
        if label_indices is None:
            raise ValueError("Did not recieve any token indices, needed "
                             "if the contextualizer is within the model.")
        # Convert contextualizer output into a tensor
        # Shape: (batch_size, max_seq_len, representation_dim)
        contextualized = self._contextualizer(raw_tokens)
        token_representations, _ = pad_contextualizer_output(contextualized)

    # Everything is moved onto the device of the decoder's first linear layer.
    # TODO(nfliu): This only works if the module is on one device.
    first_layer_params = self._decoder._linear_layers[0].parameters()
    device = next(first_layer_params).device

    token_representations = token_representations.to(device)
    text_mask = get_text_mask_from_representations(token_representations).to(device)
    label_mask = self._get_label_mask_from_label_indices(label_indices).to(device)

    # The indices are padded with -1 because 0 is a valid position; negative
    # values cannot be used for indexing, so multiplying by the mask turns the
    # padding into 0 now that the mask has been computed.
    label_indices = label_indices * label_mask

    # Encode the token representation.
    encoded = self._encoder(token_representations, text_mask)

    # Pick out the encoded vectors of the tokens we need to label.
    # Shape: (batch_size, num_label_indices, representation_dim)
    batch_rows = get_range_vector(label_indices.size(0),
                                  get_device_of(label_indices)).unsqueeze(1)
    selected = encoded[batch_rows, label_indices].contiguous()

    # Decode out a label from the token representation
    # Shape: (batch_size, num_label_indices, num_classes)
    logits = self._decoder(selected)
    output_dict = {
        "logits": logits,
        "class_probabilities": F.softmax(logits, dim=-1)
    }

    if labels is None:
        return output_dict

    loss = sequence_cross_entropy_with_logits(logits,
                                              labels,
                                              label_mask,
                                              average=self.loss_average)
    for name, metric in self.metrics.items():
        # Metrics whose name starts with "_" only run in error analysis mode.
        if self.error_analysis or not name.startswith("_"):
            metric(logits, labels, label_mask.float())
    output_dict["loss"] = loss
    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            target_index: torch.LongTensor,
            span_starts: torch.LongTensor,
            span_ends: torch.LongTensor,
            span_mask: torch.LongTensor,
            constituents: torch.LongTensor = None,
            tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Jointly score spans for SRL (semi-CRF) and for constituent labels.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed
        directly to a ``TextFieldEmbedder``. At its most basic, using a
        ``SingleIdTokenIndexer`` this is: ``{"tokens": Tensor(batch_size, num_tokens)}``.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of the
        verb in the sentence. This should have shape (batch_size, num_tokens) and
        can be all zeros, in the case that the sentence has no verbal predicate.
    target_index : torch.LongTensor, required.
        One target position per batch element; negative values are clamped to 0.
    span_starts : shape ``(batch_size, num_spans)``; negatives are clamped to 0.
    span_ends : shape ``(batch_size, num_spans)``; negatives are clamped to 0.
    span_mask : torch.LongTensor, required.
        Mask used to weight the constituent cross-entropy loss.
    constituents : torch.LongTensor, optional (default = None)
        Gold constituent labels, reshaped to ``(batch_size, -1, max_span_width)``.
    tags : torch.LongTensor, optional (default = None)
        Gold SRL labels of shape ``(batch_size, num_spans)``, reshaped to
        ``(batch_size, -1, max_span_width)``.

    Returns
    -------
    An output dictionary with ``srl_logits``, ``constit_logits`` and ``mask``.
    Unless running in fast mode during training it also holds the decoded
    ``srl_tags``, ``srl_tag_probabilities``, ``constit_tags`` and
    ``constit_probabilities``. ``srl_loss``, ``constit_loss`` and ``loss`` are
    added when the corresponding gold labels are available.

    Raises
    ------
    ConfigurationError
        If the encoder input dimension does not match the width of the
        concatenated embeddings.
    """
    self.batch += 1
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
    batch_size = embedded_text_input.size(0)
    text_mask = util.get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    embedding_dim_with_binary_feature = embedded_text_with_verb_indicator.size()[2]

    if self.stacked_encoder.get_input_dim() != embedding_dim_with_binary_feature:
        raise ConfigurationError(
            "The SRL model uses an indicator feature, which makes "
            "the embedding dimension one larger than the value "
            "specified. Therefore, the 'input_dim' of the stacked_encoder "
            "must be equal to total_embedding_dim + 1.")

    encoded_text = self.stacked_encoder(embedded_text_with_verb_indicator, text_mask)

    # Negative (padding) indices are clamped to 0 so they can be used for lookup.
    span_starts = F.relu(span_starts.float()).long().view(batch_size, -1)
    span_ends = F.relu(span_ends.float()).long().view(batch_size, -1)
    target_index = F.relu(target_index.float()).long().view(batch_size)
    # shape (batch_size, sequence_length * max_span_width, embedding_dim)
    span_embeddings = span_srl_util.compute_span_representations(
        self.max_span_width, encoded_text, target_index, span_starts,
        span_ends, self.span_width_embedding,
        self.span_direction_embedding, self.span_distance_embedding,
        self.span_distance_bin, self.head_scorer)
    span_scores = self.span_feedforward(span_embeddings)

    srl_logits = self.srl_arg_projection_layer(span_scores)
    constit_logits = self.constit_arg_projection_layer(span_scores)
    output_dict = {
        "srl_logits": srl_logits,
        "constit_logits": constit_logits,
        "mask": text_mask
    }

    # Both gold tensors default to None, so only reshape what was supplied.
    if tags is not None:
        tags = tags.view(batch_size, -1, self.max_span_width)
    if constituents is not None:
        constituents = constituents.view(batch_size, -1, self.max_span_width)

    # Viterbi decoding (skipped only when training in fast mode)
    if not self.training or not self.fast_mode:
        srl_prediction, srl_probabilities = self.semi_crf.viterbi_tags(
            srl_logits, text_mask)
        output_dict["srl_tags"] = srl_prediction
        output_dict["srl_tag_probabilities"] = srl_probabilities
        if tags is not None:
            # Metrics need gold labels; without them only predictions are returned.
            self.metrics["srl"](
                predictions=srl_prediction.view(batch_size, -1, self.max_span_width),
                gold_labels=tags,
                mask=text_mask)

        reshaped_constit_logits = constit_logits.view(-1, self.num_constit_tags)
        constit_probabilities = F.softmax(reshaped_constit_logits, dim=-1)
        constit_predictions = constit_probabilities.max(-1)[1]
        output_dict["constit_tags"] = constit_predictions
        output_dict["constit_probabilities"] = constit_probabilities

        if constituents is not None:
            constit_predictions = constit_predictions.view(
                batch_size, -1, self.max_span_width)
            self.metrics["constituents"](predictions=constit_predictions,
                                         gold_labels=constituents,
                                         mask=text_mask)

    # Loss computation (skipped only when evaluating in fast mode)
    if self.training or not self.fast_mode:
        if tags is not None:
            srl_log_likelihood, _ = self.semi_crf(srl_logits, tags, mask=text_mask)
            output_dict["srl_loss"] = -srl_log_likelihood
        if constituents is not None:
            # Flattening it out.
            constituents = constituents.view(batch_size, -1)
            constit_loss = util.sequence_cross_entropy_with_logits(
                constit_logits, constituents, span_mask)
            output_dict["constit_loss"] = constit_loss
        if tags is not None and constituents is not None:
            if self.batch > self.cutoff_batch:
                # Past the cutoff the scaffold loss is mixed into the objective.
                output_dict["loss"] = - srl_log_likelihood + self.mixing_ratio * \
                    constit_loss
            else:
                output_dict["loss"] = -srl_log_likelihood

    if self.fast_mode and not self.training:
        # Fast evaluation does not compute a loss; report a constant zero.
        output_dict["loss"] = Variable(torch.FloatTensor([0.00]))

    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Tag every token with a semantic role label for the indicated verb.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed
        directly to a ``TextFieldEmbedder``. At its most basic, using a
        ``SingleIdTokenIndexer`` this is: ``{"tokens": Tensor(batch_size, num_tokens)}``.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of the
        verb in the sentence. This should have shape (batch_size, num_tokens) and
        can be all zeros, in the case that the sentence has no verbal predicate.
    tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_tokens)``

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        a distribution of the tag classes per word.
    encoded_text : the output of the stacked encoder.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``tags`` is given.
    mask : the token mask derived from ``tokens``.

    Raises
    ------
    ConfigurationError
        If the encoder input dimension does not match the width of the
        concatenated embeddings.
    """
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
    mask = get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    batch_size, sequence_length, embedding_dim_with_binary_feature = \
        embedded_text_with_verb_indicator.size()

    if self.stacked_encoder.get_input_dim() != embedding_dim_with_binary_feature:
        raise ConfigurationError(
            "The SRL model uses an indicator feature, which makes "
            "the embedding dimension one larger than the value "
            "specified. Therefore, the 'input_dim' of the stacked_encoder "
            "must be equal to total_embedding_dim + 1.")

    encoded_text = self.stacked_encoder(embedded_text_with_verb_indicator, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    # Normalise over the class axis explicitly; relying on softmax's implicit
    # dimension is deprecated and emits a warning.
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_classes])
    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities,
        "encoded_text": encoded_text
    }

    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits, tags, mask)
        self.span_metric(class_probabilities, tags, mask)
        output_dict["loss"] = loss

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.decode.
    output_dict["mask"] = mask
    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_span: torch.LongTensor,
            entity_span: torch.LongTensor,
            state_change_type_labels: torch.LongTensor = None,
            state_change_tags: torch.LongTensor = None
            ) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Predict a state change type per datapoint and a tag per token.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed
        directly to a ``TextFieldEmbedder``. At its most basic, using a
        ``SingleIdTokenIndexer`` this is: ``{"tokens": Tensor(batch_size, num_tokens)}``.
    verb_span: torch.LongTensor, required.
        An integer ``SequenceLabelField`` representation of the position of the
        focus verb in the sentence. This should have shape (batch_size, num_tokens)
        and can be all zeros, in the case that the pre-processing stage could not
        extract a verbal predicate.
    entity_span: torch.LongTensor, required.
        An integer ``SequenceLabelField`` representation of the position of the
        focus entity in the sentence. This should have shape (batch_size, num_tokens)
    state_change_type_labels: torch.LongTensor, optional (default = None)
        A torch tensor representing the state change type class labels
        (presumably of shape ``(batch_size, 1)`` -- TODO confirm).
    state_change_tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_tokens)``

    Returns
    -------
    An output dictionary consisting of:
    type_probs : torch.FloatTensor
        A tensor of shape ``(batch_size, num_state_change_types)`` representing
        a distribution of state change types per datapoint.
    tags_class_probabilities : torch.FloatTensor, optional
        A tensor of shape ``(batch_size, num_tokens, num_tags)`` representing
        a distribution of tags per token. Only present when
        ``state_change_tags`` is given.
    loss : torch.FloatTensor, optional
        The sum of the losses for whichever gold labels were supplied; absent
        when neither was supplied.
    """
    verb_flags = verb_span.float()
    entity_flags = entity_span.float()
    verb_feature = verb_flags.unsqueeze(-1)
    entity_feature = entity_flags.unsqueeze(-1)

    # Layer 1 = Word + Character embedding layer
    embedded_sentence = self.text_field_embedder(tokens)
    mask = get_text_field_mask(tokens).float()

    # Layer 2 = Add positional bit to encode position of focus verb and entity
    embedded_sentence_verb_entity = torch.cat(
        [embedded_sentence, verb_feature, entity_feature], dim=-1)

    # Layer 3 = Contextual embedding layer using Bi-LSTM over the sentence
    contextual_embedding = self.seq2seq_encoder(embedded_sentence_verb_entity, mask)

    # Layer 4: Attention (Contextual embedding, BOW(verb span))
    # The 1e-13 keeps the normalisation finite when a span is all zeros.
    verb_weight_matrix = verb_flags / (verb_flags.sum(-1).unsqueeze(-1) + 1e-13)
    verb_vector = weighted_sum(contextual_embedding * verb_feature, verb_weight_matrix)

    entity_weight_matrix = entity_flags / (entity_flags.sum(-1).unsqueeze(-1) + 1e-13)
    entity_vector = weighted_sum(contextual_embedding * entity_feature, entity_weight_matrix)

    verb_entity_vector = torch.cat([verb_vector, entity_vector], 1)

    batch_size, sequence_length = verb_span.size()

    # attention weights for type prediction
    attention_weights_types = self.attention_layer(verb_entity_vector, contextual_embedding)
    attention_output_vector = weighted_sum(contextual_embedding, attention_weights_types)

    # contextual embedding + positional vectors for tag prediction
    context_positional_tags = torch.cat(
        [contextual_embedding, verb_feature, entity_feature], dim=-1)

    # Layer 5 = Dense softmax layer to pick one state change type per datapoint,
    # and one tag per word in the sentence
    type_logits = self.aggregate_feedforward(attention_output_vector)
    type_probs = torch.nn.functional.softmax(type_logits, dim=-1)

    tags_logits = self.tag_projection_layer(context_positional_tags)
    reshaped_log_probs = tags_logits.view(-1, self.num_tags)
    tags_class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_tags])

    # Create output dictionary for the trainer
    # Compute loss and epoch metrics
    output_dict = {'type_probs': type_probs}
    # Each gold input is optional, so collect only the losses that could be
    # computed instead of summing variables that may never have been bound.
    loss_terms = []

    if state_change_type_labels is not None:
        loss_terms.append(
            self._loss(type_logits, state_change_type_labels.long().view(-1)))
        squeezed_type_labels = state_change_type_labels.squeeze(-1)
        for type_label in self.type_labels_vocab.values():
            metric = self.type_f1_metrics["type_" + type_label]
            metric(type_probs, squeezed_type_labels)
        self._type_accuracy(type_probs, squeezed_type_labels)

    if state_change_tags is not None:
        loss_terms.append(
            sequence_cross_entropy_with_logits(tags_logits, state_change_tags, mask))
        self.span_metric(tags_class_probabilities, state_change_tags, mask)
        output_dict["tags_class_probabilities"] = tags_class_probabilities

    if loss_terms:
        total_loss = loss_terms[0]
        for term in loss_terms[1:]:
            total_loss = total_loss + term
        output_dict['loss'] = total_loss

    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            targets: torch.LongTensor,
            target_index: torch.LongTensor,
            span_starts: torch.LongTensor,
            span_ends: torch.LongTensor,
            tags: torch.LongTensor = None,
            **kwargs) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Score spans and dispatch to the frame-SRL or the constituent scaffold graph.

    A batch is treated as a frame-SRL batch when both ``frame`` and
    ``valid_frame_elements`` are passed as keyword arguments, and as a
    scaffold (constituent) batch otherwise.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed
        directly to a ``TextFieldEmbedder``. At its most basic, using a
        ``SingleIdTokenIndexer`` this is: ``{"tokens": Tensor(batch_size, num_tokens)}``.
    targets : torch.LongTensor, required.
        Per-token target indicator that is embedded and concatenated onto the
        token embeddings.
    target_index : torch.LongTensor, required.
        One target position per batch element; negative values are clamped to 0.
    span_starts : shape ``(batch_size, num_spans)``; negatives are clamped to 0.
    span_ends : shape ``(batch_size, num_spans)``; negatives are clamped to 0.
    tags : shape ``(batch_size, num_spans)``
        Gold span labels. Although the default is None, the tensor is
        dereferenced unconditionally, so it must be supplied.
    kwargs :
        ``frame`` and ``valid_frame_elements`` for frame-SRL batches;
        ``span_mask`` and ``parent_tags`` for scaffold batches.

    Returns
    -------
    The output dictionary produced by ``compute_srl_graph`` or
    ``compute_constit_graph``. When evaluating in fast mode its ``loss`` entry
    is replaced by a constant zero.

    Raises
    ------
    ConfigurationError
        If the encoder input dimension does not match the width of the
        concatenated embeddings.
    ValueError
        If only one of ``frame`` / ``valid_frame_elements`` is given, or if
        parent labels are required but ``parent_tags`` is missing.
    """
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
    text_mask = util.get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(targets.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    embedding_dim_with_binary_feature = embedded_text_with_verb_indicator.size()[2]

    if self.stacked_encoder.get_input_dim() != embedding_dim_with_binary_feature:
        raise ConfigurationError(
            "The SRL model uses an indicator feature, which makes "
            "the embedding dimension one larger than the value "
            "specified. Therefore, the 'input_dim' of the stacked_encoder "
            "must be equal to total_embedding_dim + 1.")

    encoded_text = self.stacked_encoder(embedded_text_with_verb_indicator, text_mask)

    batch_size, num_spans = tags.size()
    assert num_spans % self.max_span_width == 0
    tags = tags.view(batch_size, -1, self.max_span_width)

    # Negative (padding) indices are clamped to 0 so they can be used for lookup.
    span_starts = F.relu(span_starts.float()).long().view(batch_size, -1)
    span_ends = F.relu(span_ends.float()).long().view(batch_size, -1)
    target_index = F.relu(target_index.float()).long().view(batch_size)

    # shape (batch_size, sequence_length * max_span_width, embedding_dim)
    span_embeddings = span_srl_util.compute_span_representations(
        self.max_span_width, encoded_text, target_index, span_starts,
        span_ends, self.span_width_embedding,
        self.span_direction_embedding, self.span_distance_embedding,
        self.span_distance_bin, self.head_scorer)
    span_scores = self.span_feedforward(span_embeddings)

    # FN-specific parameters.
    frame = kwargs.get('frame')
    valid_frame_elements = kwargs.get('valid_frame_elements')
    if (frame is None) != (valid_frame_elements is None):
        # Previously this surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError("'frame' and 'valid_frame_elements' must be provided together.")

    if frame is not None:
        # FrameSRL batch.
        output_dict = self.compute_srl_graph(
            span_scores=span_scores,
            frame=frame,
            valid_frame_elements=valid_frame_elements,
            tags=tags,
            text_mask=text_mask,
            target_index=target_index)
    else:
        # Scaffold batch. ``.get`` keeps both names bound even when the
        # caller omitted them, instead of failing with UnboundLocalError.
        span_mask = kwargs.get("span_mask")
        parent_tags = kwargs.get("parent_tags")
        if self.unlabeled_constits:
            not_a_constit = self.vocab.get_token_index(
                "*", self.constit_label_namespace)
            tags = (tags != not_a_constit).float().view(
                batch_size, -1, self.max_span_width)
        elif self.constit_label_namespace == "parent_labels":
            if parent_tags is None:
                raise ValueError("'parent_tags' is required when the constituent "
                                 "label namespace is 'parent_labels'.")
            tags = parent_tags.view(batch_size, -1, self.max_span_width)
        elif self.np_pp_constits:
            tags = self.get_new_tags_np_pp(tags, batch_size)
        output_dict = self.compute_constit_graph(span_mask=span_mask,
                                                 span_scores=span_scores,
                                                 constit_tags=tags,
                                                 text_mask=text_mask)

    if self.fast_mode and not self.training:
        # Fast evaluation does not need a loss; report a constant zero.
        output_dict["loss"] = Variable(torch.FloatTensor([0.00]))

    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Embed the tokens, append the verb-indicator feature, encode the result and
    project it to per-token tag logits.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of the verb
        in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
        all zeros, in the case that the sentence has no verbal predicate.
    tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_tokens)``
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        metadata containing the original words in the sentence and the verb to compute the
        frame for, under 'words' and 'verb' keys, respectively. When ``None``, the
        ``words`` and ``verb`` entries are left out of the output dictionary.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        a distribution of the tag classes per word.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``tags`` is given.
    mask : the mask computed by ``get_text_field_mask(tokens)``.
    words : List, optional
        The ``"words"`` entry of every metadata dictionary, in batch order.
        Only present when ``metadata`` is given.
    verb : List, optional
        The ``"verb"`` entry of every metadata dictionary, in batch order.
        Only present when ``metadata`` is given.
    """
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
    mask = get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat([embedded_text_input, embedded_verb_indicator], -1)
    batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()

    encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

    logits = self.tag_projection_layer(encoded_text)
    # Flatten to (batch_size * sequence_length, num_classes) for the softmax,
    # then restore the batch and sequence dimensions.
    flat_logits = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(flat_logits, dim=-1).view([batch_size,
                                                               sequence_length,
                                                               self.num_classes])
    output_dict = {"logits": logits, "class_probabilities": class_probabilities}

    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits,
                                                  tags,
                                                  mask,
                                                  label_smoothing=self._label_smoothing)
        if not self.ignore_span_metric:
            self.span_metric(class_probabilities, tags, mask)
        output_dict["loss"] = loss

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.decode.
    output_dict["mask"] = mask

    # Only touch ``metadata`` after the None check: iterating it first raised a
    # TypeError whenever the caller relied on the default. Building the two lists
    # independently also avoids the unpacking error ``zip(*[])`` would cause for
    # an empty batch of metadata.
    if metadata is not None:
        output_dict["words"] = [instance["words"] for instance in metadata]
        output_dict["verb"] = [instance["verb"] for instance in metadata]
    return output_dict