def __init__(self, config: BartConfig):
    super().__init__(config)
    base_model = BartModel(config)
    self.model = base_model
    self.register_buffer(
        "final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))
    )
def __init__(self, tagset_size):
    super(RobertaForSequenceClassification, self).__init__()
    self.tagset_size = tagset_size

    # self.roberta_single = RobertaModel.from_pretrained(pretrain_model_dir)
    self.roberta_single = BartModel.from_pretrained(pretrain_model_dir)
    # self.single_hidden2tag = nn.Linear(bert_hidden_dim, tagset_size)
    self.single_hidden2tag = RobertaClassificationHead(bert_hidden_dim, tagset_size)
    self.config = BartConfig.from_pretrained(pretrain_model_dir)
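# Hypothetical forward pass (not in the original snippet): with a BART backbone,
# outputs[0] is the decoder's last hidden state, so one simple choice is to hand the
# full hidden-state sequence to the RoBERTa-style head, assuming that head pools a
# single token (e.g. the first position) internally.
def forward(self, input_ids, attention_mask=None):
    outputs = self.roberta_single(input_ids, attention_mask=attention_mask)
    hidden_states = outputs[0]  # (batch_size, seq_len, hidden_dim)
    logits = self.single_hidden2tag(hidden_states)  # (batch_size, tagset_size)
    return logits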
def __init__(self, config: BartConfig, **kwargs):
    super().__init__(config, **kwargs)
    self.model = BartModel(config)
    self.register_buffer(
        "final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))
    )

    self.num_cfemotions = 12
    self.num_emotions = 6
    self.num_sentiments = 2

    self.cfemotion_head = BartClassificationHead(
        config.d_model, config.d_model, self.num_cfemotions, config.classif_dropout
    )
    self.model._init_weights(self.cfemotion_head.dense)
    self.model._init_weights(self.cfemotion_head.out_proj)

    self.emotion_head = BartClassificationHead(
        config.d_model, config.d_model, self.num_emotions, config.classif_dropout
    )
    self.model._init_weights(self.emotion_head.dense)
    self.model._init_weights(self.emotion_head.out_proj)

    self.sentiment_head = BartClassificationHead(
        config.d_model, config.d_model, self.num_sentiments, config.classif_dropout
    )
    self.model._init_weights(self.sentiment_head.dense)
    self.model._init_weights(self.sentiment_head.out_proj)
@classmethod
def from_pretrained_multi(
    cls, full_model_name_or_path, full_model_config=None, final_layers=2
):
    """Load either a full seq2seq model, or pre-load the model from separate
    encoder and decoder stacks."""

    bart_config = BartConfig.from_pretrained(full_model_name_or_path)
    bart_config.final_layers = final_layers

    # initialize with random weights
    model = cls(bart_config)

    # this is actually a complete checkpoint on disk
    if os.path.exists(full_model_name_or_path):
        ckpt = torch.load(
            os.path.join(full_model_name_or_path, "pytorch_model.bin"),
            map_location="cpu",
        )
        model.load_state_dict(ckpt)
        return model

    # otherwise, warm-start from a pretrained BART model
    bart_model = BartModel.from_pretrained(full_model_name_or_path)

    # encoder: full copy
    model.model.encoder.load_state_dict(bart_model.encoder.state_dict())

    # decoder embeddings: full copy
    model.model.decoder.embed_tokens.load_state_dict(
        bart_model.decoder.embed_tokens.state_dict()
    )
    model.model.decoder.embed_positions.load_state_dict(
        bart_model.decoder.embed_positions.state_dict()
    )

    # decoder layers: copy the lower layers (zip stops at the shorter stack,
    # excluding the last layer(s))
    for ml, bl in zip(model.model.decoder.layers, bart_model.decoder.layers):
        ml.load_state_dict(bl.state_dict())

    # each extra final layer is initialized from BART's last decoder layer
    for ml in model.model.decoder.final_layers:
        ml.load_state_dict(bart_model.decoder.layers[-1].state_dict())

    del bart_model
    return model
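# Usage sketch (the class that owns `from_pretrained_multi` is not shown here;
# `BartWithFinalLayers` is a placeholder name): warm-start from a stock BART
# checkpoint, or resume from a directory that holds a full `pytorch_model.bin`.
model = BartWithFinalLayers.from_pretrained_multi("facebook/bart-base", final_layers=2)
# model = BartWithFinalLayers.from_pretrained_multi("./checkpoints/bart_final_layers")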
def __init__(self, config, project_dim: int = 0):
    BartModel.__init__(self, config)
    assert config.hidden_size > 0, "Encoder hidden_size can't be zero"
    self.encode_proj = (
        nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
    )
    self.init_weights()
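# Minimal sketch (assumption, not from the source): how the optional projection might
# be applied in a DPR-style encoder built on this class, pooling the first token of
# the sequence output and projecting it whenever `encode_proj` is configured.
def embed(self, sequence_output):
    pooled_output = sequence_output[:, 0, :]  # first-token pooling (assumption)
    if self.encode_proj is not None:
        pooled_output = self.encode_proj(pooled_output)
    return pooled_output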
class BartForTokenClassification(PretrainedBartModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.classification_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classif_dropout,
        )
        self.model._init_weights(self.classification_head.dense)
        self.model._init_weights(self.classification_head.out_proj)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        use_cache=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss. Indices should be in
            :obj:`[0, ..., config.num_labels - 1]`.

        Returns:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
            (:class:`~transformers.BartConfig`) and inputs:

            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
                Classification loss (cross entropy).
            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
                Classification scores for each token (before SoftMax).
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each
                layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the
                self-attention heads.
        """
        if labels is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
        )
        x = outputs[0]  # last hidden state, (batch_size, sequence_length, d_model)
        logits = self.classification_head(x)

        # Prepend logits; keep hidden states and attentions if they are present
        outputs = (logits,) + outputs[1:]

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            if attention_mask is not None:
                # Only keep active (non-padding) positions in the loss
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.config.num_labels)
                active_labels = torch.where(
                    active_loss,
                    labels.view(-1),
                    torch.tensor(loss_fct.ignore_index).type_as(labels),
                )
                loss = F.cross_entropy(active_logits, active_labels)
            else:
                loss = F.cross_entropy(
                    logits.view(-1, self.config.num_labels), labels.view(-1)
                )
            outputs = (loss,) + outputs

        return outputs
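# Usage sketch (assumed checkpoint name and label count): token-level classification
# with the class above; `logits` has shape (batch_size, sequence_length, num_labels).
# Loading via `from_pretrained` fills the BART backbone and leaves the classification
# head randomly initialized.
import torch
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForTokenClassification.from_pretrained("facebook/bart-base", num_labels=9)

batch = tokenizer(["Hugging Face is based in New York City."], return_tensors="pt")
labels = torch.zeros_like(batch["input_ids"])  # dummy per-token labels for illustration
loss, logits = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    labels=labels,
)[:2]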
class BartForMultitaskLearning(PretrainedBartModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.register_buffer(
            "final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))
        )

        self.num_cfemotions = 12
        self.num_emotions = 6
        self.num_sentiments = 2

        self.cfemotion_head = BartClassificationHead(
            config.d_model, config.d_model, self.num_cfemotions, config.classif_dropout
        )
        self.model._init_weights(self.cfemotion_head.dense)
        self.model._init_weights(self.cfemotion_head.out_proj)

        self.emotion_head = BartClassificationHead(
            config.d_model, config.d_model, self.num_emotions, config.classif_dropout
        )
        self.model._init_weights(self.emotion_head.dense)
        self.model._init_weights(self.emotion_head.out_proj)

        self.sentiment_head = BartClassificationHead(
            config.d_model, config.d_model, self.num_sentiments, config.classif_dropout
        )
        self.model._init_weights(self.sentiment_head.dense)
        self.model._init_weights(self.sentiment_head.out_proj)

    def resize_token_embeddings(self, new_num_tokens):
        old_num_tokens = self.model.shared.num_embeddings
        new_embeddings = super().resize_token_embeddings(new_num_tokens)
        self.model.shared = new_embeddings
        self._resize_final_logits_bias(new_num_tokens, old_num_tokens)
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens, old_num_tokens):
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros(
                (1, new_num_tokens - old_num_tokens),
                device=self.final_logits_bias.device,
            )
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_cached_states=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        task=None,
        **unused
    ):
        if "lm_labels" in unused:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed "
                "in a future version, use `labels` instead.",
                DeprecationWarning,
            )
            labels = unused.pop("lm_labels")

        if labels is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            decoder_cached_states=decoder_cached_states,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        if task == "response":
            lm_logits = F.linear(
                outputs[0], self.model.shared.weight, bias=self.final_logits_bias
            )
            # Prepend LM logits; keep cache, hidden states and attentions if present
            outputs = (lm_logits,) + outputs[1:]

            if labels is not None:
                loss_fct = nn.CrossEntropyLoss()
                # TODO(SS): do we need to ignore pad tokens in labels?
                masked_lm_loss = loss_fct(
                    lm_logits.view(-1, self.config.vocab_size), labels.view(-1)
                )
                outputs = (masked_lm_loss,) + outputs

        elif task in ["cfemotion", "emotion", "sentiment"]:
            x = outputs[0]  # last hidden state
            eos_mask = input_ids.eq(self.config.eos_token_id)

            if len(torch.unique(eos_mask.sum(1))) > 1:
                raise ValueError(
                    "All examples must have the same number of <eos> tokens."
                )

            if task == "cfemotion":
                classification_head = self.cfemotion_head
                num_labels = self.num_cfemotions
            elif task == "emotion":
                classification_head = self.emotion_head
                num_labels = self.num_emotions
            else:
                classification_head = self.sentiment_head
                num_labels = self.num_sentiments

            sentence_representation = x[eos_mask, :].view(
                x.size(0), -1, x.size(-1)
            )[:, -1, :]
            logits = classification_head(sentence_representation)

            # Prepend logits; keep hidden states and attentions if present
            outputs = (logits,) + outputs[1:]

            if labels is not None:
                # prepend loss to output
                loss = F.cross_entropy(logits.view(-1, num_labels), labels.view(-1))
                outputs = (loss,) + outputs

        else:
            raise ValueError("The dataset contains an invalid task.")

        return outputs

    def prepare_inputs_for_generation(
        self, decoder_input_ids, past, attention_mask, use_cache, task, **kwargs
    ):
        assert past is not None, "past has to be defined for encoder_outputs"

        encoder_outputs, decoder_cached_states = past
        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "decoder_cached_states": decoder_cached_states,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
            "task": task,
        }

    def adjust_logits_during_generation(self, logits, cur_len, max_length):
        if cur_len == 1:
            self._force_token_ids_generation(logits, self.config.bos_token_id)
        if cur_len == max_length - 1 and self.config.eos_token_id is not None:
            self._force_token_ids_generation(logits, self.config.eos_token_id)
        return logits

    def _force_token_ids_generation(self, scores, token_ids) -> None:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        all_but_token_ids_mask = torch.tensor(
            [x for x in range(self.config.vocab_size) if x not in token_ids],
            dtype=torch.long,
            device=next(self.parameters()).device,
        )
        assert len(scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]"
        scores[:, all_but_token_ids_mask] = -float("inf")

    @staticmethod
    def _reorder_cache(past, beam_idx):
        ((enc_out, enc_mask), decoder_cached_states) = past
        reordered_past = []
        for layer_past in decoder_cached_states:
            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
            layer_past_new = {
                attn_key: _reorder_buffer(attn_cache, beam_idx)
                for attn_key, attn_cache in layer_past.items()
            }
            reordered_past.append(layer_past_new)

        new_enc_out = enc_out if enc_out is None else enc_out.index_select(0, beam_idx)
        new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select(0, beam_idx)

        past = ((new_enc_out, new_enc_mask), reordered_past)
        return past

    def get_encoder(self):
        return self.model.encoder

    def get_output_embeddings(self):
        return _make_linear_from_emb(self.model.shared)  # make it on the fly
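# Usage sketch (assumed checkpoint name and inputs): the multitask model routes the
# shared encoder-decoder backbone either to the LM head (task="response") or to one
# of the classification heads (task="sentiment", "emotion", "cfemotion"). Loading via
# `from_pretrained` fills the backbone; the task heads stay randomly initialized until
# fine-tuned.
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForMultitaskLearning.from_pretrained("facebook/bart-base")

batch = tokenizer(["I am thrilled about the results!"], return_tensors="pt")

# Sequence-level sentiment head: logits over `self.num_sentiments` classes.
sentiment_logits = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    task="sentiment",
)[0]

# Response generation head: per-token logits over the vocabulary.
lm_logits = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    task="response",
)[0]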