Example #1
    def __init__(self, config: BartConfig):
        super().__init__(config)
        base_model = BartModel(config)
        self.model = base_model
        # Bias added on top of the LM logits; registered as a buffer so it is
        # saved with the model but holds no trainable parameters.
        self.register_buffer(
            "final_logits_bias",
            torch.zeros((1, self.model.shared.num_embeddings)))
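For reference, the final_logits_bias buffer registered here is later added on top of language-model logits computed against the shared embedding weights, as the F.linear call in Example #6 shows. A minimal, hedged sketch of that pattern with made-up shapes (all sizes below are illustrative only):

import torch
import torch.nn.functional as F

vocab_size, d_model = 50265, 16                      # illustrative sizes
shared_weight = torch.randn(vocab_size, d_model)     # stand-in for self.model.shared.weight
final_logits_bias = torch.zeros(1, vocab_size)       # the buffer registered above
decoder_hidden = torch.randn(2, 7, d_model)          # (batch_size, seq_len, d_model)

# project hidden states onto the vocabulary and add the bias
lm_logits = F.linear(decoder_hidden, shared_weight, bias=final_logits_bias)
print(lm_logits.shape)                               # torch.Size([2, 7, 50265])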
Example #2
    def __init__(self, tagset_size):
        super(RobertaForSequenceClassification, self).__init__()
        self.tagset_size = tagset_size

        # self.roberta_single= RobertaModel.from_pretrained(pretrain_model_dir)
        self.roberta_single = BartModel.from_pretrained(pretrain_model_dir)
        # self.single_hidden2tag = nn.Linear(bert_hidden_dim, tagset_size)
        self.single_hidden2tag = RobertaClassificationHead(
            bert_hidden_dim, tagset_size)
        # note: BartConfig(...) does not accept a model path; load the config
        # from the pretrained checkpoint instead
        self.config = BartConfig.from_pretrained(pretrain_model_dir)
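The RobertaClassificationHead(bert_hidden_dim, tagset_size) used here is project-specific, so only the backbone side can be sketched reliably. A hedged example of what self.roberta_single produces, assuming the facebook/bart-base checkpoint (an illustrative choice):

import torch
from transformers import BartModel, BartTokenizer

backbone = BartModel.from_pretrained("facebook/bart-base")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

inputs = tokenizer("an example sentence", return_tensors="pt")
# last decoder hidden state, shape (batch_size, seq_len, d_model); this is the
# kind of tensor a classification head such as single_hidden2tag would consume
last_hidden = backbone(inputs["input_ids"], attention_mask=inputs["attention_mask"])[0]
print(last_hidden.shape)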
Example #3
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)

        self.model = BartModel(config)
        self.register_buffer(
            "final_logits_bias",
            torch.zeros((1, self.model.shared.num_embeddings))
        )

        self.num_cfemotions = 12
        self.num_emotions = 6
        self.num_sentiments = 2

        self.cfemotion_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            self.num_cfemotions,
            config.classif_dropout
        )
        self.model._init_weights(self.cfemotion_head.dense)
        self.model._init_weights(self.cfemotion_head.out_proj)

        self.emotion_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            self.num_emotions,
            config.classif_dropout
        )
        self.model._init_weights(self.emotion_head.dense)
        self.model._init_weights(self.emotion_head.out_proj)

        self.sentiment_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            self.num_sentiments,
            config.classif_dropout
        )
        self.model._init_weights(self.sentiment_head.dense)
        self.model._init_weights(self.sentiment_head.out_proj)
Example #4
    @classmethod
    def from_pretrained_multi(
        cls, full_model_name_or_path, full_model_config=None, final_layers=2
    ):
        """Load either a full seq2seq model, or pre-load the model from
        separate encoder and decoder stacks."""

        bart_config = BartConfig.from_pretrained(full_model_name_or_path)
        bart_config.final_layers = final_layers
        # initialize with random weights
        model = cls(bart_config)

        # a local path means a complete checkpoint of this model is available
        if os.path.exists(full_model_name_or_path):
            ckpt = torch.load(
                os.path.join(full_model_name_or_path, "pytorch_model.bin"),
                map_location="cpu",
            )
            model.load_state_dict(ckpt)
            return model

        # otherwise, load a pretrained BART and copy its weights over
        bart_model = BartModel.from_pretrained(full_model_name_or_path)

        # full copy of the encoder and of the decoder embedding tables
        model.model.encoder.load_state_dict(bart_model.encoder.state_dict())
        model.model.decoder.embed_tokens.load_state_dict(
            bart_model.decoder.embed_tokens.state_dict()
        )
        model.model.decoder.embed_positions.load_state_dict(
            bart_model.decoder.embed_positions.state_dict()
        )

        # copy the pretrained decoder layers into the regular decoder stack
        for ml, bl in zip(model.model.decoder.layers, bart_model.decoder.layers):
            ml.load_state_dict(bl.state_dict())

        # seed each extra final layer with the last pretrained decoder layer
        for ml in model.model.decoder.final_layers:
            ml.load_state_dict(bart_model.decoder.layers[-1].state_dict())

        del bart_model

        return model
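The weight-copying pattern in from_pretrained_multi can be reproduced with two plain BartModel instances. A hedged sketch (the model id facebook/bart-base and the variable names are illustrative; target stands in for the randomly initialized model.model, source for bart_model):

from transformers import BartConfig, BartModel

config = BartConfig.from_pretrained("facebook/bart-base")
target = BartModel(config)                            # randomly initialized, like cls(bart_config)
source = BartModel.from_pretrained("facebook/bart-base")

# full encoder copy, then the decoder embedding tables
target.encoder.load_state_dict(source.encoder.state_dict())
target.decoder.embed_tokens.load_state_dict(source.decoder.embed_tokens.state_dict())
target.decoder.embed_positions.load_state_dict(source.decoder.embed_positions.state_dict())

# decoder layers copied pairwise
for tgt_layer, src_layer in zip(target.decoder.layers, source.decoder.layers):
    tgt_layer.load_state_dict(src_layer.state_dict())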
Example #5
    def __init__(self, config, project_dim: int = 0):
        BartModel.__init__(self, config)
        assert config.hidden_size > 0, "Encoder hidden_size can't be zero"
        # optional projection of the encoder representation down to project_dim
        self.encode_proj = nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
        self.init_weights()
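A hedged sketch of how such a projection layer is typically applied on top of an encoder output; the first-token pooling and the sizes below are assumptions for illustration, not taken from the snippet:

import torch
from torch import nn

hidden_size, project_dim = 768, 128                  # illustrative sizes
encode_proj = nn.Linear(hidden_size, project_dim)

sequence_output = torch.randn(4, 32, hidden_size)    # stand-in for the encoder output
pooled = sequence_output[:, 0, :]                    # first-token pooling (assumed)
projected = encode_proj(pooled)
print(projected.shape)                               # torch.Size([4, 128])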
Example #6
class BartForTokenClassification(PretrainedBartModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.classification_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classif_dropout,
        )
        self.model._init_weights(self.classification_head.dense)
        self.model._init_weights(self.classification_head.out_proj)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        use_cache=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BartConfig`) and inputs:
            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
                Classification loss (cross entropy)
            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
                Classification (or regression if config.num_labels==1) scores (before SoftMax).
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the
                self-attention
                heads.
        """
        if labels is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
        )
        x = outputs[0]  # last hidden state
        logits = self.classification_head(x)
        # prepend logits; keep hidden states and attentions if they are present
        outputs = (logits,) + outputs[1:]
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.config.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1),
                    torch.tensor(loss_fct.ignore_index).type_as(labels))
                loss = F.cross_entropy(active_logits, active_labels)
            else:
                loss = F.cross_entropy(logits.view(-1, self.config.num_labels),
                                       labels.view(-1))
            outputs = (loss, ) + outputs

        return outputs
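A hedged usage sketch for BartForTokenClassification above, assuming the transformers BART imports used by the snippet are in scope; the checkpoint facebook/bart-base and num_labels=5 are illustrative choices:

from transformers import BartConfig, BartTokenizer

config = BartConfig.from_pretrained("facebook/bart-base", num_labels=5)
model = BartForTokenClassification(config)           # randomly initialized weights
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

inputs = tokenizer("Hugging Face is based in New York City", return_tensors="pt")
outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])

logits = outputs[0]                                  # (batch_size, seq_len, num_labels)
predictions = logits.argmax(-1)                      # one label id per token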
class BartForMultitaskLearning(PretrainedBartModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)

        self.model = BartModel(config)
        self.register_buffer(
            "final_logits_bias",
            torch.zeros((1, self.model.shared.num_embeddings))
        )

        self.num_cfemotions = 12
        self.num_emotions = 6
        self.num_sentiments = 2

        self.cfemotion_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            self.num_cfemotions,
            config.classif_dropout
        )
        self.model._init_weights(self.cfemotion_head.dense)
        self.model._init_weights(self.cfemotion_head.out_proj)

        self.emotion_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            self.num_emotions,
            config.classif_dropout
        )
        self.model._init_weights(self.emotion_head.dense)
        self.model._init_weights(self.emotion_head.out_proj)

        self.sentiment_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            self.num_sentiments,
            config.classif_dropout
        )
        self.model._init_weights(self.sentiment_head.dense)
        self.model._init_weights(self.sentiment_head.out_proj)

    def resize_token_embeddings(self, new_num_tokens):
        old_num_tokens = self.model.shared.num_embeddings
        new_embeddings = super().resize_token_embeddings(new_num_tokens)
        self.model.shared = new_embeddings
        self._resize_final_logits_bias(new_num_tokens, old_num_tokens)
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens, old_num_tokens):
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros(
                (1, new_num_tokens - old_num_tokens),
                device=self.final_logits_bias.device
            )
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        encoder_outputs=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_cached_states=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        task=None,
        **unused
    ):
        if "lm_labels" in unused:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed "
                "in a future version, use `labels` instead.",
                DeprecationWarning
            )
            labels = unused.pop("lm_labels")

        if labels is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            decoder_cached_states=decoder_cached_states,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states
        )

        if task == "response":
            lm_logits = F.linear(
                outputs[0],
                self.model.shared.weight,
                bias=self.final_logits_bias
            )
            outputs = (lm_logits,) + outputs[1:]  # Add cache, hidden states and attention if they are here

            if labels is not None:
                loss_fct = nn.CrossEntropyLoss()
                # TODO(SS): do we need to ignore pad tokens in labels?
                masked_lm_loss = loss_fct(
                    lm_logits.view(-1, self.config.vocab_size),
                    labels.view(-1)
                )
                outputs = (masked_lm_loss,) + outputs

        elif task in ["cfemotion", "emotion", "sentiment"]:
            x = outputs[0]  # last hidden state

            eos_mask = input_ids.eq(self.config.eos_token_id)
            if len(torch.unique(eos_mask.sum(1))) > 1:
                raise ValueError(
                   "All examples must have the same number of <eos> tokens."
                )

            if task == "cfemotion":
                classification_head = self.cfemotion_head
                num_labels = self.num_cfemotions
            elif task == "emotion":
                classification_head = self.emotion_head
                num_labels = self.num_emotions
            else:
                classification_head = self.sentiment_head
                num_labels = self.num_sentiments

            sentence_representation = x[eos_mask, :].view(
                x.size(0),
                -1,
                x.size(-1)
            )[:, -1, :]
            logits = classification_head(sentence_representation)

            # Prepend logits
            outputs = (logits,) + outputs[1:]  # Add hidden states and attention if they are here
            if labels is not None:  # prepend loss to output,
                loss = F.cross_entropy(
                    logits.view(-1, num_labels),
                    labels.view(-1)
                )
                outputs = (loss,) + outputs
        
        else:
            raise ValueError("The dataset contains an invalid task.")

        return outputs

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past,
        attention_mask,
        use_cache,
        task,
        **kwargs
    ):
        assert past is not None, "past has to be defined for encoder_outputs"

        encoder_outputs, decoder_cached_states = past
        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "decoder_cached_states": decoder_cached_states,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
            "task": task
        }

    def adjust_logits_during_generation(self, logits, cur_len, max_length):
        if cur_len == 1:
            self._force_token_ids_generation(logits, self.config.bos_token_id)
        if cur_len == max_length - 1 and self.config.eos_token_id is not None:
            self._force_token_ids_generation(logits, self.config.eos_token_id)
        return logits

    def _force_token_ids_generation(self, scores, token_ids) -> None:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        all_but_token_ids_mask = torch.tensor(
            [x for x in range(self.config.vocab_size) if x not in token_ids],
            dtype=torch.long,
            device=next(self.parameters()).device
        )
        assert len(scores.shape) == 2, \
            "scores should be of rank 2 with shape: [batch_size, vocab_size]"
        scores[:, all_but_token_ids_mask] = -float("inf")

    @staticmethod
    def _reorder_cache(past, beam_idx):
        ((enc_out, enc_mask), decoder_cached_states) = past
        reordered_past = []
        for layer_past in decoder_cached_states:
            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
            layer_past_new = {
                attn_key: _reorder_buffer(attn_cache, beam_idx)
                for attn_key, attn_cache in layer_past.items()
            }
            reordered_past.append(layer_past_new)

        new_enc_out = (
            enc_out if enc_out is None 
            else enc_out.index_select(0, beam_idx)
        )
        new_enc_mask = (
            enc_mask if enc_mask is None
            else enc_mask.index_select(0, beam_idx)
        )

        past = ((new_enc_out, new_enc_mask), reordered_past)
        return past

    def get_encoder(self):
        return self.model.encoder

    def get_output_embeddings(self):
        return _make_linear_from_emb(self.model.shared)  # make it on the fly
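Finally, a hedged usage sketch of the task routing in BartForMultitaskLearning, assuming a transformers version contemporary with this code (around 3.0, where BartModel.forward still accepts decoder_cached_states); the checkpoint and the label value are illustrative:

import torch
from transformers import BartConfig, BartTokenizer

config = BartConfig.from_pretrained("facebook/bart-base")
model = BartForMultitaskLearning(config)             # randomly initialized heads
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

inputs = tokenizer("i am so happy today", return_tensors="pt")

# classification task: with labels, the first two outputs are (loss, logits)
labels = torch.tensor([1])
loss, logits = model(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    labels=labels,
    task="sentiment",
)[:2]

# response task: language-model logits over the vocabulary
lm_logits = model(inputs["input_ids"], attention_mask=inputs["attention_mask"], task="response")[0]
print(logits.shape, lm_logits.shape)                 # (1, 2) and (1, seq_len, vocab_size)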