Example #1
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        # must assign tokenizers before init
        if cfg.language_model.pretrained_model_name:
            if cfg.language_model.pretrained_encoder_model_name or cfg.language_model.pretrained_decoder_model_name:
                raise ValueError(
                    "Must have either pretrained_model_name or both pretrained_encoder_model name and "
                    "pretrained_decoder_model_name."
                )
            # setup tokenizer
            self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            # set decoder to encoder
            self.decoder_tokenizer = self.encoder_tokenizer
            self.decoder_add_special_tokens = self.encoder_add_special_tokens
        else:
            if not (
                cfg.language_model.pretrained_encoder_model_name and cfg.language_model.pretrained_decoder_model_name
            ):
                raise ValueError("Both encoder and decoder must be specified")

            # setup tokenizers
            self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            self.decoder_tokenizer = self.setup_tokenizer(cfg.decoder_tokenizer)
            self.decoder_add_special_tokens = cfg.decoder_tokenizer.add_special_tokens

        if not self.encoder_tokenizer:
            raise TypeError("encoder_tokenizer failed to initialize")
        if not self.decoder_tokenizer:
            raise TypeError("decoder_tokenizer failed to initialize")

        # init superclass
        super().__init__(cfg=cfg, trainer=trainer)

        # must assign modules after init
        if cfg.language_model.pretrained_model_name:
            # Setup end-to-end model
            if "bart" in cfg.language_model.pretrained_model_name:
                self.model = BartForConditionalGeneration.from_pretrained(cfg.language_model.pretrained_model_name)
            else:
                self.model = AutoModel.from_pretrained(cfg.language_model.pretrained_model_name)
        else:
            if not (
                cfg.language_model.pretrained_encoder_model_name and cfg.language_model.pretrained_decoder_model_name
            ):
                raise ValueError("Both encoder and decoder must be specified")

            # Setup encoder/decoder model
            self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder=cfg.language_model.pretrained_encoder_model_name,
                decoder=cfg.language_model.pretrained_decoder_model_name,
            )

        self.validation_perplexity = Perplexity(compute_on_step=False)

        self.setup_optimization(cfg.optim)
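
The constructor above branches entirely on which config keys are set. Below is a minimal, illustrative sketch of a DictConfig it could accept; the key names are taken only from the attribute accesses in the snippet, the values are placeholders, and any extra fields required by setup_tokenizer() are omitted.

# Illustrative config sketch; key names mirror the constructor above, values are placeholders
from omegaconf import OmegaConf

example_cfg = OmegaConf.create({
    "language_model": {
        # either set pretrained_model_name ...
        "pretrained_model_name": "facebook/bart-base",
        # ... or leave it empty and set both of the following instead
        "pretrained_encoder_model_name": None,
        "pretrained_decoder_model_name": None,
    },
    "encoder_tokenizer": {"add_special_tokens": True},
    "decoder_tokenizer": {"add_special_tokens": True},
    "optim": {"name": "adam", "lr": 1e-4},
})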
Example #2
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        if cfg.tokenizer is not None:
            self._setup_tokenizer(cfg.tokenizer)
        else:
            self.tokenizer = None

        super().__init__(cfg=cfg, trainer=trainer)

        self.bert_model = get_lm_model(
            pretrained_model_name=cfg.language_model.pretrained_model_name,
            config_file=cfg.language_model.config_file,
            config_dict=OmegaConf.to_container(cfg.language_model.config)
            if cfg.language_model.config else None,
            checkpoint_file=cfg.language_model.lm_checkpoint,
            vocab_file=cfg.tokenizer.get('vocab_file')
            if cfg.tokenizer is not None else None,
        )

        self.hidden_size = self.bert_model.config.hidden_size
        self.vocab_size = self.bert_model.config.vocab_size
        self.only_mlm_loss = cfg.only_mlm_loss

        self.mlm_classifier = BertPretrainingTokenClassifier(
            hidden_size=self.hidden_size,
            num_classes=self.vocab_size,
            num_layers=cfg.num_tok_classification_layers,
            activation="gelu",
            log_softmax=True,
            use_transformer_init=True,
        )

        self.mlm_loss = SmoothedCrossEntropyLoss()

        if not self.only_mlm_loss:
            self.nsp_classifier = SequenceClassifier(
                hidden_size=self.hidden_size,
                num_classes=2,
                num_layers=cfg.num_seq_classification_layers,
                log_softmax=False,
                activation="tanh",
                use_transformer_init=True,
            )

            self.nsp_loss = CrossEntropyLoss()
            self.agg_loss = AggregatorLoss(num_inputs=2)

        # tie weights of MLM softmax layer and embedding layer of the encoder
        if (self.mlm_classifier.mlp.last_linear_layer.weight.shape !=
                self.bert_model.embeddings.word_embeddings.weight.shape):
            raise ValueError(
                "Shape of the final classification layer does not match the embedding layer; weights cannot be tied.")
        self.mlm_classifier.mlp.last_linear_layer.weight = self.bert_model.embeddings.word_embeddings.weight
        # create extra bias

        # setup to track metrics
        self.validation_perplexity = Perplexity(compute_on_step=False)

        self.setup_optimization(cfg.optim)
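
The shape check before the weight tying in Example #2 holds because nn.Linear(hidden_size, vocab_size) stores its weight as a (vocab_size, hidden_size) matrix, which is exactly the shape of an nn.Embedding(vocab_size, hidden_size) weight. A standalone sketch with plain torch modules (not the NeMo classes above):

import torch.nn as nn

hidden_size, vocab_size = 768, 30522                      # typical BERT-base dimensions
word_embeddings = nn.Embedding(vocab_size, hidden_size)
mlm_head = nn.Linear(hidden_size, vocab_size, bias=False)

# both weights are (vocab_size, hidden_size), so they can share a single Parameter
assert mlm_head.weight.shape == word_embeddings.weight.shape
mlm_head.weight = word_embeddings.weight                  # tied: updates now affect one tensor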
Example #3
def _perplexity_class_test(
    rank: int,
    worldsize: int,
    probs: Optional[torch.Tensor],
    logits: Optional[torch.Tensor],
    dist_sync_on_step: bool,
    metric_args: Optional[dict] = None,
    check_dist_sync_on_step: bool = True,
    check_batch: bool = True,
    atol: float = 1e-8,
):
    """ Utility function doing the actual comparison between lightning class metric
        and reference metric.
        Args:
            rank: rank of current process
            worldsize: number of processes
            probs: torch tensor with probabilities
            logits: torch tensor with logits. The function checks that ``probs`` and ``logits`` are mutually
                exclusive for the ``Perplexity`` metric.
            dist_sync_on_step: bool, if true will synchronize metric state across
                processes at each ``forward()``
            metric_args: dict with additional arguments used for class initialization
            check_dist_sync_on_step: bool, if true will check if the metric is also correctly
                calculated per batch per device (and not just at the end)
            check_batch: bool, if true will check if the metric is also correctly
                calculated across devices for each batch (and not just at the end)
    """
    # Instantiate the Lightning metric
    perplexity = Perplexity(compute_on_step=True,
                            dist_sync_on_step=dist_sync_on_step,
                            **(metric_args or {}))
    if (probs is None) == (logits is None):
        with pytest.raises(ValueError):
            perplexity(probs, logits)
        return

    # verify perplexity works after being loaded from pickled state
    pickled_metric = pickle.dumps(perplexity)
    perplexity = pickle.loads(pickled_metric)

    for i in range(rank, NUM_BATCHES, worldsize):
        batch_result = perplexity(None if probs is None else probs[i],
                                  None if logits is None else logits[i])

        if perplexity.dist_sync_on_step:
            if rank == 0:
                if probs is not None:
                    ddp_probs = torch.stack(
                        [probs[i + r] for r in range(worldsize)])
                else:
                    ddp_logits = torch.stack(
                        [logits[i + r] for r in range(worldsize)])
                    ddp_probs = logits_to_probs(ddp_logits, is_binary=False)
                sk_batch_result = reference_perplexity_func(ddp_probs)
                # assert for dist_sync_on_step
                if check_dist_sync_on_step:
                    assert np.allclose(batch_result.numpy(),
                                       sk_batch_result,
                                       atol=atol)
        else:
            if probs is None:
                p = logits_to_probs(logits[i], is_binary=False)
            else:
                p = probs[i]
            sk_batch_result = reference_perplexity_func(p)
            # assert for batch
            if check_batch:
                assert np.allclose(batch_result.numpy(),
                                   sk_batch_result,
                                   atol=atol)

    assert (probs is None) != (logits is None)
    # check on all batches on all ranks
    result = perplexity.compute()
    assert isinstance(result, torch.Tensor)

    if probs is None:
        probs = logits_to_probs(logits, is_binary=False)
    sk_result = reference_perplexity_func(probs)

    # assert after aggregation
    assert np.allclose(result.numpy(), sk_result, atol=atol)
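
For reference, the call pattern this test exercises reduces to a short sketch, assuming the Perplexity class used throughout these examples is importable from nemo.collections.common.metrics (the import is not shown in the snippets): exactly one of probs or logits is passed per update, and compute() aggregates over all updates seen so far.

import torch
from nemo.collections.common.metrics import Perplexity   # assumed import path

perplexity = Perplexity(compute_on_step=True)
logits = torch.randn(4, 16, 32)                # (batch, seq_len, vocab_size)
batch_ppl = perplexity(None, logits)           # pass probs or logits, never both
total_ppl = perplexity.compute()               # aggregated over every update so far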
Example #4
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
        self.global_rank = 0
        self.world_size = 1
        if trainer is not None:
            self.global_rank = (trainer.node_rank *
                                trainer.num_gpus) + trainer.local_rank
            self.world_size = trainer.num_nodes * trainer.num_gpus

        # shared params for dataset and data loaders
        self.dataset_cfg = cfg.dataset
        self.tokenizer = get_tokenizer(
            tokenizer_name=cfg.language_model.tokenizer,
            vocab_file=cfg.language_model.vocab_file,
            special_tokens=cfg.language_model.special_tokens,
        )

        # make vocabulary size divisible by 8 for fast fp16 training
        vocab_size = 8 * math.ceil(self.tokenizer.vocab_size / 8)

        # init superclass
        super().__init__(cfg=cfg, trainer=trainer)

        self.embedding_layer = TransformerEmbedding(
            vocab_size=vocab_size,
            hidden_size=cfg.language_model.hidden_size,
            max_sequence_length=cfg.language_model.max_seq_length,
            embedding_dropout=cfg.language_model.get("embedding_dropout", 0.0),
            learn_positional_encodings=False,
        )
        self.encoder = TransformerEncoder(
            num_layers=cfg.language_model.num_layers,
            hidden_size=cfg.language_model.hidden_size,
            mask_future=True,
            num_attention_heads=cfg.language_model.num_attn_heads,
            inner_size=cfg.language_model.inner_size,
            ffn_dropout=cfg.language_model.get("ffn_dropout", 0.0),
            hidden_act=cfg.language_model.get("inner_activation", "relu"),
            attn_score_dropout=cfg.language_model.get("attn_score_dropout",
                                                      0.0),
            attn_layer_dropout=cfg.language_model.get("attn_layer_dropout",
                                                      0.0),
        )
        self.log_softmax = TokenClassifier(
            hidden_size=cfg.language_model.hidden_size,
            num_classes=vocab_size,
            log_softmax=True,
        )

        std_init_range = 1 / math.sqrt(cfg.language_model.hidden_size)
        self.apply(
            lambda module: transformer_weights_init(module, std_init_range))

        # tie weights of embedding and softmax matrices
        self.log_softmax.mlp.layer0.weight = self.embedding_layer.token_embedding.weight

        self.training_loss = SmoothedCrossEntropyLoss(
            pad_id=self.tokenizer.pad_id)
        self.validation_loss = SmoothedCrossEntropyLoss(
            pad_id=self.tokenizer.pad_id,
            predict_last_k=self.dataset_cfg.get("predict_last_k", 0),
        )

        self.training_perplexity = Perplexity(dist_sync_on_step=True)
        self.validation_perplexity = Perplexity(compute_on_step=False)

        # Optimizer setup needs to happen after all model weights are ready
        self.setup_optimization(cfg.optim)
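
As in Example #1, this constructor is parameterized entirely through cfg. An illustrative sketch of the fields it reads, with names taken from the accesses above and arbitrary placeholder values; the optional dropout/activation keys read via .get() are omitted, as are the remaining dataset and optim fields.

# Illustrative only; field names mirror the constructor above
from omegaconf import OmegaConf

example_cfg = OmegaConf.create({
    "language_model": {
        "tokenizer": "word",              # placeholder tokenizer name
        "vocab_file": "vocab.txt",
        "special_tokens": None,
        "hidden_size": 512,
        "max_seq_length": 256,
        "num_layers": 6,
        "num_attn_heads": 8,
        "inner_size": 2048,
    },
    "dataset": {"predict_last_k": 0},
    "optim": {"name": "adam", "lr": 1e-3},
})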