def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # must assign tokenizers before init
    if cfg.language_model.pretrained_model_name:
        if cfg.language_model.pretrained_encoder_model_name or cfg.language_model.pretrained_decoder_model_name:
            raise ValueError(
                "Must have either pretrained_model_name or both pretrained_encoder_model_name and "
                "pretrained_decoder_model_name."
            )

        # setup tokenizer
        self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
        self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

        # set decoder to encoder
        self.decoder_tokenizer = self.encoder_tokenizer
        self.decoder_add_special_tokens = self.encoder_add_special_tokens
    else:
        if not (
            cfg.language_model.pretrained_encoder_model_name and cfg.language_model.pretrained_decoder_model_name
        ):
            raise ValueError("Both encoder and decoder must be specified")

        # setup tokenizers
        self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
        self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

        self.decoder_tokenizer = self.setup_tokenizer(cfg.decoder_tokenizer)
        self.decoder_add_special_tokens = cfg.decoder_tokenizer.add_special_tokens

    if not self.encoder_tokenizer:
        raise TypeError("encoder_tokenizer failed to initialize")
    if not self.decoder_tokenizer:
        raise TypeError("decoder_tokenizer failed to initialize")

    # init superclass
    super().__init__(cfg=cfg, trainer=trainer)

    # must assign modules after init
    if cfg.language_model.pretrained_model_name:
        # Setup end-to-end model
        if "bart" in cfg.language_model.pretrained_model_name:
            self.model = BartForConditionalGeneration.from_pretrained(cfg.language_model.pretrained_model_name)
        else:
            self.model = AutoModel.from_pretrained(cfg.language_model.pretrained_model_name)
    else:
        if not (
            cfg.language_model.pretrained_encoder_model_name and cfg.language_model.pretrained_decoder_model_name
        ):
            raise ValueError("Both encoder and decoder must be specified")

        # Setup encoder/decoder model
        self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder=cfg.language_model.pretrained_encoder_model_name,
            decoder=cfg.language_model.pretrained_decoder_model_name,
        )

    self.validation_perplexity = Perplexity(compute_on_step=False)

    self.setup_optimization(cfg.optim)
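# A minimal config sketch (not from the source) showing the shape of `cfg` that the __init__ above
# reads; it is based only on the keys accessed there. The model names, the extra tokenizer fields,
# and the optimizer block below are placeholder assumptions.
from omegaconf import OmegaConf

_example_enc_dec_cfg = OmegaConf.create(
    {
        "language_model": {
            # set pretrained_model_name OR both encoder/decoder names, never both
            "pretrained_model_name": None,
            "pretrained_encoder_model_name": "bert-base-cased",  # placeholder
            "pretrained_decoder_model_name": "bert-base-cased",  # placeholder
        },
        "encoder_tokenizer": {"tokenizer_name": "bert-base-cased", "add_special_tokens": True},  # assumed fields
        "decoder_tokenizer": {"tokenizer_name": "bert-base-cased", "add_special_tokens": True},  # assumed fields
        "optim": {"name": "adam", "lr": 1e-4},  # placeholder optimizer block
    }
)
# model = EncDecLMModel(cfg=_example_enc_dec_cfg, trainer=None)  # hypothetical class name, for illustration only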
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    if cfg.tokenizer is not None:
        self._setup_tokenizer(cfg.tokenizer)
    else:
        self.tokenizer = None

    super().__init__(cfg=cfg, trainer=trainer)

    self.bert_model = get_lm_model(
        pretrained_model_name=cfg.language_model.pretrained_model_name,
        config_file=cfg.language_model.config_file,
        config_dict=OmegaConf.to_container(cfg.language_model.config) if cfg.language_model.config else None,
        checkpoint_file=cfg.language_model.lm_checkpoint,
        vocab_file=cfg.tokenizer.get('vocab_file') if cfg.tokenizer is not None else None,
    )

    self.hidden_size = self.bert_model.config.hidden_size
    self.vocab_size = self.bert_model.config.vocab_size
    self.only_mlm_loss = cfg.only_mlm_loss

    self.mlm_classifier = BertPretrainingTokenClassifier(
        hidden_size=self.hidden_size,
        num_classes=self.vocab_size,
        num_layers=cfg.num_tok_classification_layers,
        activation="gelu",
        log_softmax=True,
        use_transformer_init=True,
    )

    self.mlm_loss = SmoothedCrossEntropyLoss()

    if not self.only_mlm_loss:
        self.nsp_classifier = SequenceClassifier(
            hidden_size=self.hidden_size,
            num_classes=2,
            num_layers=cfg.num_seq_classification_layers,
            log_softmax=False,
            activation="tanh",
            use_transformer_init=True,
        )
        self.nsp_loss = CrossEntropyLoss()
        self.agg_loss = AggregatorLoss(num_inputs=2)

    # tie weights of MLM softmax layer and embedding layer of the encoder
    if (
        self.mlm_classifier.mlp.last_linear_layer.weight.shape
        != self.bert_model.embeddings.word_embeddings.weight.shape
    ):
        raise ValueError("Final classification layer does not match embedding layer.")
    self.mlm_classifier.mlp.last_linear_layer.weight = self.bert_model.embeddings.word_embeddings.weight

    # setup to track metrics
    self.validation_perplexity = Perplexity(compute_on_step=False)

    self.setup_optimization(cfg.optim)
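# Illustration only (assumption, not from the source): the assignment above ties weights by making
# the output projection and the input embedding share a single Parameter object, so both forward
# paths read, and both backward paths accumulate gradients into, the same tensor.
import torch.nn as nn

_embedding = nn.Embedding(num_embeddings=30522, embedding_dim=768)  # placeholder BERT-like sizes
_output_proj = nn.Linear(in_features=768, out_features=30522)  # weight shape (30522, 768) matches the embedding
_output_proj.weight = _embedding.weight
assert _output_proj.weight is _embedding.weight  # one shared parameter, as in the tying above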
def _perplexity_class_test(
    rank: int,
    worldsize: int,
    probs: Optional[torch.Tensor],
    logits: Optional[torch.Tensor],
    dist_sync_on_step: bool,
    metric_args: dict = {},
    check_dist_sync_on_step: bool = True,
    check_batch: bool = True,
    atol: float = 1e-8,
):
    """
    Utility function doing the actual comparison between the lightning class metric and the reference metric.

    Args:
        rank: rank of current process
        worldsize: number of processes
        probs: torch tensor with probabilities
        logits: torch tensor with logits. The function checks that ``probs`` and ``logits`` are mutually
            exclusive for the ``Perplexity`` metric.
        dist_sync_on_step: bool, if true will synchronize metric state across processes at each ``forward()``
        metric_args: dict with additional arguments used for class initialization
        check_dist_sync_on_step: bool, if true will check if the metric is also correctly calculated per batch
            per device (and not just at the end)
        check_batch: bool, if true will check if the metric is also correctly calculated across devices for
            each batch (and not just at the end)
        atol: absolute tolerance used when comparing against the reference result
    """
    # Instantiate lightning metric
    perplexity = Perplexity(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args)
    if (probs is None) == (logits is None):
        with pytest.raises(ValueError):
            perplexity(probs, logits)
        return

    # verify perplexity works after being loaded from pickled state
    pickled_metric = pickle.dumps(perplexity)
    perplexity = pickle.loads(pickled_metric)

    for i in range(rank, NUM_BATCHES, worldsize):
        batch_result = perplexity(None if probs is None else probs[i], None if logits is None else logits[i])

        if perplexity.dist_sync_on_step:
            if rank == 0:
                if probs is not None:
                    ddp_probs = torch.stack([probs[i + r] for r in range(worldsize)])
                else:
                    ddp_logits = torch.stack([logits[i + r] for r in range(worldsize)])
                    ddp_probs = logits_to_probs(ddp_logits, is_binary=False)
                sk_batch_result = reference_perplexity_func(ddp_probs)
                # assert for dist_sync_on_step
                if check_dist_sync_on_step:
                    assert np.allclose(batch_result.numpy(), sk_batch_result, atol=atol)
        else:
            if probs is None:
                p = logits_to_probs(logits[i], is_binary=False)
            else:
                p = probs[i]
            sk_batch_result = reference_perplexity_func(p)
            # assert for batch
            if check_batch:
                assert np.allclose(batch_result.numpy(), sk_batch_result, atol=atol)

    assert (probs is None) != (logits is None)

    # check on all batches on all ranks
    result = perplexity.compute()
    assert isinstance(result, torch.Tensor)

    if probs is None:
        probs = logits_to_probs(logits, is_binary=False)
    sk_result = reference_perplexity_func(probs)

    # assert after aggregation
    assert np.allclose(result.numpy(), sk_result, atol=atol)
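# Hedged usage sketch (not from the source): driving the comparison above in a single process with
# random logits. NUM_BATCHES comes from this test module; the batch size and class count below, and
# the idea of calling the helper directly like this, are assumptions for illustration.
import torch


def _example_single_process_run():
    example_logits = torch.randn(NUM_BATCHES, 16, 100)  # (batches, batch size, classes); sizes are placeholders
    _perplexity_class_test(
        rank=0,
        worldsize=1,
        probs=None,
        logits=example_logits,
        dist_sync_on_step=False,
    )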
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable
    self.global_rank = 0
    self.world_size = 1
    if trainer is not None:
        self.global_rank = (trainer.node_rank * trainer.num_gpus) + trainer.local_rank
        self.world_size = trainer.num_nodes * trainer.num_gpus

    # shared params for dataset and data loaders
    self.dataset_cfg = cfg.dataset
    self.tokenizer = get_tokenizer(
        tokenizer_name=cfg.language_model.tokenizer,
        vocab_file=cfg.language_model.vocab_file,
        special_tokens=cfg.language_model.special_tokens,
    )

    # make vocabulary size divisible by 8 for fast fp16 training
    vocab_size = 8 * math.ceil(self.tokenizer.vocab_size / 8)

    # init superclass
    super().__init__(cfg=cfg, trainer=trainer)

    self.embedding_layer = TransformerEmbedding(
        vocab_size=vocab_size,
        hidden_size=cfg.language_model.hidden_size,
        max_sequence_length=cfg.language_model.max_seq_length,
        embedding_dropout=cfg.language_model.get("embedding_dropout", 0.0),
        learn_positional_encodings=False,
    )
    self.encoder = TransformerEncoder(
        num_layers=cfg.language_model.num_layers,
        hidden_size=cfg.language_model.hidden_size,
        mask_future=True,
        num_attention_heads=cfg.language_model.num_attn_heads,
        inner_size=cfg.language_model.inner_size,
        ffn_dropout=cfg.language_model.get("ffn_dropout", 0.0),
        hidden_act=cfg.language_model.get("inner_activation", "relu"),
        attn_score_dropout=cfg.language_model.get("attn_score_dropout", 0.0),
        attn_layer_dropout=cfg.language_model.get("attn_layer_dropout", 0.0),
    )
    self.log_softmax = TokenClassifier(
        hidden_size=cfg.language_model.hidden_size,
        num_classes=vocab_size,
        log_softmax=True,
    )

    std_init_range = 1 / math.sqrt(cfg.language_model.hidden_size)
    self.apply(lambda module: transformer_weights_init(module, std_init_range))

    # tie weights of embedding and softmax matrices
    self.log_softmax.mlp.layer0.weight = self.embedding_layer.token_embedding.weight

    self.training_loss = SmoothedCrossEntropyLoss(pad_id=self.tokenizer.pad_id)
    self.validation_loss = SmoothedCrossEntropyLoss(
        pad_id=self.tokenizer.pad_id,
        predict_last_k=self.dataset_cfg.get("predict_last_k", 0),
    )

    self.training_perplexity = Perplexity(dist_sync_on_step=True)
    self.validation_perplexity = Perplexity(compute_on_step=False)

    # Optimizer setup needs to happen after all model weights are ready
    self.setup_optimization(cfg.optim)
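# Worked example (assumption, not from the source): the rounding above pads the vocabulary up to
# the next multiple of 8, e.g. a BERT-sized vocab of 30522 tokens becomes 30528, which keeps the
# embedding and softmax GEMM dimensions tensor-core friendly for fp16 training.
import math

assert 8 * math.ceil(30522 / 8) == 30528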