def __init__(self, args: argparse.Namespace):
    """Initialize a model, tokenizer and config."""
    super().__init__()
    self.args = args
    if isinstance(args, argparse.Namespace):
        self.save_hyperparameters(args)
    # The tokenizer and config are read from the checkpoint folder; the model
    # itself is freshly initialized from the config rather than from the
    # pretrained weights.
    self.tokenizer = RobertaTokenizer.from_pretrained(self.args.roberta_path)
    self.robert_config = RobertaConfig.from_pretrained(
        self.args.roberta_path, output_hidden_states=False)
    self.model = RobertaForMaskedLM(self.robert_config)
    self.loss_fn = CrossEntropyLoss(reduction="none")
    self.acc = MaskedAccuracy(num_classes=self.tokenizer.vocab_size)
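# A minimal usage sketch. The class name `MLMPretrainModel` is illustrative;
# this __init__ belongs to a pytorch_lightning.LightningModule subclass and
# `args` only needs a `roberta_path` pointing at a RoBERTa checkpoint folder.
#
#     args = argparse.Namespace(roberta_path="checkpoints/roberta-base")
#     module = MLMPretrainModel(args)
#     ids = module.tokenizer("The capital of France is <mask>.",
#                            return_tensors="pt")["input_ids"]
#     logits = module.model(ids)[0]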
def convert_pytorch_to_roberta_checkpoint(pytorch_checkpoint_path: str,
                                          roberta_dump_folder_path: str):
    """
    Copy/paste/tweak a Hugging Face RoBERTa checkpoint's weights into the
    fairseq RoBERTa structure and save the result as a fairseq checkpoint.
    """
    from argparse import Namespace

    model = RobertaForMaskedLM.from_pretrained(pytorch_checkpoint_path)
    config = RobertaConfig.from_pretrained(pytorch_checkpoint_path)
    huggingface_train_args = Namespace(
        **vars(torch.load(f"{pytorch_checkpoint_path}/training_args.bin")))
    model.eval()  # disable dropout

    # Start from a pretrained fairseq RoBERTa of matching depth so the
    # architecture and most of its settings can be reused.
    if config.num_hidden_layers == 12:
        roberta = FairseqRobertaModel.from_pretrained("roberta.base")
    elif config.num_hidden_layers == 24:
        roberta = FairseqRobertaModel.from_pretrained("roberta.large")
    else:
        raise Exception("Only roberta LM is supported!")
    roberta.eval()

    # Update the fairseq args from the Hugging Face training args and config,
    # reusing the remaining settings from the fairseq pretrained model.
    roberta.args.warmup_updates = huggingface_train_args.warmup_steps
    roberta.args.weight_decay = huggingface_train_args.weight_decay
    roberta.args.adam_eps = huggingface_train_args.adam_epsilon
    roberta.args.clip_norm = huggingface_train_args.max_grad_norm
    roberta.args.max_update = huggingface_train_args.max_steps
    roberta.args.total_num_update = huggingface_train_args.max_steps
    roberta.args.save_interval_updates = huggingface_train_args.save_steps
    roberta.args.attention_dropout = config.attention_probs_dropout_prob
    roberta.args.encoder_embed_dim = config.hidden_size
    roberta.args.encoder_ffn_embed_dim = config.intermediate_size
    roberta.args.activation_fn = config.hidden_act
    roberta.args.activation_dropout = config.hidden_dropout_prob
    roberta.args.encoder_layers = config.num_hidden_layers
    roberta.args.encoder_attention_heads = config.num_attention_heads
    roberta.args.__dict__.update(huggingface_train_args.__dict__)

    # Embeddings
    sent_encoder = roberta.model.decoder.sentence_encoder
    sent_encoder.embed_tokens.weight = model.roberta.embeddings.word_embeddings.weight
    sent_encoder.embed_positions.weight = model.roberta.embeddings.position_embeddings.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out because RoBERTa doesn't use them.
    sent_encoder.emb_layer_norm.weight = model.roberta.embeddings.LayerNorm.weight
    sent_encoder.emb_layer_norm.bias = model.roberta.embeddings.LayerNorm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        fairseq_layer = sent_encoder.layers[i]  # TransformerSentenceEncoderLayer

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (
            fairseq_layer.self_attn.k_proj.weight.data.shape ==
            fairseq_layer.self_attn.q_proj.weight.data.shape ==
            fairseq_layer.self_attn.v_proj.weight.data.shape ==
            torch.Size((config.hidden_size, config.hidden_size))
        )
        fairseq_layer.self_attn.q_proj.weight = self_attn.query.weight
        fairseq_layer.self_attn.q_proj.bias = self_attn.query.bias
        fairseq_layer.self_attn.k_proj.weight = self_attn.key.weight
        fairseq_layer.self_attn.k_proj.bias = self_attn.key.bias
        fairseq_layer.self_attn.v_proj.weight = self_attn.value.weight
        fairseq_layer.self_attn.v_proj.bias = self_attn.value.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == fairseq_layer.self_attn.out_proj.weight.shape
        fairseq_layer.self_attn.out_proj.weight = self_output.dense.weight
        fairseq_layer.self_attn.out_proj.bias = self_output.dense.bias
        fairseq_layer.self_attn_layer_norm.weight = self_output.LayerNorm.weight
        fairseq_layer.self_attn_layer_norm.bias = self_output.LayerNorm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == fairseq_layer.fc1.weight.shape
        fairseq_layer.fc1.weight = intermediate.dense.weight
        fairseq_layer.fc1.bias = intermediate.dense.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == fairseq_layer.fc2.weight.shape
        fairseq_layer.fc2.weight = bert_output.dense.weight
        fairseq_layer.fc2.bias = bert_output.dense.bias
        fairseq_layer.final_layer_norm.weight = bert_output.LayerNorm.weight
        fairseq_layer.final_layer_norm.bias = bert_output.LayerNorm.bias
        # end of layer

    # LM head
    roberta.model.decoder.lm_head.dense.weight = model.lm_head.dense.weight
    roberta.model.decoder.lm_head.dense.bias = model.lm_head.dense.bias
    roberta.model.decoder.lm_head.layer_norm.weight = model.lm_head.layer_norm.weight
    roberta.model.decoder.lm_head.layer_norm.bias = model.lm_head.layer_norm.bias
    roberta.model.decoder.lm_head.weight = model.lm_head.decoder.weight
    roberta.model.decoder.lm_head.bias = model.lm_head.decoder.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
    their_output = model(input_ids)[0]
    our_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    copy_success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if copy_success else "💩")
    if not copy_success:
        raise Exception("Something went wRoNg")

    pathlib.Path(roberta_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {roberta_dump_folder_path}")
    from fairseq import checkpoint_utils
    state_dict = {
        "args": roberta.args,
        "model": roberta.model.state_dict(),
        # The last two entries were copied from the fairseq pretrained
        # checkpoint just so that .from_pretrained() works on the saved file.
        "extra_state": {
            "train_iterator": {
                "epoch": 0
            },
            "val_loss": 1.4955725940408326
        },
        "optimizer_history": [{
            "criterion_name": "MaskedLmLoss",
            "optimizer_name": "MemoryEfficientFP16Optimizer",
            "lr_scheduler_state": {
                "best": 1.495530066777925
            },
            "num_updates": 500000
        }]
    }
    checkpoint_utils.torch_persistent_save(
        state_dict, f"{roberta_dump_folder_path}/model.pt")

    # Reload the saved checkpoint and make sure nothing was lost on the way.
    loaded_model = FairseqRobertaModel.from_pretrained(roberta_dump_folder_path)
    loaded_model.eval()

    del state_dict
    copied_dict = roberta.state_dict()
    loaded_dict = loaded_model.state_dict()
    assert loaded_dict.keys() == copied_dict.keys()
    for k in copied_dict.keys():
        if not torch.allclose(loaded_dict[k], copied_dict[k], atol=1e-3):
            print(k)

    loaded_output = loaded_model.model(input_ids)[0]
    save_success = torch.allclose(our_output, loaded_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if save_success else "💩")
    if not save_success:
        raise Exception("Something went wRoNg")
    print("Done")
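# A minimal CLI sketch for the converter above. The flag names are
# illustrative and not part of the original script; adjust them as needed.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert a Hugging Face RoBERTa checkpoint to fairseq format.")
    parser.add_argument("--pytorch_checkpoint_path", required=True,
                        help="Folder containing config.json, pytorch_model.bin "
                             "and training_args.bin.")
    parser.add_argument("--roberta_dump_folder_path", required=True,
                        help="Output folder; the converted model is written to model.pt.")
    cli_args = parser.parse_args()
    convert_pytorch_to_roberta_checkpoint(cli_args.pytorch_checkpoint_path,
                                          cli_args.roberta_dump_folder_path)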
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             predictions_file=None,
             layer_freeze_regexes: List[str] = None,
             probe_type: str = None,
             loss_on_all_vocab: bool = False,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    self._loss_on_all_vocab = loss_on_all_vocab

    self._predictions_file = predictions_file
    # TODO move to predict
    if predictions_file is not None and os.path.isfile(predictions_file):
        os.remove(predictions_file)

    self._pretrained_model = pretrained_model
    if 'roberta' in pretrained_model:
        self._padding_value = 1  # The index of the RoBERTa padding token
        if loss_on_all_vocab:
            self._transformer_model = RobertaForMaskedLM.from_pretrained(pretrained_model)
        else:
            self._transformer_model = RobertaForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
    elif 'xlnet' in pretrained_model:
        self._padding_value = 5  # The index of the XLNet padding token
        self._transformer_model = XLNetLMHeadModel.from_pretrained(pretrained_model)
    elif 'albert' in pretrained_model:
        if loss_on_all_vocab:
            self._transformer_model = AlbertForMaskedLM.from_pretrained(pretrained_model)
        else:
            self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
    elif 'bert' in pretrained_model:
        if loss_on_all_vocab:
            self._transformer_model = BertForMaskedLM.from_pretrained(pretrained_model)
        else:
            self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
    else:
        raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

    # 'MLP' freezes the transformer body so only the LM head is trained;
    # 'linear' also freezes the head's dense and LayerNorm layers.
    if probe_type == 'MLP':
        layer_freeze_regexes = ["embeddings", "encoder", "pooler"]
    elif probe_type == 'linear':
        layer_freeze_regexes = [
            "embeddings", "encoder", "pooler", "dense", "LayerNorm", "layer_norm"
        ]

    for name, param in self._transformer_model.named_parameters():
        if layer_freeze_regexes and requires_grad:
            grad = not any(bool(re.search(r, name)) for r in layer_freeze_regexes)
        else:
            grad = requires_grad
        param.requires_grad = grad

    # Make sure the decoder gradients are on.
    if 'roberta' in pretrained_model:
        self._transformer_model.lm_head.decoder.weight.requires_grad = True
        self._transformer_model.lm_head.bias.requires_grad = True
    elif 'albert' in pretrained_model:
        pass
    elif 'bert' in pretrained_model:
        self._transformer_model.cls.predictions.decoder.weight.requires_grad = True
        self._transformer_model.cls.predictions.bias.requires_grad = True

    transformer_config = self._transformer_model.config
    transformer_config.num_labels = 1
    self._output_dim = self._transformer_model.config.hidden_size

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    self._debug = 2
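# A minimal construction sketch. The class name `TransformerMaskedLMProbe`
# is illustrative; this __init__ belongs to an AllenNLP Model subclass.
#
#     from allennlp.data import Vocabulary
#     model = TransformerMaskedLMProbe(vocab=Vocabulary(),
#                                      pretrained_model="roberta-large",
#                                      probe_type="linear",
#                                      loss_on_all_vocab=False)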
def convert_roberta_checkpoint_to_pytorch(fairseq_default_path, hf_input_path):
    """
    Copy/paste/tweak a Hugging Face RoBERTa checkpoint's weights into the
    fairseq RoBERTa structure and save it as a fairseq checkpoint.
    """
    roberta_hf = RobertaForMaskedLM.from_pretrained(hf_input_path)
    roberta_fairseq = FairseqRobertaModel.from_pretrained(fairseq_default_path)

    # Now let's copy all the weights.
    # Embeddings
    roberta_hf_sent_encoder = roberta_hf.roberta.embeddings
    roberta_fairseq.model.decoder.sentence_encoder.embed_tokens.weight = roberta_hf_sent_encoder.word_embeddings.weight
    # fairseq RoBERTa doesn't use `token_type_embeddings`, so as a workaround
    # fold them into the `position_embeddings`.
    roberta_fairseq.model.decoder.sentence_encoder.embed_positions.weight.data = (
        roberta_hf_sent_encoder.position_embeddings.weight.data
        + roberta_hf_sent_encoder.token_type_embeddings.weight.data)
    roberta_fairseq.model.decoder.sentence_encoder.emb_layer_norm.weight = roberta_hf_sent_encoder.LayerNorm.weight
    roberta_fairseq.model.decoder.sentence_encoder.emb_layer_norm.bias = roberta_hf_sent_encoder.LayerNorm.bias

    for i in range(len(roberta_hf.roberta.encoder.layer)):
        # Encoder: start of layer
        roberta_hf_layer: BertLayer = roberta_hf.roberta.encoder.layer[i]
        roberta_fairseq_layer: TransformerSentenceEncoderLayer = roberta_fairseq.model.decoder.sentence_encoder.layers[i]
        # roberta_fairseq_layer.self_attn.enable_torch_version = False

        # self attention
        hf_self_attn: BertSelfAttention = roberta_hf_layer.attention.self
        fairseq_self_attn = roberta_fairseq_layer.self_attn
        fairseq_self_attn.q_proj.weight = hf_self_attn.query.weight
        fairseq_self_attn.q_proj.bias = hf_self_attn.query.bias
        fairseq_self_attn.k_proj.weight = hf_self_attn.key.weight
        fairseq_self_attn.k_proj.bias = hf_self_attn.key.bias
        fairseq_self_attn.v_proj.weight = hf_self_attn.value.weight
        fairseq_self_attn.v_proj.bias = hf_self_attn.value.bias

        # self-attention output
        hf_self_output: BertSelfOutput = roberta_hf_layer.attention.output
        assert hf_self_output.dense.weight.shape == roberta_fairseq_layer.self_attn.out_proj.weight.shape
        roberta_fairseq_layer.self_attn.out_proj.weight = hf_self_output.dense.weight
        roberta_fairseq_layer.self_attn.out_proj.bias = hf_self_output.dense.bias
        roberta_fairseq_layer.self_attn_layer_norm.weight = hf_self_output.LayerNorm.weight
        roberta_fairseq_layer.self_attn_layer_norm.bias = hf_self_output.LayerNorm.bias

        # intermediate
        hf_intermediate: BertIntermediate = roberta_hf_layer.intermediate
        assert hf_intermediate.dense.weight.shape == roberta_fairseq_layer.fc1.weight.shape
        roberta_fairseq_layer.fc1.weight = hf_intermediate.dense.weight
        roberta_fairseq_layer.fc1.bias = hf_intermediate.dense.bias

        # output
        hf_bert_output: BertOutput = roberta_hf_layer.output
        assert hf_bert_output.dense.weight.shape == roberta_fairseq_layer.fc2.weight.shape
        roberta_fairseq_layer.fc2.weight = hf_bert_output.dense.weight
        roberta_fairseq_layer.fc2.bias = hf_bert_output.dense.bias
        roberta_fairseq_layer.final_layer_norm.weight = hf_bert_output.LayerNorm.weight
        roberta_fairseq_layer.final_layer_norm.bias = hf_bert_output.LayerNorm.bias
        # end of layer

    # LM head
    roberta_fairseq.model.decoder.lm_head.dense.weight = roberta_hf.lm_head.dense.weight
    roberta_fairseq.model.decoder.lm_head.dense.bias = roberta_hf.lm_head.dense.bias
    roberta_fairseq.model.decoder.lm_head.layer_norm.weight = roberta_hf.lm_head.layer_norm.weight
    roberta_fairseq.model.decoder.lm_head.layer_norm.bias = roberta_hf.lm_head.layer_norm.bias
    roberta_fairseq.model.decoder.lm_head.weight = roberta_hf.lm_head.decoder.weight
    roberta_fairseq.model.decoder.lm_head.bias = roberta_hf.lm_head.bias

    # Let's check that we get the same results.
    roberta_hf.eval()  # disable dropout
    roberta_fairseq.eval()  # disable dropout
    input_ids: torch.Tensor = roberta_fairseq.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
    our_output = roberta_hf(input_ids)[0]
    their_output = roberta_fairseq.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    # Reuse the original fairseq checkpoint as a template and swap in the copied weights.
    with open(f'{fairseq_default_path}/model.pt', 'rb') as f:
        roberta_fairseq_checkpoint = torch.load(f)
    roberta_fairseq_checkpoint['model'] = roberta_fairseq.model.state_dict()

    fairseq_output_checkpoint_path = f'{hf_input_path}/fairseq.pt'
    print(f"Saving model to {fairseq_output_checkpoint_path}")
    with open(fairseq_output_checkpoint_path, 'wb') as f:
        torch.save(roberta_fairseq_checkpoint, f)
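# A minimal usage sketch (paths are illustrative):
#
#     convert_roberta_checkpoint_to_pytorch(
#         fairseq_default_path="checkpoints/roberta.base",
#         hf_input_path="checkpoints/hf-roberta-finetuned")
#
# Note that despite its name, this function copies weights from the Hugging
# Face layout into the fairseq layout, writing fairseq.pt next to the Hugging
# Face checkpoint; the reverse direction is handled by
# convert_pytorch_to_roberta_checkpoint above.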