def save_best_averaged_checkpoint(self, args, trainer, extra_state: Dict[str, Any]):
    """
    save() should always be called before calling this function - to ensure that
    extra_state and self._averaged_params have been updated correctly.
    """
    best_averaged_checkpoint_filename = os.path.join(
        args.save_dir, constants.AVERAGED_CHECKPOINT_BEST_FILENAME
    )
    self.log_if_verbose(
        f"| Preparing to save new best averaged checkpoint to "
        f"{best_averaged_checkpoint_filename}."
    )
    state_dict = trainer.state_dict()
    state_dict["args"] = args
    state_dict["cfg"] = None
    state_dict["model"] = self._averaged_params
    state_dict["extra_state"].update(extra_state)
    state_dict = fairseq_utils.move_to_cpu(state_dict)
    checkpoint_utils.torch_persistent_save(
        obj=state_dict,
        filename=best_averaged_checkpoint_filename,
    )
    self.log_if_verbose(
        f"| Finished saving new best averaged checkpoint to "
        f"{best_averaged_checkpoint_filename}."
    )
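The checkpoint written above is an ordinary torch-serialized dict, so it can be inspected directly. A minimal sketch, with a placeholder path standing in for os.path.join(args.save_dir, constants.AVERAGED_CHECKPOINT_BEST_FILENAME):

import torch

# Placeholder path for the averaged-best checkpoint saved above.
state = torch.load("checkpoints/averaged_checkpoint_best.pt", map_location="cpu")
print(sorted(state.keys()))                  # e.g. args, cfg, extra_state, model, ...
print(len(state["model"]), "averaged parameter tensors")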
def test_torch_persistent_save_async(self):
    state_dict = {}
    filename = "async_checkpoint.pt"

    with patch(f"{checkpoint_utils.__name__}.PathManager.opena") as mock_opena:
        with patch(f"{checkpoint_utils.__name__}._torch_persistent_save") as mock_save:
            checkpoint_utils.torch_persistent_save(
                state_dict, filename, async_write=True
            )
            mock_opena.assert_called_with(filename, "wb")
            mock_save.assert_called()
def test_torch_persistent_save_async(self):
    cfg = OmegaConf.create()
    cfg.dataset = OmegaConf.create()
    cfg.dataset.write_checkpoints_asynchronously = True
    state_dict = {}
    filename = "async_checkpoint.pt"

    with patch(f"{checkpoint_utils.__name__}.PathManager.opena") as mock_opena:
        with patch(f"{checkpoint_utils.__name__}._torch_persistent_save") as mock_save:
            checkpoint_utils.torch_persistent_save(cfg.dataset, state_dict, filename)
            mock_opena.assert_called_with(filename, "wb")
            mock_save.assert_called()
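Outside the mocked tests, torch_persistent_save simply serializes a state dict to disk. Its signature differs between fairseq versions, as the two tests show; the sketch below assumes the (obj, filename, async_write=...) variant from the first test, and the state dict and filename are toy placeholders:

import torch
from fairseq import checkpoint_utils

state_dict = {"model": {"weight": torch.zeros(2, 2)}, "extra_state": {}}

# Synchronous write; with this signature, async_write=True would instead hand
# the write to PathManager.opena, which is what the mocked test verifies.
checkpoint_utils.torch_persistent_save(state_dict, "toy_checkpoint.pt")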
def split_create(
    model,
    source_path="/home/wannabe/Documents/ufal-transformer-big/transformer_checkpoints/checkpoint_last.pt",
    target_path="/home/wannabe/Documents/ufal-transformer-big/encoder_checkpoints/checkpoint_last.pt",
):
    """
    Extract the encoder weights of a trained translation checkpoint and save
    them as a standalone checkpoint.

    Args:
        model: Model whose encoder layer-norm and output-layer params are
            appended to the extracted encoder weights.
        source_path: Path to the fairseq Transformer checkpoint trained on the
            translation task.
        target_path: Path where the encoder-only checkpoint will be stored.
    """
    # If the target file already exists, there is nothing to do.
    if os.path.isfile(target_path):
        return

    extended_list = []
    for key in model.state_dict().keys():
        if key.startswith('encoder.layer_norm') or key.startswith('out_layer'):
            extended_list.append((key, model.state_dict()[key]))

    translation_state = checkpoint_utils.load_checkpoint_to_cpu(source_path)

    # filtered_state holds the encoder parts of the translation model.
    filtered_state = []
    for key in translation_state['model'].keys():
        if key.startswith('encoder'):
            filtered_state.append((key, translation_state['model'][key]))
    filtered_state.extend(extended_list)

    list_encoder_state_dict = OrderedDict(filtered_state)
    translation_state['model'] = list_encoder_state_dict

    # Save the encoder weights of the translation model at the target path.
    checkpoint_utils.torch_persistent_save(translation_state, target_path)
    return
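After split_create runs, the saved checkpoint should contain only encoder parameters plus the appended layer-norm/output-layer tensors. A minimal sketch of that check, reusing the default target_path above:

import torch

# Every remaining key should belong to the encoder or to the appended output layer.
encoder_state = torch.load(
    "/home/wannabe/Documents/ufal-transformer-big/encoder_checkpoints/checkpoint_last.pt",
    map_location="cpu",
)
assert all(
    k.startswith("encoder") or k.startswith("out_layer")
    for k in encoder_state["model"]
)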
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", type=str, required=True,
                        help="Wav2Vec checkpoint to be prepared")

    args = parser.parse_args()

    ckpt = load_checkpoint_to_cpu(args.checkpoint)
    ckpt['args'] = None
    ckpt['cfg'] = ckpt['cfg']['model']['w2v_args']

    for key in list(ckpt['model'].keys()):
        w = ckpt['model'].pop(key)
        if key.startswith('w2v_encoder.w2v_model.'):
            new_key = key.replace('w2v_encoder.w2v_model.', '')
            ckpt['model'][new_key] = w

    # These are Wav2Vec2 parameters which will be removed in Wav2VecEncoder
    if 'small' in args.checkpoint.split('/')[-1]:
        ckpt['model']['quantizer.vars'] = torch.randn(1, 640, 128)
        ckpt['model']['quantizer.weight_proj.weight'] = torch.randn(640, 512)
        ckpt['model']['quantizer.weight_proj.bias'] = torch.randn(640)
        ckpt['model']['project_q.weight'] = torch.randn(256, 256)
        ckpt['model']['project_q.bias'] = torch.randn(256)
        ckpt['model']['final_proj.weight'] = torch.randn(256, 768)
        ckpt['model']['final_proj.bias'] = torch.randn(256)
    else:
        ckpt['model']['quantizer.vars'] = torch.randn(1, 640, 384)
        ckpt['model']['quantizer.weight_proj.weight'] = torch.randn(640, 512)
        ckpt['model']['quantizer.weight_proj.bias'] = torch.randn(640)
        ckpt['model']['project_q.weight'] = torch.randn(768, 768)
        ckpt['model']['project_q.bias'] = torch.randn(768)
        ckpt['model']['final_proj.weight'] = torch.randn(768, 1024)
        ckpt['model']['final_proj.bias'] = torch.randn(768)

    torch_persistent_save(ckpt, args.checkpoint)
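Because this script overwrites the checkpoint in place, a quick sanity check after running it is to confirm that the w2v_encoder.w2v_model. prefix was stripped and that the placeholder quantizer/projection tensors were added. A minimal sketch, with a placeholder path:

import torch

# Placeholder path; use the same --checkpoint value the script was run with.
ckpt = torch.load("wav2vec_small_finetuned.pt", map_location="cpu")
assert not any(k.startswith("w2v_encoder.w2v_model.") for k in ckpt["model"])
assert "quantizer.vars" in ckpt["model"] and "final_proj.weight" in ckpt["model"]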
def init_tmodel(source_path, target_path, modified_path):
    """
    Args:
        source_path: Path to the checkpoint whose encoder params will be copied
            into the translation model.
        target_path: Path to the fairseq Transformer checkpoint that has been
            trained on the translation task.
        modified_path: Path where the resulting model will be stored.
    """
    encoder_state = checkpoint_utils.load_checkpoint_to_cpu(source_path)
    translation_state = checkpoint_utils.load_checkpoint_to_cpu(target_path)

    filtered_state = []
    for key in encoder_state['model'].keys():
        filtered_state.append((key, encoder_state['model'][key]))

    # Remove the linear and layer-norm layers to maintain compatibility.
    filtered_state.pop()
    filtered_state.pop()
    filtered_state.pop()
    filtered_state.pop()

    list_translation_state = []
    for key in translation_state['model'].keys():
        list_translation_state.append((key, translation_state['model'][key]))

    for index, key in enumerate(list_translation_state):
        if key[0].startswith('encoder'):
            list_translation_state[index] = filtered_state[index]

    list_translation_state_dict = OrderedDict(list_translation_state)
    translation_state['model'] = list_translation_state_dict

    checkpoint_utils.torch_persistent_save(translation_state, modified_path)
    return
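For context, a hypothetical call might look like the following; the three paths are placeholders, not taken from the original project:

# Copy the encoder params from an encoder-only checkpoint into a trained
# translation checkpoint and write the result to a new location.
init_tmodel(
    source_path="checkpoints/encoder/checkpoint_last.pt",
    target_path="checkpoints/translation/checkpoint_last.pt",
    modified_path="checkpoints/translation/checkpoint_init.pt",
)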
def convert_pytorch_to_roberta_checkpoint(pytorch_checkpoint_path: str,
                                          roberta_dump_folder_path: str):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    model = RobertaForMaskedLM.from_pretrained(pytorch_checkpoint_path)
    config = RobertaConfig.from_pretrained(pytorch_checkpoint_path)

    from argparse import Namespace
    huggingface_train_args = Namespace(
        **vars(torch.load(f"{pytorch_checkpoint_path}/training_args.bin")))
    model.eval()  # disable dropout
    # tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint_path)

    if config.num_hidden_layers == 12:
        roberta = FairseqRobertaModel.from_pretrained("roberta.base")
    elif config.num_hidden_layers == 24:
        roberta = FairseqRobertaModel.from_pretrained("roberta.large")
    else:
        raise Exception("Only roberta LM is supported!")
    roberta.eval()
    # roberta_sent_encoder = roberta.model.decoder.sentence_encoder

    # Update config from huggingface and reuse lots of settings from the fairseq pretrained model.
    roberta.args.warmup_updates = huggingface_train_args.warmup_steps
    roberta.args.weight_decay = huggingface_train_args.weight_decay
    roberta.args.adam_eps = huggingface_train_args.adam_epsilon
    roberta.args.clip_norm = huggingface_train_args.max_grad_norm
    roberta.args.max_update = huggingface_train_args.max_steps
    roberta.args.total_num_update = huggingface_train_args.max_steps
    roberta.args.save_interval_updates = huggingface_train_args.save_steps

    roberta.args.attention_dropout = config.attention_probs_dropout_prob
    roberta.args.encoder_embed_dim = config.hidden_size
    roberta.args.encoder_ffn_embed_dim = config.intermediate_size
    roberta.args.activation_fn = config.hidden_act
    roberta.args.activation_dropout = config.hidden_dropout_prob
    roberta.args.encoder_layers = config.num_hidden_layers
    roberta.args.encoder_attention_heads = config.num_attention_heads
    roberta.args.__dict__.update(huggingface_train_args.__dict__)

    roberta.model.decoder.sentence_encoder.embed_tokens.weight = model.roberta.embeddings.word_embeddings.weight
    roberta.model.decoder.sentence_encoder.embed_positions.weight = model.roberta.embeddings.position_embeddings.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    roberta.model.decoder.sentence_encoder.emb_layer_norm.weight = model.roberta.embeddings.LayerNorm.weight
    roberta.model.decoder.sentence_encoder.emb_layer_norm.bias = model.roberta.embeddings.LayerNorm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        # roberta.model.decoder.sentence_encoder.layers[i]: TransformerSentenceEncoderLayer

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (
            roberta.model.decoder.sentence_encoder.layers[i].self_attn.k_proj.weight.data.shape
            == roberta.model.decoder.sentence_encoder.layers[i].self_attn.q_proj.weight.data.shape
            == roberta.model.decoder.sentence_encoder.layers[i].self_attn.v_proj.weight.data.shape
            == torch.Size((config.hidden_size, config.hidden_size))
        )
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.q_proj.weight = self_attn.query.weight
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.q_proj.bias = self_attn.query.bias
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.k_proj.weight = self_attn.key.weight
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.k_proj.bias = self_attn.key.bias
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.v_proj.weight = self_attn.value.weight
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.v_proj.bias = self_attn.value.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[i].self_attn.out_proj.weight.shape
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.out_proj.weight = self_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[i].self_attn.out_proj.bias = self_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[i].self_attn_layer_norm.weight = self_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[i].self_attn_layer_norm.bias = self_output.LayerNorm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[i].fc1.weight.shape
        roberta.model.decoder.sentence_encoder.layers[i].fc1.weight = intermediate.dense.weight
        roberta.model.decoder.sentence_encoder.layers[i].fc1.bias = intermediate.dense.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[i].fc2.weight.shape
        roberta.model.decoder.sentence_encoder.layers[i].fc2.weight = bert_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[i].fc2.bias = bert_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[i].final_layer_norm.weight = bert_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[i].final_layer_norm.bias = bert_output.LayerNorm.bias

    # LM Head
    roberta.model.decoder.lm_head.dense.weight = model.lm_head.dense.weight
    roberta.model.decoder.lm_head.dense.bias = model.lm_head.dense.bias
    roberta.model.decoder.lm_head.layer_norm.weight = model.lm_head.layer_norm.weight
    roberta.model.decoder.lm_head.layer_norm.bias = model.lm_head.layer_norm.bias
    roberta.model.decoder.lm_head.weight = model.lm_head.decoder.weight
    roberta.model.decoder.lm_head.bias = model.lm_head.decoder.bias

    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
    their_output = model(input_ids)[0]
    our_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    copy_success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if copy_success else "💩")
    if not copy_success:
        raise Exception("Something went wRoNg")

    pathlib.Path(roberta_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {roberta_dump_folder_path}")

    from fairseq import checkpoint_utils
    state_dict = {
        "args": roberta.args,
        "model": roberta.model.state_dict(),
        # These last two were copied from the fairseq pretrained checkpoint just
        # to make .from_pretrained() work.
        "extra_state": {
            'train_iterator': {
                'epoch': 0
            },
            'val_loss': 1.4955725940408326
        },
        "optimizer_history": [{
            'criterion_name': 'MaskedLmLoss',
            'optimizer_name': 'MemoryEfficientFP16Optimizer',
            'lr_scheduler_state': {
                'best': 1.495530066777925
            },
            'num_updates': 500000
        }]
    }
    # checkpoint_utils.save_state(f"{roberta_dump_folder_path}/model.pt", roberta.args, roberta.state_dict())
    # del model
    checkpoint_utils.torch_persistent_save(
        state_dict, f"{roberta_dump_folder_path}/model.pt")

    loaded_model = FairseqRobertaModel.from_pretrained(roberta_dump_folder_path)
    loaded_model.eval()
    # roberta.model(input_ids)
    # loaded_model.model(input_ids)

    del state_dict
    copied_dict = roberta.state_dict()
    loaded_dict = loaded_model.state_dict()
    assert loaded_model.state_dict().keys() == roberta.state_dict().keys()
    for k in roberta.state_dict().keys():
        loaded_val = loaded_dict[k]
        copied_val = copied_dict[k]
        if not torch.allclose(loaded_val, copied_val, atol=1e-3):
            print(k)

    loaded_output = loaded_model.model(input_ids)[0]
    save_success = torch.allclose(our_output, loaded_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if save_success else "💩")
    if not save_success:
        raise Exception("Something went wRoNg")

    print("Done")
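The conversion function is presumably driven from the command line. A minimal, hypothetical argparse wrapper (the flag names are assumptions, not taken from the original script) could look like this:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_checkpoint_path", required=True,
                        help="Folder containing the Hugging Face RoBERTa checkpoint.")
    parser.add_argument("--roberta_dump_folder_path", required=True,
                        help="Where to write the converted fairseq checkpoint.")
    args = parser.parse_args()
    convert_pytorch_to_roberta_checkpoint(
        args.pytorch_checkpoint_path, args.roberta_dump_folder_path
    )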
def main(args):
    state = checkpoint_utils.load_checkpoint_to_cpu(args.checkpoint)
    ns = state["args"]
    model = state["model"]

    ns.arch = "transformer_modular"

    if (args.encoder_attention_heads_active is None
            and args.decoder_attention_heads_active is None):
        raise ValueError(
            'Either --encoder-attention-heads-active or '
            '--decoder-attention-heads-active option must be set.')
    if args.encoder_attention_heads_active is None:
        args.encoder_attention_heads_active = args.decoder_attention_heads_active

    if args.encoder_modular_layer_indices is not None:
        ns.encoder_modular_layer_indices = "({})".format(
            args.encoder_modular_layer_indices)
        model = convert_model(model, ns, coder="encoder", att_type="self_attn")
    if args.decoder_modular_layer_indices is not None:
        ns.decoder_modular_layer_indices = "({})".format(
            args.decoder_modular_layer_indices)
        model = convert_model(model, ns, coder="decoder", att_type="self_attn")
        model = convert_model(model, ns, coder="decoder", att_type="encoder_attn")

    ctrl_enc = ModularCtrl(
        ns.encoder_embed_dim,
        ns.encoder_attention_heads,
        args.encoder_attention_heads_active,
        hidden_depth=args.ctrl_hidden_depth,
        hidden_dim=args.ctrl_hidden_dim,
        ctrl_type=args.ctrl_type)
    ns.module_ctrl_hidden_depth = args.ctrl_hidden_depth
    ns.module_ctrl_hidden_dim = args.ctrl_hidden_dim
    ns.module_ctrl_type = args.ctrl_type
    for k, v in ctrl_enc.state_dict().items():
        model["encoder.module_ctrl.{}".format(k)] = v

    if not args.share_encoder_ctrl:
        if args.decoder_attention_heads_active is None:
            raise ValueError("Missing ``decoder-attention-heads-active'' "
                             "when ``share-encoder-ctrl'' is disabled.")
        ns.share_encoder_ctrl = False
        ctrl_dec = ModularCtrl(
            ns.decoder_embed_dim,
            ns.decoder_attention_heads,
            args.decoder_attention_heads_active,
            hidden_depth=args.ctrl_hidden_depth,
            hidden_dim=args.ctrl_hidden_dim,
            ctrl_type=args.ctrl_type)
        for k, v in ctrl_dec.state_dict().items():
            model["decoder.module_ctrl.{}".format(k)] = v
    else:
        ns.share_encoder_ctrl = True

    ns.arch = "transformer_modular"
    ns.criterion = "label_smoothed_cross_entropy_modular"
    ns.task = "translation_modular"
    ns.encoder_attention_heads_active = args.encoder_attention_heads_active

    state["args"] = ns
    state["model"] = model
    for i, _ in enumerate(state["optimizer_history"]):
        state["optimizer_history"][i][
            "criterion_name"] = 'LabelSmoothedCrossEntropyModularCriterion'

    state = utils.move_to_cpu(state)
    with PathManager.open(args.save_as, "wb") as f:
        checkpoint_utils.torch_persistent_save(state, f)
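After the conversion above, the rewritten checkpoint can be reloaded to confirm the metadata and parameter changes. A minimal sketch, with a placeholder path standing in for args.save_as:

import torch

state = torch.load("checkpoints/transformer_modular.pt", map_location="cpu")
assert state["args"].arch == "transformer_modular"
assert state["args"].task == "translation_modular"
assert any(k.startswith("encoder.module_ctrl.") for k in state["model"])
assert all(
    h["criterion_name"] == "LabelSmoothedCrossEntropyModularCriterion"
    for h in state["optimizer_history"]
)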