from argparse import Namespace

import torch

from transformers import XGLMConfig, XGLMForCausalLM


def convert_fairseq_xglm_checkpoint_from_disk(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    args = Namespace(**checkpoint["cfg"]["model"])
    state_dict = checkpoint["model"]
    remove_ignore_keys_(state_dict)
    vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]

    # fairseq names the decoder weights "decoder.*"; the HF model expects "model.*"
    state_dict = {key.replace("decoder", "model"): val for key, val in state_dict.items()}

    config = XGLMConfig(
        vocab_size=vocab_size,
        max_position_embeddings=args.max_target_positions,
        num_layers=args.decoder_layers,
        attention_heads=args.decoder_attention_heads,
        ffn_dim=args.decoder_ffn_embed_dim,
        d_model=args.decoder_embed_dim,
        layerdrop=args.decoder_layerdrop,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        activation_function="gelu",
        scale_embedding=not args.no_scale_embedding,
        tie_word_embeddings=args.share_decoder_input_output_embed,
    )

    model = XGLMForCausalLM(config)
    # strict=False: the lm_head is rebuilt from the input embeddings right below
    missing = model.load_state_dict(state_dict, strict=False)
    print(missing)
    model.lm_head = make_linear_from_emb(model.model.embed_tokens)
    return model
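# The two helpers referenced above sit outside this excerpt. The sketches below
# are modeled on the equivalent helpers in similar fairseq-to-transformers
# conversion scripts; the exact ignore-key list is an assumption, not verbatim source.
from torch import nn


def remove_ignore_keys_(state_dict):
    # drop fairseq bookkeeping tensors that have no counterpart in the HF model
    ignore_keys = [
        "decoder.version",
        "decoder.output_projection.weight",  # rebuilt from the embeddings instead
        "_float_tensor",
    ]
    for k in ignore_keys:
        state_dict.pop(k, None)


def make_linear_from_emb(emb):
    # bias-free linear head whose weight is tied to the embedding matrix
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(emb_size, vocab_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer


# Hypothetical usage (paths are placeholders):
# model = convert_fairseq_xglm_checkpoint_from_disk("checkpoints/xglm/model.pt")
# model.save_pretrained("xglm-converted")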
def prepare_config_and_inputs(self):
    # lower-bound the ids at 3 so reserved special-token ids stay out of the random inputs
    input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length], self.vocab_size), 3, self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = random_attention_mask([self.batch_size, self.seq_length])

    config = XGLMConfig(
        vocab_size=self.vocab_size,
        d_model=self.hidden_size,
        num_layers=self.num_hidden_layers,
        attention_heads=self.num_attention_heads,
        ffn_dim=self.ffn_dim,
        activation_function=self.activation_function,
        activation_dropout=self.activation_dropout,
        attention_dropout=self.attention_dropout,
        max_position_embeddings=self.max_position_embeddings,
        initializer_range=self.initializer_range,
        use_cache=True,
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
    )

    return (config, input_ids, input_mask)
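# A minimal sketch (not part of the original tests; the tester attributes are
# assumed) of how the returned tuple would typically feed a forward-pass check,
# using the Flax model class to match the numpy inputs prepared above.
from transformers import FlaxXGLMModel


def check_model_forward(tester):
    config, input_ids, input_mask = tester.prepare_config_and_inputs()
    model = FlaxXGLMModel(config)  # randomly initialized test-sized model
    outputs = model(input_ids, attention_mask=input_mask)
    assert outputs.last_hidden_state.shape == (tester.batch_size, tester.seq_length, tester.hidden_size)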
def get_config(
    self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
):
    # scale_attn_by_inverse_layer_idx and reorder_and_upcast_attn are accepted for
    # signature compatibility with other model testers but are not used by XGLMConfig
    return XGLMConfig(
        vocab_size=self.vocab_size,
        d_model=self.hidden_size,
        num_layers=self.num_hidden_layers,
        attention_heads=self.num_attention_heads,
        ffn_dim=self.ffn_dim,
        activation_function=self.activation_function,
        activation_dropout=self.activation_dropout,
        attention_dropout=self.attention_dropout,
        max_position_embeddings=self.max_position_embeddings,
        initializer_range=self.initializer_range,
        use_cache=True,
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
        gradient_checkpointing=gradient_checkpointing,
    )
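# Sketch (assumed usage, not from the source): only gradient_checkpointing varies
# the resulting config; in recent transformers versions activation checkpointing
# is enabled on the model itself rather than through the config.
from transformers import XGLMModel


def build_checkpointed_model(tester):
    config = tester.get_config(gradient_checkpointing=True)
    model = XGLMModel(config)
    model.gradient_checkpointing_enable()  # standard PreTrainedModel API
    return model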
def get_large_model_config(self):
    return XGLMConfig.from_pretrained("facebook/xglm-564M")
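# Sketch (not from the source): the pretrained config alone is enough to build a
# randomly initialized 564M-sized model for shape checks, without downloading weights.
from transformers import XGLMForCausalLM

config = XGLMConfig.from_pretrained("facebook/xglm-564M")
model = XGLMForCausalLM(config)  # random init; use from_pretrained for real weights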