import torch
from argparse import Namespace

from transformers import XGLMConfig, XGLMForCausalLM


def convert_fairseq_xglm_checkpoint_from_disk(checkpoint_path):
    # Load the fairseq checkpoint on CPU and recover the original model args.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    args = Namespace(**checkpoint["cfg"]["model"])
    state_dict = checkpoint["model"]
    # remove_ignore_keys_ is a helper defined alongside this function in the
    # conversion script; it strips fairseq-only keys (e.g. version buffers).
    remove_ignore_keys_(state_dict)
    vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]

    # fairseq prefixes decoder weights with "decoder."; the Hugging Face XGLM
    # model expects the "model." prefix, so rename every key accordingly.
    state_dict = {key.replace("decoder", "model"): val for key, val in state_dict.items()}

    config = XGLMConfig(
        vocab_size=vocab_size,
        max_position_embeddings=args.max_target_positions,
        num_layers=args.decoder_layers,
        attention_heads=args.decoder_attention_heads,
        ffn_dim=args.decoder_ffn_embed_dim,
        d_model=args.decoder_embed_dim,
        layerdrop=args.decoder_layerdrop,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        activation_function="gelu",
        scale_embedding=not args.no_scale_embedding,
        tie_word_embeddings=args.share_decoder_input_output_embed,
    )

    model = XGLMForCausalLM(config)
    # strict=False because the converted state dict carries no lm_head weight;
    # the returned object lists any missing/unexpected keys for inspection.
    missing = model.load_state_dict(state_dict, strict=False)
    print(missing)
    # make_linear_from_emb is another helper from the conversion script: it
    # builds the LM head as a linear layer tied to the input embedding matrix.
    model.lm_head = make_linear_from_emb(model.model.embed_tokens)

    return model
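A rough usage sketch (not part of the original conversion script): the converter takes a local fairseq checkpoint path and returns a Hugging Face model that can be written out with save_pretrained; both paths below are hypothetical placeholders.

model = convert_fairseq_xglm_checkpoint_from_disk("path/to/fairseq_checkpoint.pt")  # hypothetical input path
model.save_pretrained("xglm-converted")  # hypothetical output directory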
Example #2
    def prepare_config_and_inputs(self):
        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length], self.vocab_size), 3, self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        config = XGLMConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            num_layers=self.num_hidden_layers,
            attention_heads=self.num_attention_heads,
            ffn_dim=self.ffn_dim,
            activation_function=self.activation_function,
            activation_dropout=self.activation_dropout,
            attention_dropout=self.attention_dropout,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
            use_cache=True,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
        )

        return (config, input_ids, input_mask)
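A short sketch (not from the test file) of how the returned config and inputs might be consumed; FlaxXGLMModel is transformers' Flax implementation of XGLM, and the sizes below are illustrative assumptions rather than the tester's actual defaults.

import numpy as np
from transformers import FlaxXGLMModel, XGLMConfig

config = XGLMConfig(vocab_size=99, d_model=32, num_layers=2, attention_heads=4, ffn_dim=37)
input_ids = np.random.randint(3, 99, size=(2, 7), dtype=np.int32)  # stand-in for ids_tensor
model = FlaxXGLMModel(config)                 # randomly initialized weights
hidden = model(input_ids).last_hidden_state   # shape (2, 7, 32)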
Example #3
    def get_config(
        self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
    ):
        return XGLMConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            num_layers=self.num_hidden_layers,
            attention_heads=self.num_attention_heads,
            ffn_dim=self.ffn_dim,
            activation_function=self.activation_function,
            activation_dropout=self.activation_dropout,
            attention_dropout=self.attention_dropout,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
            use_cache=True,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            gradient_checkpointing=gradient_checkpointing,
        )

    def get_large_model_config(self):
        return XGLMConfig.from_pretrained("facebook/xglm-564M")
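For reference, a minimal sketch of loading the real facebook/xglm-564M checkpoint that get_large_model_config points at; it downloads weights from the Hugging Face Hub and runs a short greedy generation. The prompt is arbitrary.

from transformers import AutoTokenizer, XGLMForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
inputs = tokenizer("The capital of France is", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))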