Example #1
def _bert_lm_model(model_name=None, dataset_name=None, vocab=None,
                   pretrained=True, ctx=mx.cpu(),
                   root=os.path.join('~', '.mxnet', 'models'), **kwargs):
    """BERT based pretrained language model.

    Returns
    -------
    BERTRNN, gluonnlp.vocab.BERTVocab
    """
    predefined_args = bert_lm_hparams[model_name].copy()  # copy so update() below does not mutate the shared hparams dict
    mutable_args = ['use_residual', 'dropout', 'embed_dropout', 'word_embed',
                    'rnn_dropout', 'rnn_weight_drop', 'rnn_drop_h', 'rnn_drop_i',
                    'rnn_drop_e', 'rnn_drop_l']
    mutable_args = frozenset(mutable_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    # encoder
    encoder = BERTMaskedEncoder(attention_cell=predefined_args['attention_cell'],
                                num_layers=predefined_args['num_layers'],
                                units=predefined_args['units'],
                                hidden_size=predefined_args['hidden_size'],
                                max_length=predefined_args['max_length'],
                                num_heads=predefined_args['num_heads'],
                                scaled=predefined_args['scaled'],
                                dropout=predefined_args['dropout'],
                                use_residual=predefined_args['use_residual'])
    # bert_vocab
    from gluonnlp.vocab.bert import BERTVocab
    bert_vocab = _load_vocab(bert_vocabs[model_name], vocab, root, cls=BERTVocab)
    # BERT
    bert = BERTMaskedModel(encoder, len(bert_vocab),
                           token_type_vocab_size=predefined_args['token_type_vocab_size'],
                           units=predefined_args['units'],
                           embed_size=predefined_args['embed_size'],
                           embed_dropout=predefined_args['embed_dropout'],
                           word_embed=predefined_args['word_embed'],
                           use_pooler=False, use_decoder=False,
                           use_classifier=False)

    # BERT LM
    net = BERTRNN(embedding=bert, mode=predefined_args['rnn_mode'], vocab_size=len(bert_vocab),
                  embed_size=predefined_args['rnn_embed_size'],
                  hidden_size=predefined_args['rnn_hidden_size'],
                  hidden_size_last=predefined_args['rnn_hidden_size_last'],
                  num_layers=predefined_args['rnn_num_layers'],
                  tie_weights=predefined_args['rnn_tie_weights'],
                  dropout=predefined_args['rnn_dropout'],
                  weight_drop=predefined_args['rnn_weight_drop'],
                  drop_h=predefined_args['rnn_drop_h'],
                  drop_i=predefined_args['rnn_drop_i'],
                  drop_e=predefined_args['rnn_drop_e'],
                  drop_l=predefined_args['rnn_drop_l'],
                  num_experts=predefined_args['rnn_num_experts'],
                  upperbound_fixed_layer=predefined_args['upperbound_fixed_layer'], **kwargs)

    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx,
                                ignore_extra=True)
    return net, bert_vocab
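
A minimal usage sketch (not from the original source): the model and dataset names below are hypothetical and must correspond to keys in bert_lm_hparams and bert_vocabs; 'dropout' may be passed because it appears in mutable_args.

import mxnet as mx

# Hypothetical names -- substitute keys that actually exist in bert_lm_hparams / bert_vocabs.
net, vocab = _bert_lm_model(model_name='bert_12_768_12',
                            dataset_name='book_corpus_wiki_en_uncased',
                            pretrained=True, ctx=mx.cpu(),
                            dropout=0.1)  # allowed override: 'dropout' is in mutable_args
print(len(vocab))  # size of the vocabulary backing the embedding and output layers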
Example #2
def xlnet_cased_l24_h1024_a16(dataset_name: Optional[str] = None, vocab: Optional[nlp.Vocab] = None,
                              tokenizer: Optional[XLNetTokenizer] = None, pretrained: bool = True,
                              ctx: mx.Context = mx.cpu(),
                              root=os.path.join(get_home_dir(), 'models'),
                              do_lower_case=False, **kwargs):
    """XLNet model.

    References:
    Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
    (2019). XLNet: Generalized Autoregressive Pretraining for Language
    Understanding. arXiv preprint arXiv:1906.08237.


    Parameters
    ----------
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    tokenizer : XLNetTokenizer or None, default None
        XLNetTokenizer for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    do_lower_case : bool, default False
        Whether the tokenizer lower-cases its input text.

    Returns
    -------
    XLNet, gluonnlp.Vocab, XLNetTokenizer

    """
    kwargs.update(**{
        'hidden_size': 4096,
        'units': 1024,
        'activation': 'approx_gelu',
        'num_heads': 16,
        'num_layers': 24,
    })
    if vocab is None or dataset_name is not None:
        vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
    net = XLNet(vocab_size=len(vocab), **kwargs)
    if pretrained:
        _load_pretrained_params(net=net, model_name='xlnet_cased_l24_h1024_a16',
                                dataset_name=dataset_name, root=root, ctx=ctx,
                                ignore_extra=not kwargs.get('use_decoder', True))
    if tokenizer is None or dataset_name is not None:
        tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
    return net, vocab, tokenizer
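
A usage sketch, assuming pretrained parameters are published for the dataset name given in the docstring; the sample sentence is illustrative only.

import mxnet as mx

net, vocab, tokenizer = xlnet_cased_l24_h1024_a16(
    dataset_name='books_enwiki_giga5_clueweb2012b_commoncrawl',
    pretrained=True, ctx=mx.cpu())
tokens = tokenizer('GluonNLP makes XLNet easy to use.')  # assumption: the tokenizer is callable and returns subword strings
token_ids = vocab[tokens]                                # map subwords to integer ids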
Example #3
File: bort.py  Project: zolekode/bort
def get_bort_model(model_name=None,
                   dataset_name=None,
                   vocab=None,
                   pretrained=True,
                   ctx=mx.cpu(),
                   use_decoder=True,
                   output_attention=False,
                   output_all_encodings=False,
                   root=os.path.join(get_home_dir(), 'models'),
                   **kwargs):
    """Predefined Bort model.

    Returns
    -------
    BortModel, gluonnlp.Vocab
    """
    predefined_args = predefined_borts[model_name].copy()  # copy so update() below does not mutate the shared hparams dict
    logging.info('get_bort_model: %s, predefined_args: %s', model_name, predefined_args)
    mutable_args = ['use_residual', 'dropout', 'embed_dropout', 'word_embed']
    mutable_args = frozenset(mutable_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    # encoder
    encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
                          num_layers=predefined_args['num_layers'],
                          units=predefined_args['units'],
                          hidden_size=predefined_args['hidden_size'],
                          max_length=predefined_args['max_length'],
                          num_heads=predefined_args['num_heads'],
                          scaled=predefined_args['scaled'],
                          dropout=predefined_args['dropout'],
                          output_attention=output_attention,
                          output_all_encodings=output_all_encodings,
                          use_residual=predefined_args['use_residual'],
                          activation=predefined_args.get('activation', 'gelu'),
                          layer_norm_eps=predefined_args.get(
                              'layer_norm_eps', None))

    from gluonnlp.vocab import Vocab
    bort_vocab = _load_vocab(dataset_name, vocab, root, cls=Vocab)

    net = BortModel(encoder,
                    len(bort_vocab),
                    units=predefined_args['units'],
                    embed_size=predefined_args['embed_size'],
                    embed_dropout=predefined_args['embed_dropout'],
                    word_embed=predefined_args['word_embed'],
                    use_decoder=use_decoder)
    if pretrained:
        ignore_extra = not use_decoder
        _load_pretrained_params(net,
                                model_name,
                                dataset_name,
                                root,
                                ctx,
                                ignore_extra=ignore_extra,
                                allow_missing=False)
    return net, bort_vocab
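
A usage sketch, assuming 'bort_4_8_768_1024' is a key of predefined_borts and that 'openwebtext_ccnews_stories_books_cased' (the vocabulary used by the conversion script in Example #7) is the matching dataset name.

import mxnet as mx

net, vocab = get_bort_model(model_name='bort_4_8_768_1024',
                            dataset_name='openwebtext_ccnews_stories_books_cased',
                            pretrained=True,
                            use_decoder=False,  # skip the masked-LM decoder head
                            ctx=mx.cpu())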
Example #4
def _get_gpt2_model(model_name=None,
                    dataset_name=None,
                    vocab=None,
                    pretrained=True,
                    ctx=mx.cpu(),
                    root=os.path.join(get_home_dir(), 'models'),
                    **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for both GPT-2 models is 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    predefined_args = gpt2_hparams[model_name]
    mutable_args = ['dropout']
    mutable_args = frozenset(mutable_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT-2
    net = GPT2Model(units=predefined_args['units'],
                    vocab_size=len(vocab),
                    max_length=predefined_args['max_length'],
                    num_layers=predefined_args['num_layers'],
                    num_heads=predefined_args['num_heads'],
                    dropout=predefined_args['dropout'],
                    **kwargs)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    # Force the GELU activation in every FFN block to use its erf-free
    # approximation (presumably to match the original GPT-2 implementation).
    for i in range(net._num_layers):
        net._ffn_layers[i]._act._support_erf = False
    return net, vocab
Example #5
def _get_gpt2_model(model_name=None,
                    dataset_name=None,
                    vocab=None,
                    pretrained=True,
                    ctx=mx.cpu(),
                    root=os.path.join(get_home_dir(), 'models'),
                    hparam_allow_override=False,
                    **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for both GPT-2 models is 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    predefined_args = gpt2_hparams[model_name].copy()
    if not hparam_allow_override:
        mutable_args = ['dropout']
        mutable_args = frozenset(mutable_args)
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT2
    net = GPT2Model(vocab_size=len(vocab), **predefined_args)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    return net, vocab
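
A usage sketch; 'gpt2_117m' and 'openai_webtext' come from the docstring above, while the reduced-depth variant at the end is illustrative only.

import mxnet as mx

# Standard pretrained load.
net, vocab = _get_gpt2_model('gpt2_117m', dataset_name='openai_webtext',
                             pretrained=True, ctx=mx.cpu())

# Overriding architecture hyper-parameters requires hparam_allow_override=True and,
# realistically, pretrained=False, because the published weights match the predefined shapes.
small_net, _ = _get_gpt2_model('gpt2_117m', dataset_name='openai_webtext',
                               pretrained=False, hparam_allow_override=True,
                               num_layers=6)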
Example #6
def test_gpt2_transforms():
    tokenizer = t.GPT2BPETokenizer()
    detokenizer = t.GPT2BPEDetokenizer()
    vocab = _load_vocab('openai_webtext', None, root=os.path.join('tests', 'data', 'models'))
    s = ' natural language processing tools such as gluonnlp and torchtext'
    subwords = tokenizer(s)
    indices = vocab[subwords]
    gt_gpt2_subword = [u'Ġnatural', u'Ġlanguage', u'Ġprocessing', u'Ġtools',
                       u'Ġsuch', u'Ġas', u'Ġgl', u'u', u'on',
                       u'nl', u'p', u'Ġand', u'Ġtorch', u'text']
    gt_gpt2_idx = [3288, 3303, 7587, 4899, 884, 355, 1278, 84, 261, 21283, 79, 290, 28034, 5239]
    for lhs, rhs in zip(indices, gt_gpt2_idx):
        assert lhs == rhs
    for lhs, rhs in zip(subwords, gt_gpt2_subword):
        assert lhs == rhs

    recovered_sentence = detokenizer([vocab.idx_to_token[i] for i in indices])
    assert recovered_sentence == s
Example #7
def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str):
    """
    Convert the original Bort checkpoint (based on MXNet and GluonNLP) to our BERT structure.
    """

    # Original Bort configuration
    bort_4_8_768_1024_hparams = {
        "attention_cell": "multi_head",
        "num_layers": 4,
        "units": 1024,
        "hidden_size": 768,
        "max_length": 512,
        "num_heads": 8,
        "scaled": True,
        "dropout": 0.1,
        "use_residual": True,
        "embed_size": 1024,
        "embed_dropout": 0.1,
        "word_embed": None,
        "layer_norm_eps": 1e-5,
        "token_type_vocab_size": 2,
    }

    predefined_args = bort_4_8_768_1024_hparams

    # Let's construct the original Bort model here
    # Taken from official BERT implementation, see:
    # https://github.com/alexa/bort/blob/master/bort/bort.py
    encoder = BERTEncoder(
        attention_cell=predefined_args["attention_cell"],
        num_layers=predefined_args["num_layers"],
        units=predefined_args["units"],
        hidden_size=predefined_args["hidden_size"],
        max_length=predefined_args["max_length"],
        num_heads=predefined_args["num_heads"],
        scaled=predefined_args["scaled"],
        dropout=predefined_args["dropout"],
        output_attention=False,
        output_all_encodings=False,
        use_residual=predefined_args["use_residual"],
        activation=predefined_args.get("activation", "gelu"),
        layer_norm_eps=predefined_args.get("layer_norm_eps", None),
    )

    # Vocab information needs to be fetched first
    # It's the same as RoBERTa, so RobertaTokenizer can be used later
    vocab_name = "openwebtext_ccnews_stories_books_cased"

    # Specify download folder to Gluonnlp's vocab
    gluon_cache_dir = os.path.join(get_home_dir(), "models")
    bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab)

    original_bort = nlp.model.BERTModel(
        encoder,
        len(bort_vocab),
        units=predefined_args["units"],
        embed_size=predefined_args["embed_size"],
        embed_dropout=predefined_args["embed_dropout"],
        word_embed=predefined_args["word_embed"],
        use_pooler=False,
        use_token_type_embed=False,
        token_type_vocab_size=predefined_args["token_type_vocab_size"],
        use_classifier=False,
        use_decoder=False,
    )

    original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True)
    params = original_bort._collect_params_with_prefix()

    # Build our config 🤗
    hf_bort_config_json = {
        "architectures": ["BertForMaskedLM"],
        "attention_probs_dropout_prob": predefined_args["dropout"],
        "hidden_act": "gelu",
        "hidden_dropout_prob": predefined_args["dropout"],
        "hidden_size": predefined_args["embed_size"],
        "initializer_range": 0.02,
        "intermediate_size": predefined_args["hidden_size"],
        "layer_norm_eps": predefined_args["layer_norm_eps"],
        "max_position_embeddings": predefined_args["max_length"],
        "model_type": "bort",
        "num_attention_heads": predefined_args["num_heads"],
        "num_hidden_layers": predefined_args["num_layers"],
        "pad_token_id": 1,  # 2 = BERT, 1 = RoBERTa
        "type_vocab_size": 1,  # 2 = BERT, 1 = RoBERTa
        "vocab_size": len(bort_vocab),
    }

    hf_bort_config = BertConfig.from_dict(hf_bort_config_json)
    hf_bort_model = BertForMaskedLM(hf_bort_config)
    hf_bort_model.eval()

    # Parameter mapping table (Gluonnlp to Transformers)
    # * denotes layer index
    #
    # | Gluon Parameter                                                | Transformers Parameter
    # | -------------------------------------------------------------- | ----------------------
    # | `encoder.layer_norm.beta`                                      | `bert.embeddings.LayerNorm.bias`
    # | `encoder.layer_norm.gamma`                                     | `bert.embeddings.LayerNorm.weight`
    # | `encoder.position_weight`                                      | `bert.embeddings.position_embeddings.weight`
    # | `word_embed.0.weight`                                          | `bert.embeddings.word_embeddings.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_key.bias`     | `bert.encoder.layer.*.attention.self.key.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_key.weight`   | `bert.encoder.layer.*.attention.self.key.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_query.bias`   | `bert.encoder.layer.*.attention.self.query.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_value.bias`   | `bert.encoder.layer.*.attention.self.value.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight`
    # | `encoder.transformer_cells.*.ffn.ffn_2.bias`                   | `bert.encoder.layer.*.attention.output.dense.bias`
    # | `encoder.transformer_cells.*.ffn.ffn_2.weight`                 | `bert.encoder.layer.*.attention.output.dense.weight`
    # | `encoder.transformer_cells.*.layer_norm.beta`                  | `bert.encoder.layer.*.attention.output.LayerNorm.bias`
    # | `encoder.transformer_cells.*.layer_norm.gamma`                 | `bert.encoder.layer.*.attention.output.LayerNorm.weight`
    # | `encoder.transformer_cells.*.ffn.ffn_1.bias`                   | `bert.encoder.layer.*.intermediate.dense.bias`
    # | `encoder.transformer_cells.*.ffn.ffn_1.weight`                 | `bert.encoder.layer.*.intermediate.dense.weight`
    # | `encoder.transformer_cells.*.ffn.layer_norm.beta`              | `bert.encoder.layer.*.output.LayerNorm.bias`
    # | `encoder.transformer_cells.*.ffn.layer_norm.gamma`             | `bert.encoder.layer.*.output.LayerNorm.weight`
    # | `encoder.transformer_cells.*.proj.bias`                        | `bert.encoder.layer.*.output.dense.bias`
    # | `encoder.transformer_cells.*.proj.weight`                      | `bert.encoder.layer.*.output.dense.weight`

    # Helper function to convert MXNET Arrays to PyTorch
    def to_torch(mx_array) -> torch.nn.Parameter:
        return torch.nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy()))

    # Check param shapes and map new HF param back
    def check_and_map_params(hf_param, gluon_param):
        shape_hf = hf_param.shape

        # Keep the parameter name around so the assertion message stays readable.
        gluon_tensor = to_torch(params[gluon_param])
        shape_gluon = gluon_tensor.shape

        assert (
            shape_hf == shape_gluon
        ), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"

        return gluon_tensor

    hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight"
    )
    hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight"
    )
    hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params(
        hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta"
    )
    hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma"
    )

    # Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them)
    hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        hf_bort_model.bert.embeddings.token_type_embeddings.weight.data
    )

    for i in range(hf_bort_config.num_hidden_layers):
        layer: BertLayer = hf_bort_model.bert.encoder.layer[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self

        self_attn.key.bias.data = check_and_map_params(
            self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias"
        )

        self_attn.key.weight.data = check_and_map_params(
            self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight"
        )
        self_attn.query.bias.data = check_and_map_params(
            self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias"
        )
        self_attn.query.weight.data = check_and_map_params(
            self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight"
        )
        self_attn.value.bias.data = check_and_map_params(
            self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias"
        )
        self_attn.value.weight.data = check_and_map_params(
            self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight"
        )

        # self attention output
        self_output: BertSelfOutput = layer.attention.output

        self_output.dense.bias = check_and_map_params(
            self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias"
        )
        self_output.dense.weight = check_and_map_params(
            self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight"
        )
        self_output.LayerNorm.bias = check_and_map_params(
            self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta"
        )
        self_output.LayerNorm.weight = check_and_map_params(
            self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma"
        )

        # intermediate
        intermediate: BertIntermediate = layer.intermediate

        intermediate.dense.bias = check_and_map_params(
            intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias"
        )
        intermediate.dense.weight = check_and_map_params(
            intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight"
        )

        # output
        bert_output: BertOutput = layer.output

        bert_output.dense.bias = check_and_map_params(
            bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias"
        )
        bert_output.dense.weight = check_and_map_params(
            bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight"
        )
        bert_output.LayerNorm.bias = check_and_map_params(
            bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta"
        )
        bert_output.LayerNorm.weight = check_and_map_params(
            bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma"
        )

    # Save space and energy 🎄
    hf_bort_model.half()

    # Compare output of both models
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]

    # Get gluon output
    gluon_input_ids = mx.nd.array([input_ids])
    output_gluon = original_bort(inputs=gluon_input_ids, token_types=[])

    # Get Transformer output (save and reload model again)
    hf_bort_model.save_pretrained(pytorch_dump_folder_path)
    hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path)
    hf_bort_model.eval()

    input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt")
    output_hf = hf_bort_model(**input_ids)[0]

    gluon_layer = output_gluon[0].asnumpy()
    hf_layer = output_hf[0].detach().numpy()

    max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item()
    success = np.allclose(gluon_layer, hf_layer, atol=1e-3)

    if success:
        print("✔️ Both model do output the same tensors")
    else:
        print("❌ Both model do **NOT** output the same tensors")
        print("Absolute difference is:", max_absolute_diff)