def _bert_lm_model(model_name=None, dataset_name=None, vocab=None, pretrained=True,
                   ctx=mx.cpu(), root=os.path.join('~', '.mxnet', 'models'),
                   **kwargs):
    """BERT based pretrained language model.

    Returns
    -------
    BERTRNN, gluonnlp.vocab.BERTVocab
    """
    # Copy so that updating with kwargs does not mutate the shared hparams dict
    predefined_args = bert_lm_hparams[model_name].copy()
    mutable_args = ['use_residual', 'dropout', 'embed_dropout', 'word_embed',
                    'rnn_dropout', 'rnn_weight_drop', 'rnn_drop_h', 'rnn_drop_i',
                    'rnn_drop_e', 'rnn_drop_l']
    mutable_args = frozenset(mutable_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)

    # encoder
    encoder = BERTMaskedEncoder(attention_cell=predefined_args['attention_cell'],
                                num_layers=predefined_args['num_layers'],
                                units=predefined_args['units'],
                                hidden_size=predefined_args['hidden_size'],
                                max_length=predefined_args['max_length'],
                                num_heads=predefined_args['num_heads'],
                                scaled=predefined_args['scaled'],
                                dropout=predefined_args['dropout'],
                                use_residual=predefined_args['use_residual'])

    # BERT vocabulary
    from gluonnlp.vocab.bert import BERTVocab
    bert_vocab = _load_vocab(bert_vocabs[model_name], vocab, root, cls=BERTVocab)

    # BERT backbone (pooler, decoder and classifier heads are disabled)
    bert = BERTMaskedModel(encoder, len(bert_vocab),
                           token_type_vocab_size=predefined_args['token_type_vocab_size'],
                           units=predefined_args['units'],
                           embed_size=predefined_args['embed_size'],
                           embed_dropout=predefined_args['embed_dropout'],
                           word_embed=predefined_args['word_embed'],
                           use_pooler=False, use_decoder=False,
                           use_classifier=False)

    # BERT LM: an RNN language-model head on top of the BERT embedding
    net = BERTRNN(embedding=bert, mode=predefined_args['rnn_mode'],
                  vocab_size=len(bert_vocab),
                  embed_size=predefined_args['rnn_embed_size'],
                  hidden_size=predefined_args['rnn_hidden_size'],
                  hidden_size_last=predefined_args['rnn_hidden_size_last'],
                  num_layers=predefined_args['rnn_num_layers'],
                  tie_weights=predefined_args['rnn_tie_weights'],
                  dropout=predefined_args['rnn_dropout'],
                  weight_drop=predefined_args['rnn_weight_drop'],
                  drop_h=predefined_args['rnn_drop_h'],
                  drop_i=predefined_args['rnn_drop_i'],
                  drop_e=predefined_args['rnn_drop_e'],
                  drop_l=predefined_args['rnn_drop_l'],
                  num_experts=predefined_args['rnn_num_experts'],
                  upperbound_fixed_layer=predefined_args['upperbound_fixed_layer'],
                  **kwargs)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx,
                                ignore_extra=True)
    return net, bert_vocab
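# Usage sketch (hedged): `_bert_lm_model` is private and its valid `model_name`
# keys live in `bert_lm_hparams`, which is not shown here. The key and dataset
# below are hypothetical placeholders, not names confirmed by the source.
def _bert_lm_example():
    import mxnet as mx
    net, vocab = _bert_lm_model(model_name='bert_lm_12_768_12',  # hypothetical key
                                dataset_name='wikitext-2',       # hypothetical dataset
                                pretrained=False,                # random init, no download
                                ctx=mx.cpu())
    net.initialize(ctx=mx.cpu())  # needed because pretrained=False skips parameter loading
    return net, vocab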
def xlnet_cased_l24_h1024_a16(dataset_name: Optional[str] = None,
                              vocab: Optional[nlp.Vocab] = None,
                              tokenizer: Optional[XLNetTokenizer] = None,
                              pretrained: bool = True, ctx: mx.Context = mx.cpu(),
                              root=os.path.join(get_home_dir(), 'models'),
                              do_lower_case=False, **kwargs):
    """XLNet model.

    References:
    Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
    (2019). XLNet: Generalized Autoregressive Pretraining for Language
    Understanding. arXiv preprint arXiv:1906.08237.

    Parameters
    ----------
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    tokenizer : XLNetTokenizer or None, default None
        XLNetTokenizer for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to
        '~/.mxnet'.

    Returns
    -------
    XLNet, gluonnlp.Vocab, XLNetTokenizer
    """
    kwargs.update(hidden_size=4096, units=1024, activation='approx_gelu',
                  num_heads=16, num_layers=24)
    if vocab is None or dataset_name is not None:
        vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
    net = XLNet(vocab_size=len(vocab), **kwargs)
    if pretrained:
        _load_pretrained_params(net=net, model_name='xlnet_cased_l24_h1024_a16',
                                dataset_name=dataset_name, root=root, ctx=ctx,
                                ignore_extra=not kwargs.get('use_decoder', True))
    if tokenizer is None or dataset_name is not None:
        tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
    return net, vocab, tokenizer
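# Usage sketch (hedged): load the pretrained XLNet plus its vocabulary and
# SentencePiece tokenizer, then map a sentence to token ids. The dataset name
# comes from the docstring above; the tokenizer is assumed to be callable on raw
# strings like other gluonnlp tokenizers.
def _xlnet_example():
    import mxnet as mx
    net, vocab, tokenizer = xlnet_cased_l24_h1024_a16(
        dataset_name='books_enwiki_giga5_clueweb2012b_commoncrawl')
    subwords = tokenizer('Hello, world!')       # list of subword strings
    token_ids = mx.nd.array([vocab[subwords]])  # shape (1, seq_len)
    return net, token_ids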
def get_bort_model(model_name=None, dataset_name=None, vocab=None, pretrained=True,
                   ctx=mx.cpu(), use_decoder=True, output_attention=False,
                   output_all_encodings=False,
                   root=os.path.join(get_home_dir(), 'models'), **kwargs):
    """Any predefined Bort model.

    Returns
    -------
    BortModel, gluonnlp.vocab.Vocab
    """
    # Copy so that updating with kwargs does not mutate the shared hparams dict
    predefined_args = predefined_borts[model_name].copy()
    logging.info('get_bort_model: %s, predefined_args: %s', model_name, predefined_args)
    mutable_args = ['use_residual', 'dropout', 'embed_dropout', 'word_embed']
    mutable_args = frozenset(mutable_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)

    # encoder
    encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
                          num_layers=predefined_args['num_layers'],
                          units=predefined_args['units'],
                          hidden_size=predefined_args['hidden_size'],
                          max_length=predefined_args['max_length'],
                          num_heads=predefined_args['num_heads'],
                          scaled=predefined_args['scaled'],
                          dropout=predefined_args['dropout'],
                          output_attention=output_attention,
                          output_all_encodings=output_all_encodings,
                          use_residual=predefined_args['use_residual'],
                          activation=predefined_args.get('activation', 'gelu'),
                          layer_norm_eps=predefined_args.get('layer_norm_eps', None))

    from gluonnlp.vocab import Vocab
    bort_vocab = _load_vocab(dataset_name, vocab, root, cls=Vocab)

    net = BortModel(encoder, len(bort_vocab),
                    units=predefined_args['units'],
                    embed_size=predefined_args['embed_size'],
                    embed_dropout=predefined_args['embed_dropout'],
                    word_embed=predefined_args['word_embed'],
                    use_decoder=use_decoder)
    if pretrained:
        ignore_extra = not use_decoder
        _load_pretrained_params(net, model_name, dataset_name, root, ctx,
                                ignore_extra=ignore_extra, allow_missing=False)
    return net, bort_vocab
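# Usage sketch (hedged): the model and dataset names below follow the Bort
# release ('bort_4_8_768_1024' hparams, RoBERTa-style cased vocabulary); treat
# both keys as assumptions if your checkout defines different ones.
def _bort_example():
    import mxnet as mx
    net, vocab = get_bort_model(model_name='bort_4_8_768_1024',  # assumed hparam key
                                dataset_name='openwebtext_ccnews_stories_books_cased',
                                pretrained=True, ctx=mx.cpu(), use_decoder=False)
    return net, vocab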
def _get_gpt2_model(model_name=None, dataset_name=None, vocab=None, pretrained=True,
                    ctx=mx.cpu(), root=os.path.join(get_home_dir(), 'models'),
                    **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for both models is 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to
        '~/.mxnet'.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    # Copy so that updating with kwargs does not mutate the shared hparams dict
    predefined_args = gpt2_hparams[model_name].copy()
    mutable_args = ['dropout']
    mutable_args = frozenset(mutable_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT-2
    net = GPT2Model(units=predefined_args['units'],
                    vocab_size=len(vocab),
                    max_length=predefined_args['max_length'],
                    num_layers=predefined_args['num_layers'],
                    num_heads=predefined_args['num_heads'],
                    dropout=predefined_args['dropout'],
                    **kwargs)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    # Force every feed-forward GELU to skip the erf path and use the tanh-based
    # approximation, matching the original GPT-2 implementation
    for i in range(net._num_layers):
        net._ffn_layers[i]._act._support_erf = False
    return net, vocab
def _get_gpt2_model(model_name=None, dataset_name=None, vocab=None, pretrained=True,
                    ctx=mx.cpu(), root=os.path.join(get_home_dir(), 'models'),
                    hparam_allow_override=False, **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for both models is 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for the model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to
        '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model (e.g. the
        number of layers, hidden units) can be overridden.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    predefined_args = gpt2_hparams[model_name].copy()
    if not hparam_allow_override:
        mutable_args = ['dropout']
        mutable_args = frozenset(mutable_args)
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT-2
    net = GPT2Model(vocab_size=len(vocab), **predefined_args)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    return net, vocab
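# Usage sketch: both `_get_gpt2_model` variants above share this call pattern;
# 'gpt2_117m' and 'openai_webtext' are taken from the docstrings.
def _gpt2_example():
    import mxnet as mx
    net, vocab = _get_gpt2_model(model_name='gpt2_117m',
                                 dataset_name='openai_webtext',
                                 pretrained=True, ctx=mx.cpu())
    return net, vocab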
def test_gpt2_transforms():
    tokenizer = t.GPT2BPETokenizer()
    detokenizer = t.GPT2BPEDetokenizer()
    vocab = _load_vocab('openai_webtext', None,
                        root=os.path.join('tests', 'data', 'models'))
    s = ' natural language processing tools such as gluonnlp and torchtext'
    subwords = tokenizer(s)
    indices = vocab[subwords]
    gt_gpt2_subword = [u'Ġnatural', u'Ġlanguage', u'Ġprocessing', u'Ġtools',
                       u'Ġsuch', u'Ġas', u'Ġgl', u'u', u'on', u'nl', u'p',
                       u'Ġand', u'Ġtorch', u'text']
    gt_gpt2_idx = [3288, 3303, 7587, 4899, 884, 355, 1278, 84, 261, 21283,
                   79, 290, 28034, 5239]
    for lhs, rhs in zip(indices, gt_gpt2_idx):
        assert lhs == rhs
    for lhs, rhs in zip(subwords, gt_gpt2_subword):
        assert lhs == rhs
    recovered_sentence = detokenizer([vocab.idx_to_token[i] for i in indices])
    assert recovered_sentence == s
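# Usage sketch (hedged): wiring the BPE transforms exercised by the test above
# into the GPT-2 model loader. The forward signature `net(data, states)` is an
# assumption about the gluonnlp GPT-2 implementation; verify it before relying
# on this.
def _gpt2_forward_example():
    import mxnet as mx
    tokenizer = t.GPT2BPETokenizer()
    net, vocab = _get_gpt2_model('gpt2_117m', 'openai_webtext')
    data = mx.nd.array([vocab[tokenizer(' natural language processing')]])
    logits, states = net(data, None)  # assumed (data, states) -> (logits, states)
    return logits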
def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str):
    """
    Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure.
    """
    # Original Bort configuration
    bort_4_8_768_1024_hparams = {
        "attention_cell": "multi_head",
        "num_layers": 4,
        "units": 1024,
        "hidden_size": 768,
        "max_length": 512,
        "num_heads": 8,
        "scaled": True,
        "dropout": 0.1,
        "use_residual": True,
        "embed_size": 1024,
        "embed_dropout": 0.1,
        "word_embed": None,
        "layer_norm_eps": 1e-5,
        "token_type_vocab_size": 2,
    }

    predefined_args = bort_4_8_768_1024_hparams

    # Let's construct the original Bort model here
    # Taken from the official Bort implementation, see:
    # https://github.com/alexa/bort/blob/master/bort/bort.py
    encoder = BERTEncoder(
        attention_cell=predefined_args["attention_cell"],
        num_layers=predefined_args["num_layers"],
        units=predefined_args["units"],
        hidden_size=predefined_args["hidden_size"],
        max_length=predefined_args["max_length"],
        num_heads=predefined_args["num_heads"],
        scaled=predefined_args["scaled"],
        dropout=predefined_args["dropout"],
        output_attention=False,
        output_all_encodings=False,
        use_residual=predefined_args["use_residual"],
        activation=predefined_args.get("activation", "gelu"),
        layer_norm_eps=predefined_args.get("layer_norm_eps", None),
    )

    # Vocab information needs to be fetched first
    # It's the same as RoBERTa, so RobertaTokenizer can be used later
    vocab_name = "openwebtext_ccnews_stories_books_cased"

    # Specify download folder to Gluonnlp's vocab
    gluon_cache_dir = os.path.join(get_home_dir(), "models")
    bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab)

    original_bort = nlp.model.BERTModel(
        encoder,
        len(bort_vocab),
        units=predefined_args["units"],
        embed_size=predefined_args["embed_size"],
        embed_dropout=predefined_args["embed_dropout"],
        word_embed=predefined_args["word_embed"],
        use_pooler=False,
        use_token_type_embed=False,
        token_type_vocab_size=predefined_args["token_type_vocab_size"],
        use_classifier=False,
        use_decoder=False,
    )

    original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True)
    params = original_bort._collect_params_with_prefix()

    # Build our config 🤗
    # Note the deliberate swap: Bort's Gluon `embed_size` (1024) is the Transformer
    # width, while its Gluon `hidden_size` (768) is the FFN (intermediate) size.
    hf_bort_config_json = {
        "architectures": ["BertForMaskedLM"],
        "attention_probs_dropout_prob": predefined_args["dropout"],
        "hidden_act": "gelu",
        "hidden_dropout_prob": predefined_args["dropout"],
        "hidden_size": predefined_args["embed_size"],
        "initializer_range": 0.02,
        "intermediate_size": predefined_args["hidden_size"],
        "layer_norm_eps": predefined_args["layer_norm_eps"],
        "max_position_embeddings": predefined_args["max_length"],
        "model_type": "bort",
        "num_attention_heads": predefined_args["num_heads"],
        "num_hidden_layers": predefined_args["num_layers"],
        "pad_token_id": 1,  # 2 = BERT, 1 = RoBERTa
        "type_vocab_size": 1,  # 2 = BERT, 1 = RoBERTa
        "vocab_size": len(bort_vocab),
    }

    hf_bort_config = BertConfig.from_dict(hf_bort_config_json)
    hf_bort_model = BertForMaskedLM(hf_bort_config)
    hf_bort_model.eval()

    # Parameter mapping table (Gluonnlp to Transformers)
    # * denotes layer index
    #
    # | Gluon Parameter                                                 | Transformers Parameter
    # | --------------------------------------------------------------- | ----------------------
    # | `encoder.layer_norm.beta`                                       | `bert.embeddings.LayerNorm.bias`
    # | `encoder.layer_norm.gamma`                                      | `bert.embeddings.LayerNorm.weight`
    # | `encoder.position_weight`                                       | `bert.embeddings.position_embeddings.weight`
    # | `word_embed.0.weight`                                           | `bert.embeddings.word_embeddings.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_key.bias`      | `bert.encoder.layer.*.attention.self.key.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_key.weight`    | `bert.encoder.layer.*.attention.self.key.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_query.bias`    | `bert.encoder.layer.*.attention.self.query.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_query.weight`  | `bert.encoder.layer.*.attention.self.query.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_value.bias`    | `bert.encoder.layer.*.attention.self.value.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_value.weight`  | `bert.encoder.layer.*.attention.self.value.weight`
    # | `encoder.transformer_cells.*.proj.bias`                         | `bert.encoder.layer.*.attention.output.dense.bias`
    # | `encoder.transformer_cells.*.proj.weight`                       | `bert.encoder.layer.*.attention.output.dense.weight`
    # | `encoder.transformer_cells.*.layer_norm.beta`                   | `bert.encoder.layer.*.attention.output.LayerNorm.bias`
    # | `encoder.transformer_cells.*.layer_norm.gamma`                  | `bert.encoder.layer.*.attention.output.LayerNorm.weight`
    # | `encoder.transformer_cells.*.ffn.ffn_1.bias`                    | `bert.encoder.layer.*.intermediate.dense.bias`
    # | `encoder.transformer_cells.*.ffn.ffn_1.weight`                  | `bert.encoder.layer.*.intermediate.dense.weight`
    # | `encoder.transformer_cells.*.ffn.ffn_2.bias`                    | `bert.encoder.layer.*.output.dense.bias`
    # | `encoder.transformer_cells.*.ffn.ffn_2.weight`                  | `bert.encoder.layer.*.output.dense.weight`
    # | `encoder.transformer_cells.*.ffn.layer_norm.beta`               | `bert.encoder.layer.*.output.LayerNorm.bias`
    # | `encoder.transformer_cells.*.ffn.layer_norm.gamma`              | `bert.encoder.layer.*.output.LayerNorm.weight`

    # Helper function to convert MXNET Arrays to PyTorch
    def to_torch(mx_array) -> torch.nn.Parameter:
        return torch.nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy()))

    # Check param shapes and map new HF param back
    def check_and_map_params(hf_param, gluon_param_name):
        shape_hf = hf_param.shape

        gluon_param = to_torch(params[gluon_param_name])
        shape_gluon = gluon_param.shape

        assert (
            shape_hf == shape_gluon
        ), f"The gluon parameter {gluon_param_name} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"

        return gluon_param

    hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight"
    )
    hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight"
    )
    hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params(
        hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta"
    )
    hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma"
    )

    # Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them)
    hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        hf_bort_model.bert.embeddings.token_type_embeddings.weight.data
    )

    for i in range(hf_bort_config.num_hidden_layers):
        layer: BertLayer = hf_bort_model.bert.encoder.layer[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self

        self_attn.key.bias.data = check_and_map_params(
            self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias"
        )
        self_attn.key.weight.data = check_and_map_params(
            self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight"
        )
        self_attn.query.bias.data = check_and_map_params(
            self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias"
        )
        self_attn.query.weight.data = check_and_map_params(
            self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight"
        )
        self_attn.value.bias.data = check_and_map_params(
            self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias"
        )
        self_attn.value.weight.data = check_and_map_params(
            self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight"
        )

        # self attention output
        self_output: BertSelfOutput = layer.attention.output

        self_output.dense.bias = check_and_map_params(
            self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias"
        )
        self_output.dense.weight = check_and_map_params(
            self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight"
        )
        self_output.LayerNorm.bias = check_and_map_params(
            self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta"
        )
        self_output.LayerNorm.weight = check_and_map_params(
            self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma"
        )

        # intermediate
        intermediate: BertIntermediate = layer.intermediate

        intermediate.dense.bias = check_and_map_params(
            intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias"
        )
        intermediate.dense.weight = check_and_map_params(
            intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight"
        )

        # output
        bert_output: BertOutput = layer.output

        bert_output.dense.bias = check_and_map_params(
            bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias"
        )
        bert_output.dense.weight = check_and_map_params(
            bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight"
        )
        bert_output.LayerNorm.bias = check_and_map_params(
            bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta"
        )
        bert_output.LayerNorm.weight = check_and_map_params(
            bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma"
        )

    # Save space and energy 🎄
    hf_bort_model.half()

    # Compare output of both models
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]

    # Get gluon output
    gluon_input_ids = mx.nd.array([input_ids])
    output_gluon = original_bort(inputs=gluon_input_ids, token_types=[])

    # Get Transformer output (save and reload model again)
    hf_bort_model.save_pretrained(pytorch_dump_folder_path)
    hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path)
    hf_bort_model.eval()

    input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt")
    output_hf = hf_bort_model(**input_ids)[0]

    gluon_layer = output_gluon[0].asnumpy()
    hf_layer = output_hf[0].detach().numpy()

    max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item()
    success = np.allclose(gluon_layer, hf_layer, atol=1e-3)

    if success:
        print("✔️ Both models output the same tensors")
    else:
        print("❌ Both models do **NOT** output the same tensors")
        print("Absolute difference is:", max_absolute_diff)