# imports assumed by this excerpt (module path per GluonNLP 1.x)
import mxnet as mx
import numpy as np

from gluonnlp.models.bart import BartModel


def test_bart_cfg(cfg_key):
    # build a small model with the default batch-major ('NT') layout
    cfg = BartModel.get_cfg(cfg_key)
    cfg.defrost()
    cfg.MODEL.vocab_size = 32
    cfg.freeze()
    model = BartModel.from_cfg(cfg)
    model.initialize()
    model.hybridize()
    # build a time-major ('TN') variant that shares parameters with the first model
    cfg.defrost()
    cfg.MODEL.layout = 'TN'
    cfg.freeze()
    model_tn = BartModel.from_cfg(cfg)
    model_tn.share_parameters(model.collect_params())
    model_tn.hybridize()
    mx.npx.waitall()
def convert_params(fairseq_model, gluon_cfg, ctx):
    fairseq_params = fairseq_model.state_dict()
    # build the target Gluon model that will receive the converted parameters
    gluon_model = BartModel.from_cfg(gluon_cfg, use_pooler=False)
    gluon_model.initialize(ctx=ctx)
    gluon_model.hybridize()
    gluon_params = gluon_model.collect_params()
    all_keys = set(gluon_params.keys())

    def convert_attention(num_layers, fairseq_prefix, gluon_prefix,
                          fairseq_attn_prefix='self_attn',
                          gluon_attn_prefix='attn_qkv'):
        # fairseq stores separate q/k/v projections; Gluon uses a fused qkv projection,
        # so the three weight (and bias) matrices are concatenated along the output axis
        for layer_id in range(num_layers):
            fs_atten_prefix = \
                '{}.layers.{}.{}.' \
                .format(fairseq_prefix, layer_id, fairseq_attn_prefix)
            fs_q_weight = fairseq_params[fs_atten_prefix + 'q_proj.weight'].cpu().numpy()
            fs_k_weight = fairseq_params[fs_atten_prefix + 'k_proj.weight'].cpu().numpy()
            fs_v_weight = fairseq_params[fs_atten_prefix + 'v_proj.weight'].cpu().numpy()
            fs_q_bias = fairseq_params[fs_atten_prefix + 'q_proj.bias'].cpu().numpy()
            fs_k_bias = fairseq_params[fs_atten_prefix + 'k_proj.bias'].cpu().numpy()
            fs_v_bias = fairseq_params[fs_atten_prefix + 'v_proj.bias'].cpu().numpy()
            gl_qkv_prefix = \
                '{}.layers.{}.{}.' \
                .format(gluon_prefix, layer_id, gluon_attn_prefix)
            gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
            gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
            all_keys.remove(gl_qkv_prefix + 'weight')
            all_keys.remove(gl_qkv_prefix + 'bias')
            gl_qkv_weight.set_data(
                np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight], axis=0))
            gl_qkv_bias.set_data(
                np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))

    def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
        # convert the feed-forward (FFN) sub-layers; used for both encoder and decoder
        for layer_id in range(num_layers):
            for k, v in [('fc1.weight', 'ffn.ffn_1.weight'),
                         ('fc1.bias', 'ffn.ffn_1.bias'),
                         ('fc2.weight', 'ffn.ffn_2.weight'),
                         ('fc2.bias', 'ffn.ffn_2.bias'),
                         ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
                         ('final_layer_norm.bias', 'ffn.layer_norm.beta')]:
                fs_name = '{}.layers.{}.{}' \
                          .format(fairseq_prefix, layer_id, k)
                gl_name = '{}.layers.{}.{}' \
                          .format(gluon_prefix, layer_id, v)
                all_keys.remove(gl_name)
                gluon_params[gl_name].set_data(
                    fairseq_params[fs_name].cpu().numpy())

    print('converting embedding params')
    padding_idx = fairseq_model.task.dictionary.pad_index
    for fs_name, gl_name in [
            ('model.encoder.embed_tokens.weight', 'src_embed_layer.weight'),
            ('model.encoder.embed_positions.weight', 'src_pos_embed_layer._embed.weight'),
            ('model.encoder.layernorm_embedding.weight', 'encoder.ln_data.gamma'),
            ('model.encoder.layernorm_embedding.bias', 'encoder.ln_data.beta'),
            ('model.decoder.embed_tokens.weight', 'tgt_embed_layer.weight'),
            ('model.decoder.embed_positions.weight', 'tgt_pos_embed_layer._embed.weight'),
            ('model.decoder.layernorm_embedding.weight', 'decoder.ln_data.gamma'),
            ('model.decoder.layernorm_embedding.bias', 'decoder.ln_data.beta'),
            # final projection in decoder
            ('model.decoder.output_projection.weight', 'tgt_final_layer.weight'),
    ]:
        all_keys.remove(gl_name)
        if 'embed_positions' in fs_name:
            # fairseq reserves the first (padding_idx + 1) rows of the position
            # embedding table; drop them when copying the weights
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy()[padding_idx + 1:, :])
        else:
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    print('converting encoder params')
    encoder_num_layers = gluon_cfg.MODEL.ENCODER.num_layers
    convert_attention(encoder_num_layers, 'model.encoder', 'encoder')
    convert_ffn(encoder_num_layers, 'model.encoder', 'encoder')
    for layer_id in range(encoder_num_layers):
        for k, v in [
                ('self_attn.out_proj.weight', 'attention_proj.weight'),
                ('self_attn.out_proj.bias', 'attention_proj.bias'),
                ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
                ('self_attn_layer_norm.bias', 'layer_norm.beta'),
        ]:
            fs_name = 'model.encoder.layers.{}.{}' \
                      .format(layer_id, k)
            gl_name = 'encoder.layers.{}.{}' \
                      .format(layer_id, v)
            all_keys.remove(gl_name)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    print('converting decoder params')
    decoder_num_layers = gluon_cfg.MODEL.DECODER.num_layers
    convert_attention(decoder_num_layers, 'model.decoder', 'decoder',
                      gluon_attn_prefix='attn_in_qkv')
    convert_ffn(decoder_num_layers, 'model.decoder', 'decoder')
    for layer_id in range(decoder_num_layers):
        for k, v in [
                ('self_attn.out_proj.weight', 'proj_in.weight'),
                ('self_attn.out_proj.bias', 'proj_in.bias'),
                ('self_attn_layer_norm.weight', 'ln_in.gamma'),
                ('self_attn_layer_norm.bias', 'ln_in.beta'),
                ('encoder_attn.out_proj.weight', 'proj_inter.weight'),
                ('encoder_attn.out_proj.bias', 'proj_inter.bias'),
                ('encoder_attn_layer_norm.weight', 'ln_inter.gamma'),
                ('encoder_attn_layer_norm.bias', 'ln_inter.beta'),
                ('encoder_attn.q_proj.weight', 'attn_inter_q.weight'),
                ('encoder_attn.q_proj.bias', 'attn_inter_q.bias'),
                ('encoder_attn.k_proj.weight', 'attn_inter_k.weight'),
                ('encoder_attn.k_proj.bias', 'attn_inter_k.bias'),
                ('encoder_attn.v_proj.weight', 'attn_inter_v.weight'),
                ('encoder_attn.v_proj.bias', 'attn_inter_v.bias'),
        ]:
            fs_name = 'model.decoder.layers.{}.{}' \
                      .format(layer_id, k)
            gl_name = 'decoder.layers.{}.{}' \
                      .format(layer_id, v)
            all_keys.remove(gl_name)
            gluon_params[gl_name].set_data(
                fairseq_params[fs_name].cpu().numpy())

    assert len(all_keys) == 0, \
        'some Gluon parameters were not covered by the fairseq checkpoint'
    # sanity-check parameter tying: the decoder input and output embeddings
    # must match when share_decoder_input_output_embed is True
    assert np.array_equal(
        fairseq_params['model.decoder.embed_tokens.weight'].cpu().numpy(),
        fairseq_params['model.decoder.output_projection.weight'].cpu().numpy())
    return gluon_model
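# Minimal usage sketch (not part of the original script). It assumes a local
# fairseq BART checkpoint directory ('bart.base/' below is a hypothetical path
# containing 'model.pt' and the task dictionary) and a Gluon cfg whose vocab
# and layer sizes already match that checkpoint; the real conversion flow would
# derive the cfg from the fairseq args rather than use the defaults shown here.
if __name__ == '__main__':
    from fairseq.models.bart import BARTModel as FairseqBARTModel

    # from_pretrained returns a hub interface whose state_dict() keys carry the
    # 'model.' prefix expected by convert_params above
    fairseq_bart = FairseqBARTModel.from_pretrained('bart.base',
                                                    checkpoint_file='model.pt')
    gluon_cfg = BartModel.get_cfg()  # assumption: already matches the checkpoint
    gluon_bart = convert_params(fairseq_bart, gluon_cfg, mx.cpu())
    gluon_bart.save_parameters('gluon_bart.params')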