def test_electra_model(compute_layout):
    cfg = get_test_cfg()
    cfg.defrost()
    cfg.MODEL.compute_layout = compute_layout
    cfg.freeze()

    # Generate TN layout
    cfg_tn = cfg.clone()
    cfg_tn.defrost()
    cfg_tn.MODEL.layout = 'TN'
    cfg_tn.freeze()

    # Sample data
    batch_size = 4
    sequence_length = 16
    num_mask = 3
    inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
    token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length))
    valid_length = mx.np.random.randint(3, sequence_length, (batch_size,))
    masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

    electra_model = ElectraModel.from_cfg(cfg)
    electra_model.initialize()
    electra_model.hybridize()
    contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length)

    electra_model_tn = ElectraModel.from_cfg(cfg_tn)
    electra_model_tn.share_parameters(electra_model.collect_params())
    electra_model_tn.hybridize()
    contextual_embedding_tn, pooled_out_tn = electra_model_tn(
        inputs.T, token_types.T, valid_length)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                    1E-4, 1E-4)
    assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4)

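# A minimal sketch of exercising the layout test above for each compute layout. The layout
# values ('auto', 'NT', 'TN') are assumptions about what cfg.MODEL.compute_layout accepts;
# in the test suite this role would normally be played by a pytest parametrization.
def _run_electra_layout_checks():
    for compute_layout in ['auto', 'NT', 'TN']:  # assumed supported layouts
        test_electra_model(compute_layout)
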
def convert_tf_config(config_dict, vocab_size):
    """Convert the config file"""
    assert vocab_size == config_dict['vocab_size']
    cfg = ElectraModel.get_cfg().clone()
    cfg.defrost()
    cfg.MODEL.vocab_size = vocab_size
    cfg.MODEL.units = config_dict['hidden_size']
    cfg.MODEL.embed_size = config_dict['embedding_size']
    cfg.MODEL.hidden_size = config_dict['intermediate_size']
    cfg.MODEL.max_length = config_dict['max_position_embeddings']
    cfg.MODEL.num_heads = config_dict['num_attention_heads']
    cfg.MODEL.num_layers = config_dict['num_hidden_layers']
    cfg.MODEL.pos_embed_type = 'learned'
    cfg.MODEL.activation = config_dict['hidden_act']
    cfg.MODEL.layer_norm_eps = 1E-12
    cfg.MODEL.num_token_types = config_dict['type_vocab_size']
    cfg.MODEL.hidden_dropout_prob = float(config_dict['hidden_dropout_prob'])
    cfg.MODEL.attention_dropout_prob = float(config_dict['attention_probs_dropout_prob'])
    cfg.MODEL.dtype = 'float32'
    cfg.MODEL.generator_layers_scale = config_dict['generator_layers']
    cfg.MODEL.generator_units_scale = config_dict['generator_hidden_size']
    cfg.INITIALIZER.weight = ['truncnorm', 0, config_dict['initializer_range']]  # TruncNorm(0, 0.02)
    cfg.INITIALIZER.bias = ['zeros']
    cfg.VERSION = 1
    cfg.freeze()
    return cfg

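# A hedged sketch of the dictionary shape convert_tf_config() expects, loosely modeled on an
# ELECTRA-small configuration. The concrete numbers below are illustrative placeholders, not
# values read from a real checkpoint; use the config shipped with the TensorFlow model.
def _example_convert_tf_config():
    example_config = {
        'vocab_size': 30522,
        'hidden_size': 256,
        'embedding_size': 128,
        'intermediate_size': 1024,
        'max_position_embeddings': 512,
        'num_attention_heads': 4,
        'num_hidden_layers': 12,
        'hidden_act': 'gelu',
        'type_vocab_size': 2,
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1,
        'generator_layers': 1.0,
        'generator_hidden_size': 0.25,
        'initializer_range': 0.02,
    }
    return convert_tf_config(example_config, vocab_size=example_config['vocab_size'])
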
def get_pretraining_model(model_name, ctx_l,
                          max_seq_length=128,
                          hidden_dropout_prob=0.1,
                          attention_dropout_prob=0.1,
                          generator_units_scale=None,
                          generator_layers_scale=None):
    """An ELECTRA pretraining model is built with a generator and a discriminator, in which
    the generator shares the embeddings of the discriminator but uses a different backbone.
    """
    cfg, tokenizer, _, _ = get_pretrained_electra(model_name, load_backbone=False)
    cfg = ElectraModel.get_cfg().clone_merge(cfg)
    cfg.defrost()
    cfg.MODEL.hidden_dropout_prob = hidden_dropout_prob
    cfg.MODEL.attention_dropout_prob = attention_dropout_prob
    cfg.MODEL.max_length = max_seq_length
    # Keep the original generator size if not designated
    if generator_layers_scale:
        cfg.MODEL.generator_layers_scale = generator_layers_scale
    if generator_units_scale:
        cfg.MODEL.generator_units_scale = generator_units_scale
    cfg.freeze()
    model = ElectraForPretrain(cfg,
                               uniform_generator=False,
                               tied_generator=False,
                               tied_embeddings=True,
                               disallow_correct=False,
                               weight_initializer=TruncNorm(stdev=0.02))
    model.initialize(ctx=ctx_l)
    model.hybridize()
    return cfg, tokenizer, model

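# A minimal usage sketch, assuming 'google_electra_small' is among the names returned by
# list_pretrained_electra() and that a single CPU context is enough for a dry run. The
# returned model is an ElectraForPretrain whose generator shares embeddings with the
# discriminator, as described in the docstring above.
def _example_get_pretraining_model():
    ctx_l = [mx.cpu()]
    cfg, tokenizer, model = get_pretraining_model(
        'google_electra_small', ctx_l,
        max_seq_length=128,
        hidden_dropout_prob=0.1,
        attention_dropout_prob=0.1)
    return cfg, tokenizer, model
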
def test_electra_get_pretrained(model_name, ctx):
    assert len(list_pretrained_electra()) > 0
    with tempfile.TemporaryDirectory() as root, ctx:
        cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\
            get_pretrained_electra(model_name, root=root,
                                   load_backbone=True, load_disc=True, load_gen=True)
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
        electra_model = ElectraModel.from_cfg(cfg)
        electra_model.load_parameters(backbone_params_path)

        electra_disc_model = ElectraDiscriminator(cfg)
        electra_disc_model.load_parameters(disc_params_path)
        electra_disc_model = ElectraDiscriminator(cfg)
        electra_disc_model.backbone_model.load_parameters(backbone_params_path)

        gen_cfg = get_generator_cfg(cfg)
        electra_gen_model = ElectraGenerator(gen_cfg)
        electra_gen_model.load_parameters(gen_params_path)
        electra_gen_model.tie_embeddings(
            electra_disc_model.backbone_model.word_embed.collect_params(),
            electra_disc_model.backbone_model.token_type_embed.collect_params(),
            electra_disc_model.backbone_model.token_pos_embed.collect_params(),
            electra_disc_model.backbone_model.embed_layer_norm.collect_params())

        electra_gen_model = ElectraGenerator(cfg)
        electra_gen_model.backbone_model.load_parameters(backbone_params_path)

def get_test_cfg():
    cfg = ElectraModel.get_cfg()
    cfg.defrost()
    cfg.MODEL.vocab_size = 100
    cfg.MODEL.units = 12 * 8
    cfg.MODEL.hidden_size = 128
    cfg.MODEL.num_heads = 2
    cfg.MODEL.num_layers = 2
    cfg.freeze()
    return cfg

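# A small sketch showing how the tiny test config above can be turned into a working model,
# mirroring test_electra_model(). In the default 'NT' layout the outputs have shapes
# (batch, sequence, units) for the contextual embedding and (batch, units) for the pooled output.
def _example_tiny_forward():
    cfg = get_test_cfg()
    model = ElectraModel.from_cfg(cfg)
    model.initialize()
    inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (2, 8))
    token_types = mx.np.random.randint(0, 2, (2, 8))
    valid_length = mx.np.random.randint(3, 8, (2,))
    contextual_embedding, pooled_out = model(inputs, token_types, valid_length)
    return contextual_embedding, pooled_out
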
def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, electra_path):
    ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    cfg, vocab_path = convert_tf_assets(model_dir, model_size, electra_path)
    with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
        of.write(cfg.dump())
    new_vocab = HuggingFaceWordPieceTokenizer(
        vocab_file=vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        cls_token='[CLS]',
        sep_token='[SEP]',
        mask_token='[MASK]',
        lowercase=True).vocab
    new_vocab.save(os.path.join(save_dir, 'vocab.json'))

    # test input data
    batch_size = 3
    seq_length = 32
    num_mask = 5
    input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
    valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
    input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
        < np.expand_dims(valid_length, 1)
    segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
    mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))

    tf_input_ids = tf.constant(input_ids, dtype=np.int32)
    tf_input_mask = tf.constant(input_mask, dtype=np.int32)
    tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)

    init_checkpoint = os.path.join(model_dir, 'electra_{}'.format(model_size))
    tf_params = read_tf_checkpoint(init_checkpoint)
    # get parameter names for tensorflow with unused parameters filtered out.
    tf_names = sorted(tf_params.keys())
    tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
    tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
    tf_names = filter(lambda name: name != 'global_step', tf_names)
    tf_names = filter(lambda name: name != 'generator_predictions/temperature', tf_names)
    tf_names = list(tf_names)

    # reload the electra module for this local scope
    sys.path.append(electra_path)
    electra_dir = os.path.abspath(
        os.path.join(os.path.dirname(electra_path), os.path.pardir))
    sys.path.append(electra_dir)
    from electra.util.training_utils import get_bert_config
    from electra.configure_pretraining import PretrainingConfig
    from electra.model import modeling

    config = PretrainingConfig(model_name='', data_dir='', model_size=model_size)
    bert_config = get_bert_config(config)
    bert_model = modeling.BertModel(
        bert_config=bert_config,
        is_training=False,
        input_ids=tf_input_ids,
        input_mask=tf_input_mask,
        token_type_ids=tf_segment_ids,
        use_one_hot_embeddings=False,
        embedding_size=cfg.MODEL.embed_size)
    tvars = tf.trainable_variables()
    assignment_map, _ = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # the parameter names end with ':0', e.g. 'electra/embeddings/word_embeddings:0'
        backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
        backbone_params = sess.run(backbone_params)
        tf_token_outputs_np = {
            'pooled_output': sess.run(bert_model.get_pooled_output()),
            'sequence_output': sess.run(bert_model.get_sequence_output()),
        }

    # The following part only ensures that the parameters in the backbone model are valid
    for k in backbone_params:
        assert_allclose(tf_params[k], backbone_params[k])

    # Build gluon model and initialize
    gluon_model = ElectraModel.from_cfg(cfg)
    gluon_model.initialize(ctx=ctx)
    gluon_model.hybridize()

    gluon_disc_model = ElectraDiscriminator(cfg)
    gluon_disc_model.initialize(ctx=ctx)
    gluon_disc_model.hybridize()

    gen_cfg = get_generator_cfg(cfg)
    disc_backbone = gluon_disc_model.backbone_model
    gluon_gen_model = ElectraGenerator(gen_cfg)
    gluon_gen_model.tie_embeddings(
        disc_backbone.word_embed.collect_params(),
        disc_backbone.token_type_embed.collect_params(),
        disc_backbone.token_pos_embed.collect_params(),
        disc_backbone.embed_layer_norm.collect_params())
    gluon_gen_model.initialize(ctx=ctx)
    gluon_gen_model.hybridize()

    # prepare test data
    mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
    mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
    mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
    mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)

    for convert_type in ['backbone', 'disc', 'gen']:
        name_map = get_name_map(tf_names, convert_type=convert_type)
        # go through the gluon model to infer the shape of parameters
        if convert_type == 'backbone':
            model = gluon_model
            contextual_embedding, pooled_output = model(
                mx_input_ids, mx_token_types, mx_valid_length)
        elif convert_type == 'disc':
            model = gluon_disc_model
            contextual_embedding, pooled_output, rtd_scores = \
                model(mx_input_ids, mx_token_types, mx_valid_length)
        elif convert_type == 'gen':
            model = gluon_gen_model
            contextual_embedding, pooled_output, mlm_scores = \
                model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)

        # replace tensorflow parameter names with gluon parameter names
        mx_params = model.collect_params()
        all_keys = set(mx_params.keys())
        for (src_name, dst_name) in name_map.items():
            tf_param_val = tf_params[src_name]
            if dst_name is None:
                continue
            all_keys.remove(dst_name)
            if src_name.endswith('kernel'):
                mx_params[dst_name].set_data(tf_param_val.T)
            else:
                mx_params[dst_name].set_data(tf_param_val)

        # Merge query/kernel, key/kernel, value/kernel to encoder.all_encoder_groups.0.attn_qkv.weight
        def convert_qkv_weights(tf_prefix, mx_prefix):
            """Convert the qkv weights that use different prefixes.

            In the TensorFlow framework, the query/key/value prefix for the ALBERT model is
            'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',
            while for the BERT model it is 'bert/encoder/layer_{}/attention/self/key/bias'.
            In the GluonNLP framework, the prefix is slightly different:
            'encoder.all_encoder_groups.0.attn_qkv.weight' for the ALBERT model and
            'encoder.all_layers.{}.attn_qkv.weight' for the BERT model, where the curly
            braces {} are filled with the layer number.
            """
            # Merge query_weight, key_weight, value_weight to mx_params
            query_weight = tf_params['{}/query/kernel'.format(tf_prefix)]
            key_weight = tf_params['{}/key/kernel'.format(tf_prefix)]
            value_weight = tf_params['{}/value/kernel'.format(tf_prefix)]
            mx_params['{}.attn_qkv.weight'.format(mx_prefix)].set_data(
                np.concatenate([query_weight, key_weight, value_weight], axis=1).T)
            # Merge query_bias, key_bias, value_bias to mx_params
            query_bias = tf_params['{}/query/bias'.format(tf_prefix)]
            key_bias = tf_params['{}/key/bias'.format(tf_prefix)]
            value_bias = tf_params['{}/value/bias'.format(tf_prefix)]
            mx_params['{}.attn_qkv.bias'.format(mx_prefix)].set_data(
                np.concatenate([query_bias, key_bias, value_bias], axis=0))

        # The following parameters of the generator are already initialized in the
        # discriminator, so there is no need to reload them.
        disc_embed_params = set([
            'backbone_model.embed_layer_norm.beta',
            'backbone_model.embed_layer_norm.gamma',
            'backbone_model.token_pos_embed._embed.weight',
            'backbone_model.token_type_embed.weight',
            'mlm_decoder.3.weight',
            'backbone_model.word_embed.weight'
        ])

        for key in all_keys:
            if convert_type == 'gen' and key in disc_embed_params:
                continue
            assert re.match(
                r'^(backbone_model\.){0,1}encoder\.all_encoder_layers\.[\d]+\.attn_qkv\.(weight|bias)$',
                key) is not None, 'Parameter key {} mismatch'.format(key)

        tf_prefix = None
        for layer_id in range(cfg.MODEL.num_layers):
            mx_prefix = 'encoder.all_encoder_layers.{}'.format(layer_id)
            if convert_type == 'gen':
                mx_prefix = 'backbone_model.' + mx_prefix
                tf_prefix = 'generator/encoder/layer_{}/attention/self'.format(layer_id)
            elif convert_type == 'disc':
                mx_prefix = 'backbone_model.' + mx_prefix
                tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(layer_id)
            else:
                tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(layer_id)

            convert_qkv_weights(tf_prefix, mx_prefix)

        if convert_type == 'backbone':
            # test conversion results for backbone model
            if test_conversion:
                tf_contextual_embedding = tf_token_outputs_np['sequence_output']
                tf_pooled_output = tf_token_outputs_np['pooled_output']
                contextual_embedding, pooled_output = model(
                    mx_input_ids, mx_token_types, mx_valid_length)
                assert_allclose(pooled_output.asnumpy(), tf_pooled_output, 1E-3, 1E-3)
                for i in range(batch_size):
                    ele_valid_length = valid_length[i]
                    assert_allclose(
                        contextual_embedding[i, :ele_valid_length, :].asnumpy(),
                        tf_contextual_embedding[i, :ele_valid_length, :],
                        1E-3, 1E-3)
            model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
            logging.info('Convert the backbone model in {} to {}/{}'.format(
                model_dir, save_dir, 'model.params'))
        elif convert_type == 'disc':
            model.save_parameters(os.path.join(save_dir, 'disc_model.params'), deduplicate=True)
            logging.info('Convert the discriminator model in {} to {}/{}'.format(
                model_dir, save_dir, 'disc_model.params'))
        elif convert_type == 'gen':
            model.save_parameters(os.path.join(save_dir, 'gen_model.params'), deduplicate=True)
            logging.info('Convert the generator model in {} to {}/{}'.format(
                model_dir, save_dir, 'gen_model.params'))

    logging.info('Conversion finished!')
    logging.info('Statistics:')
    old_names = os.listdir(save_dir)
    for old_name in old_names:
        new_name, long_hash = naming_convention(save_dir, old_name)
        old_path = os.path.join(save_dir, old_name)
        new_path = os.path.join(save_dir, new_name)
        shutil.move(old_path, new_path)
        file_size = os.path.getsize(new_path)
        logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))

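# A hedged sketch of invoking the converter directly. The real script is normally driven by a
# command-line parser that is not shown here, and all paths and the model size below are
# placeholders for illustration only.
def _example_convert_tf_model():
    convert_tf_model(model_dir='electra_small',        # directory holding the TF checkpoint
                     save_dir='gluon_electra_small',   # output directory for the Gluon artifacts
                     test_conversion=True,             # compare TF and Gluon backbone outputs
                     model_size='small',               # ELECTRA size tag used in the checkpoint name
                     gpu=None,                         # None -> run the conversion on CPU
                     electra_path='/path/to/electra')  # local checkout of the original ELECTRA repo
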