def test_mobilebert_get_pretrained(model_name):
    with tempfile.TemporaryDirectory() as root:
        cfg, tokenizer, backbone_params_path, mlm_params_path =\
            get_pretrained_mobilebert(model_name, load_backbone=True, load_mlm=True, root=root)
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
        # The backbone parameters should load into a model built from the config.
        mobilebert_model = MobileBertModel.from_cfg(cfg)
        mobilebert_model.load_parameters(backbone_params_path)
        # The MLM/NSP parameters, if available, should load into the pretraining model.
        mobilebert_pretrain_model = MobileBertForPretrain(cfg)
        if mlm_params_path is not None:
            mobilebert_pretrain_model.load_parameters(mlm_params_path)
        # The backbone parameters alone should also load into a fresh pretraining model.
        mobilebert_pretrain_model = MobileBertForPretrain(cfg)
        mobilebert_pretrain_model.backbone_model.load_parameters(backbone_params_path)
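# A minimal usage sketch (an assumption, not part of the original tests): once
# get_pretrained_mobilebert() has downloaded the assets as in the test above,
# the tokenizer and backbone can encode a sentence end to end. The model name
# 'google_uncased_mobilebert' is assumed here purely for illustration.
import mxnet as mx
import numpy as np
from gluonnlp.models.mobilebert import MobileBertModel, get_pretrained_mobilebert

# NOTE: the model name below is an assumption for illustration.
cfg, tokenizer, backbone_params_path, _ = get_pretrained_mobilebert(
    'google_uncased_mobilebert', load_backbone=True, load_mlm=False)
model = MobileBertModel.from_cfg(cfg)
model.load_parameters(backbone_params_path)
model.hybridize()

# Tokenize one sentence and run it through the backbone.
token_ids = mx.np.array([tokenizer.encode('hello world', int)], dtype=np.int32)
token_types = mx.np.zeros_like(token_ids)
valid_length = mx.np.array([token_ids.shape[1]], dtype=np.int32)
contextual_embedding, pooled_out = model(token_ids, token_types, valid_length)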
def test_mobilebert_model_small_cfg(compute_layout, ctx):
    with ctx:
        cfg = MobileBertModel.get_cfg()
        cfg.defrost()
        cfg.MODEL.vocab_size = 100
        cfg.MODEL.num_layers = 2
        cfg.MODEL.hidden_size = 128
        cfg.MODEL.num_heads = 2
        cfg.MODEL.compute_layout = compute_layout
        cfg.freeze()

        # Generate TN layout
        cfg_tn = cfg.clone()
        cfg_tn.defrost()
        cfg_tn.MODEL.layout = 'TN'
        cfg_tn.freeze()

        batch_size = 4
        sequence_length = 16
        num_mask = 3
        inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
        token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length))
        valid_length = mx.np.random.randint(3, sequence_length, (batch_size,))
        masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

        # The NT and TN backbones share parameters and must agree on outputs.
        mobile_bert_model = MobileBertModel.from_cfg(cfg)
        mobile_bert_model.initialize()
        mobile_bert_model.hybridize()
        mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn)
        mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params())
        mobile_bert_model_tn.hybridize()
        contextual_embedding, pooled_out = mobile_bert_model(
            inputs, token_types, valid_length)
        contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(
            inputs.T, token_types.T, valid_length)
        assert_allclose(contextual_embedding.asnumpy(),
                        np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                        1E-3, 1E-3)
        assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3)

        # Test for MobileBertForMLM
        mobile_bert_mlm_model = MobileBertForMLM(cfg)
        mobile_bert_mlm_model.initialize()
        mobile_bert_mlm_model.hybridize()
        mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn)
        mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params())
        mobile_bert_mlm_model_tn.hybridize()
        contextual_embedding, pooled_out, mlm_score = mobile_bert_mlm_model(
            inputs, token_types, valid_length, masked_positions)
        contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\
            mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions)
        assert_allclose(contextual_embedding.asnumpy(),
                        np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                        1E-3, 1E-3)
        assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3)
        assert_allclose(mlm_score_tn.asnumpy(), mlm_score.asnumpy(), 1E-3, 1E-3)

        # Test for MobileBertForPretrain
        mobile_bert_pretrain_model = MobileBertForPretrain(cfg)
        mobile_bert_pretrain_model.initialize()
        mobile_bert_pretrain_model.hybridize()
        mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn)
        mobile_bert_pretrain_model_tn.share_parameters(
            mobile_bert_pretrain_model.collect_params())
        mobile_bert_pretrain_model_tn.hybridize()
        contextual_embedding, pooled_out, nsp_score, mlm_score =\
            mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions)
        contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_score_tn = \
            mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length,
                                          masked_positions)
        assert_allclose(contextual_embedding.asnumpy(),
                        np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                        1E-3, 1E-3)
        assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3)
        assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-3, 1E-3)
        assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-3, 1E-3)

        # Test for fp16
        if ctx.device_type == 'gpu':
            pytest.skip('MobileBERT will have nan values in FP16 mode.')
            verify_backbone_fp16(model_cls=MobileBertModel, cfg=cfg, ctx=ctx,
                                 inputs=[inputs, token_types, valid_length])
def convert_tf_model(model_dir, save_dir, test_conversion, gpu, mobilebert_dir):
    ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    cfg, json_cfg_path, vocab_path = convert_tf_assets(model_dir)
    with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
        of.write(cfg.dump())
    new_vocab = HuggingFaceWordPieceTokenizer(
        vocab_file=vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        cls_token='[CLS]',
        sep_token='[SEP]',
        mask_token='[MASK]',
        lowercase=True).vocab
    new_vocab.save(os.path.join(save_dir, 'vocab.json'))

    # test input data
    batch_size = 3
    seq_length = 32
    num_mask = 5
    input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
    valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
    input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1),
                                 (batch_size, seq_length)) \
        < np.expand_dims(valid_length, 1)
    segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
    mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))

    tf_input_ids = tf.constant(input_ids, dtype=np.int32)
    tf_input_mask = tf.constant(input_mask, dtype=np.int32)
    tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)

    init_checkpoint = os.path.join(model_dir, 'mobilebert_variables.ckpt')
    tf_params = read_tf_checkpoint(init_checkpoint)
    # get the tensorflow parameter names, with unused (optimizer) parameters filtered out
    tf_names = sorted(tf_params.keys())
    tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
    tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
    tf_names = filter(lambda name: name != 'global_step', tf_names)
    tf_names = list(tf_names)

    sys.path.append(mobilebert_dir)
    from mobilebert import modeling

    tf_bert_config = modeling.BertConfig.from_json_file(json_cfg_path)
    bert_model = modeling.BertModel(
        config=tf_bert_config,
        is_training=False,
        input_ids=tf_input_ids,
        input_mask=tf_input_mask,
        token_type_ids=tf_segment_ids,
        use_one_hot_embeddings=False)
    tvars = tf.trainable_variables()
    assignment_map, _ = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # the parameter names end with ':0', e.g. 'MobileBert/embeddings/word_embeddings:0'
        backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
        backbone_params = sess.run(backbone_params)
        tf_token_outputs_np = {
            'pooled_output': sess.run(bert_model.get_pooled_output()),
            'sequence_output': sess.run(bert_model.get_sequence_output()),
        }

    # The following part only ensures that the parameters in the backbone model are valid
    for k in backbone_params:
        assert_allclose(tf_params[k], backbone_params[k])

    # Build the gluon model and initialize it
    gluon_pretrain_model = MobileBertForPretrain(cfg)
    gluon_pretrain_model.initialize(ctx=ctx)
    gluon_pretrain_model.hybridize()

    # prepare test data
    mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
    mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
    mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
    mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)

    has_mlm = True
    name_map = get_name_map(tf_names, cfg.MODEL.num_stacked_ffn)
    # go through the gluon model to infer the shape of parameters
    model = gluon_pretrain_model
    contextual_embedding, pooled_output, nsp_score, mlm_scores = \
        model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
    # replace tensorflow parameter names with gluon parameter names
    mx_params = model.collect_params()
    all_keys = set(mx_params.keys())
    for (src_name, dst_name) in name_map.items():
        tf_param_val = tf_params[src_name]
        if dst_name is None:
            continue
        all_keys.remove(dst_name)
        if src_name.endswith('kernel'):
            mx_params[dst_name].set_data(tf_param_val.T)
        else:
            mx_params[dst_name].set_data(tf_param_val)
    if has_mlm:
        # 'embedding_table.weight' is shared with word_embed.weight
        all_keys.remove('embedding_table.weight')
    assert len(all_keys) == 0, 'parameters missing from tensorflow checkpoint'

    # test conversion results for the backbone model
    if test_conversion:
        tf_contextual_embedding = tf_token_outputs_np['sequence_output']
        tf_pooled_output = tf_token_outputs_np['pooled_output']
        contextual_embedding, pooled_output = model.backbone_model(
            mx_input_ids, mx_token_types, mx_valid_length)
        assert_allclose(pooled_output.asnumpy(), tf_pooled_output, 1E-2, 1E-2)
        for i in range(batch_size):
            ele_valid_length = valid_length[i]
            assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
                            tf_contextual_embedding[i, :ele_valid_length, :], 1E-2, 1E-2)

    model.backbone_model.save_parameters(os.path.join(save_dir, 'model.params'),
                                         deduplicate=True)
    logging.info('Convert the backbone model in {} to {}/{}'.format(
        model_dir, save_dir, 'model.params'))
    model.save_parameters(os.path.join(save_dir, 'model_mlm.params'), deduplicate=True)
    logging.info('Convert the MLM and NSP model in {} to {}/{}'.format(
        model_dir, save_dir, 'model_mlm.params'))

    logging.info('Conversion finished!')
    logging.info('Statistics:')

    old_names = os.listdir(save_dir)
    for old_name in old_names:
        new_name, long_hash = naming_convention(save_dir, old_name)
        old_path = os.path.join(save_dir, old_name)
        new_path = os.path.join(save_dir, new_name)
        shutil.move(old_path, new_path)
        file_size = os.path.getsize(new_path)
        logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
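# A hedged driver sketch (an assumption, not taken from the conversion script
# itself): convert_tf_model() above would typically be called from a small CLI
# wrapper. The flag names and defaults below are illustrative only.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert a TF MobileBERT checkpoint to Gluon parameters.')
    parser.add_argument('--model_dir', required=True,
                        help='Directory with the TF checkpoint, config and vocab assets.')
    parser.add_argument('--mobilebert_dir', required=True,
                        help='Path to the official mobilebert repo (provides modeling.py).')
    parser.add_argument('--save_dir', default='converted_mobilebert',
                        help='Where to write model.yml, vocab.json and the .params files.')
    parser.add_argument('--gpu', type=int, default=None,
                        help='GPU id to use; runs on CPU if omitted.')
    parser.add_argument('--test', action='store_true',
                        help='Compare Gluon outputs against the TF outputs after conversion.')
    args = parser.parse_args()
    convert_tf_model(args.model_dir, args.save_dir, args.test, args.gpu, args.mobilebert_dir)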
def test_mobilebert_model_small_cfg(compute_layout):
    cfg = MobileBertModel.get_cfg()
    cfg.defrost()
    cfg.MODEL.vocab_size = 100
    cfg.MODEL.num_layers = 2
    cfg.MODEL.hidden_size = 128
    cfg.MODEL.num_heads = 2
    cfg.MODEL.compute_layout = compute_layout
    cfg.freeze()

    # Generate TN layout
    cfg_tn = cfg.clone()
    cfg_tn.defrost()
    cfg_tn.MODEL.layout = 'TN'
    cfg_tn.freeze()

    batch_size = 4
    sequence_length = 16
    num_mask = 3
    inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
    token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length))
    valid_length = mx.np.random.randint(3, sequence_length, (batch_size,))
    masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))

    # The NT and TN backbones share parameters and must agree on outputs.
    mobile_bert_model = MobileBertModel.from_cfg(cfg)
    mobile_bert_model.initialize()
    mobile_bert_model.hybridize()
    mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn)
    mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params())
    mobile_bert_model_tn.hybridize()
    contextual_embedding, pooled_out = mobile_bert_model(
        inputs, token_types, valid_length)
    contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(
        inputs.T, token_types.T, valid_length)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                    1E-4, 1E-4)
    assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4)

    # Test for MobileBertForMLM
    mobile_bert_mlm_model = MobileBertForMLM(cfg)
    mobile_bert_mlm_model.initialize()
    mobile_bert_mlm_model.hybridize()
    mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn)
    mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params())
    mobile_bert_mlm_model_tn.hybridize()
    contextual_embedding, pooled_out, mlm_scores = mobile_bert_mlm_model(
        inputs, token_types, valid_length, masked_positions)
    contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\
        mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                    1E-4, 1E-4)
    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)
    assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4)

    # Test for MobileBertForPretrain
    mobile_bert_pretrain_model = MobileBertForPretrain(cfg)
    mobile_bert_pretrain_model.initialize()
    mobile_bert_pretrain_model.hybridize()
    mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn)
    mobile_bert_pretrain_model_tn.share_parameters(
        mobile_bert_pretrain_model.collect_params())
    mobile_bert_pretrain_model_tn.hybridize()
    contextual_embedding, pooled_out, nsp_score, mlm_scores =\
        mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions)
    contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \
        mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length,
                                      masked_positions)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
                    1E-4, 1E-4)
    assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4)
    assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4)
    assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4)
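# A short sketch of how the single-argument layout test above could be
# parametrized over the supported compute layouts. The parametrize values and
# wrapper name are assumptions for illustration, not taken from the excerpt.
import pytest

@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN'])
def test_mobilebert_small_cfg_all_layouts(compute_layout):
    # Thin wrapper that re-runs the consistency checks defined immediately above.
    test_mobilebert_model_small_cfg(compute_layout)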