Example #1
def test_electra_model(compute_layout):
    cfg = get_test_cfg()
    cfg.defrost()
    cfg.MODEL.compute_layout = compute_layout
    cfg.freeze()

    # Generate TN layout
    cfg_tn = cfg.clone()
    cfg_tn.defrost()
    cfg_tn.MODEL.layout = 'TN'
    cfg_tn.freeze()

    # Sample data
    batch_size = 4
    sequence_length = 16
    num_mask = 3
    inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
    token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length))
    valid_length = mx.np.random.randint(3, sequence_length, (batch_size, ))
    masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))  # unused in this excerpt

    electra_model = ElectraModel.from_cfg(cfg)
    electra_model.initialize()
    electra_model.hybridize()
    contextual_embedding, pooled_out = electra_model(inputs, token_types,
                                                     valid_length)
    electra_model_tn = ElectraModel.from_cfg(cfg_tn)
    electra_model_tn.share_parameters(electra_model.collect_params())
    electra_model_tn.hybridize()
    contextual_embedding_tn, pooled_out_tn = electra_model_tn(
        inputs.T, token_types.T, valid_length)
    assert_allclose(contextual_embedding.asnumpy(),
                    np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), 1E-4,
                    1E-4)
    assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4)
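In the test suite this check is typically run once per supported compute layout; a minimal driver sketch under that assumption (the layout values below are plausible settings for cfg.MODEL.compute_layout, not taken from the snippet):

# Hypothetical driver; each run verifies that NT and TN layouts agree numerically.
for layout in ['auto', 'TN']:
    test_electra_model(layout)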
Example #2
def convert_tf_config(config_dict, vocab_size):
    """Convert the config file"""

    assert vocab_size == config_dict['vocab_size']
    cfg = ElectraModel.get_cfg().clone()
    cfg.defrost()
    cfg.MODEL.vocab_size = vocab_size
    cfg.MODEL.units = config_dict['hidden_size']
    cfg.MODEL.embed_size = config_dict['embedding_size']
    cfg.MODEL.hidden_size = config_dict['intermediate_size']
    cfg.MODEL.max_length = config_dict['max_position_embeddings']
    cfg.MODEL.num_heads = config_dict['num_attention_heads']
    cfg.MODEL.num_layers = config_dict['num_hidden_layers']
    cfg.MODEL.pos_embed_type = 'learned'
    cfg.MODEL.activation = config_dict['hidden_act']
    cfg.MODEL.layer_norm_eps = 1E-12
    cfg.MODEL.num_token_types = config_dict['type_vocab_size']
    cfg.MODEL.hidden_dropout_prob = float(config_dict['hidden_dropout_prob'])
    cfg.MODEL.attention_dropout_prob = float(
        config_dict['attention_probs_dropout_prob'])
    cfg.MODEL.dtype = 'float32'
    cfg.MODEL.generator_layers_scale = config_dict['generator_layers']
    cfg.MODEL.generator_units_scale = config_dict['generator_hidden_size']
    cfg.INITIALIZER.weight = [
        'truncnorm', 0, config_dict['initializer_range']
    ]  # TruncNorm(0, 0.02)
    cfg.INITIALIZER.bias = ['zeros']
    cfg.VERSION = 1
    cfg.freeze()
    return cfg
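A hedged usage sketch for this converter, assuming a config_dict has been loaded from the JSON config that accompanies the TensorFlow checkpoint (the file path below is hypothetical):

import json

# Hypothetical path; adjust to the actual checkpoint assets.
with open('electra_small/electra_config.json') as f:
    config_dict = json.load(f)
cfg = convert_tf_config(config_dict, config_dict['vocab_size'])
print(cfg.MODEL.units, cfg.MODEL.num_layers)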
Example #3
def get_pretraining_model(model_name,
                          ctx_l,
                          max_seq_length=128,
                          hidden_dropout_prob=0.1,
                          attention_dropout_prob=0.1,
                          generator_units_scale=None,
                          generator_layers_scale=None):
    """
    A Electra Pretrain Model is built with a generator and a discriminator, in which
    the generator has the same embedding as the discriminator but different backbone.
    """
    cfg, tokenizer, _, _ = get_pretrained_electra(model_name,
                                                  load_backbone=False)
    cfg = ElectraModel.get_cfg().clone_merge(cfg)
    cfg.defrost()
    cfg.MODEL.hidden_dropout_prob = hidden_dropout_prob
    cfg.MODEL.attention_dropout_prob = attention_dropout_prob
    cfg.MODEL.max_length = max_seq_length
    # Keep the original generator size unless explicitly overridden
    if generator_layers_scale:
        cfg.MODEL.generator_layers_scale = generator_layers_scale
    if generator_units_scale:
        cfg.MODEL.generator_units_scale = generator_units_scale
    cfg.freeze()

    model = ElectraForPretrain(cfg,
                               uniform_generator=False,
                               tied_generator=False,
                               tied_embeddings=True,
                               disallow_correct=False,
                               weight_initializer=TruncNorm(stdev=0.02))
    model.initialize(ctx=ctx_l)
    model.hybridize()
    return cfg, tokenizer, model
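A minimal calling sketch, assuming a single-device context list and a released model name from list_pretrained_electra() (the exact name string is an assumption):

import mxnet as mx

ctx_l = [mx.gpu(0)] if mx.context.num_gpus() > 0 else [mx.cpu()]
# 'google_electra_small' is a plausible entry of list_pretrained_electra().
cfg, tokenizer, model = get_pretraining_model('google_electra_small', ctx_l,
                                              max_seq_length=128,
                                              hidden_dropout_prob=0.1,
                                              attention_dropout_prob=0.1)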
Example #4
def test_electra_get_pretrained(model_name, ctx):
    assert len(list_pretrained_electra()) > 0
    with tempfile.TemporaryDirectory() as root, ctx:
        cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\
            get_pretrained_electra(model_name, root=root,
                                   load_backbone=True, load_disc=True, load_gen=True)
        assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
        electra_model = ElectraModel.from_cfg(cfg)
        electra_model.load_parameters(backbone_params_path)

        # Load the discriminator from its dedicated parameter file ...
        electra_disc_model = ElectraDiscriminator(cfg)
        electra_disc_model.load_parameters(disc_params_path)
        # ... or load only the backbone parameters into a fresh instance.
        electra_disc_model = ElectraDiscriminator(cfg)
        electra_disc_model.backbone_model.load_parameters(backbone_params_path)

        gen_cfg = get_generator_cfg(cfg)
        electra_gen_model = ElectraGenerator(gen_cfg)
        electra_gen_model.load_parameters(gen_params_path)
        electra_gen_model.tie_embeddings(
            electra_disc_model.backbone_model.word_embed.collect_params(),
            electra_disc_model.backbone_model.token_type_embed.collect_params(
            ),
            electra_disc_model.backbone_model.token_pos_embed.collect_params(),
            electra_disc_model.backbone_model.embed_layer_norm.collect_params(
            ))

        # A generator built from the full config can also load just the backbone parameters.
        electra_gen_model = ElectraGenerator(cfg)
        electra_gen_model.backbone_model.load_parameters(backbone_params_path)
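A hedged sketch of how this check could be driven across all released models on CPU (the parametrization below is an assumption, not the original test decorator):

import pytest
import mxnet as mx

@pytest.mark.parametrize('model_name', list_pretrained_electra())
def test_electra_get_pretrained_all(model_name):
    # Re-run the loading and embedding-tying checks above on CPU for each model.
    test_electra_get_pretrained(model_name, mx.cpu())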
Example #5
def get_test_cfg():
    cfg = ElectraModel.get_cfg()
    cfg.defrost()
    cfg.MODEL.vocab_size = 100
    cfg.MODEL.units = 12 * 8
    cfg.MODEL.hidden_size = 128
    cfg.MODEL.num_heads = 2
    cfg.MODEL.num_layers = 2
    cfg.freeze()
    return cfg
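A quick smoke-test sketch that feeds this small config into the backbone from Example #1 (the batch size and sequence length below are illustrative assumptions):

import mxnet as mx

cfg = get_test_cfg()
model = ElectraModel.from_cfg(cfg)
model.initialize()
# Batch of 2 sequences of length 8 drawn from the 100-token test vocabulary.
tokens = mx.np.random.randint(0, cfg.MODEL.vocab_size, (2, 8))
token_types = mx.np.zeros((2, 8), dtype='int32')
valid_length = mx.np.array([8, 5], dtype='int32')
contextual_embedding, pooled_out = model(tokens, token_types, valid_length)
print(contextual_embedding.shape, pooled_out.shape)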
Example #6
def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu,
                     electra_path):
    ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    cfg, vocab_path = convert_tf_assets(model_dir, model_size, electra_path)
    with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
        of.write(cfg.dump())
    new_vocab = HuggingFaceWordPieceTokenizer(vocab_file=vocab_path,
                                              unk_token='[UNK]',
                                              pad_token='[PAD]',
                                              cls_token='[CLS]',
                                              sep_token='[SEP]',
                                              mask_token='[MASK]',
                                              lowercase=True).vocab
    new_vocab.save(os.path.join(save_dir, 'vocab.json'))

    # test input data
    batch_size = 3
    seq_length = 32
    num_mask = 5
    input_ids = np.random.randint(0, cfg.MODEL.vocab_size,
                                  (batch_size, seq_length))
    valid_length = np.random.randint(seq_length // 2, seq_length,
                                     (batch_size, ))
    input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
        < np.expand_dims(valid_length, 1)
    segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
    mlm_positions = np.random.randint(0, seq_length // 2,
                                      (batch_size, num_mask))

    tf_input_ids = tf.constant(input_ids, dtype=np.int32)
    tf_input_mask = tf.constant(input_mask, dtype=np.int32)
    tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)

    init_checkpoint = os.path.join(model_dir, 'electra_{}'.format(model_size))
    tf_params = read_tf_checkpoint(init_checkpoint)
    # Get parameter names for TensorFlow, with unused parameters filtered out.
    tf_names = sorted(tf_params.keys())
    tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
    tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
    tf_names = filter(lambda name: name != 'global_step', tf_names)
    tf_names = filter(lambda name: name != 'generator_predictions/temperature',
                      tf_names)
    tf_names = list(tf_names)

    # reload the electra module for this local scope
    sys.path.append(electra_path)
    electra_dir = os.path.abspath(
        os.path.join(os.path.dirname(electra_path), os.path.pardir))
    sys.path.append(electra_dir)
    from electra.util.training_utils import get_bert_config
    from electra.configure_pretraining import PretrainingConfig
    from electra.model import modeling

    config = PretrainingConfig(model_name='',
                               data_dir='',
                               model_size=model_size)
    bert_config = get_bert_config(config)
    bert_model = modeling.BertModel(bert_config=bert_config,
                                    is_training=False,
                                    input_ids=tf_input_ids,
                                    input_mask=tf_input_mask,
                                    token_type_ids=tf_segment_ids,
                                    use_one_hot_embeddings=False,
                                    embedding_size=cfg.MODEL.embed_size)
    tvars = tf.trainable_variables()
    assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
        tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # The parameter names end with ':0', e.g.
        # 'electra/embeddings/word_embeddings:0'
        backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
        backbone_params = sess.run(backbone_params)
        tf_token_outputs_np = {
            'pooled_output': sess.run(bert_model.get_pooled_output()),
            'sequence_output': sess.run(bert_model.get_sequence_output()),
        }

    # The following part only ensures that the parameters in the backbone model are valid
    for k in backbone_params:
        assert_allclose(tf_params[k], backbone_params[k])

    # Build gluon model and initialize
    gluon_model = ElectraModel.from_cfg(cfg)
    gluon_model.initialize(ctx=ctx)
    gluon_model.hybridize()

    gluon_disc_model = ElectraDiscriminator(cfg)
    gluon_disc_model.initialize(ctx=ctx)
    gluon_disc_model.hybridize()

    gen_cfg = get_generator_cfg(cfg)
    disc_backbone = gluon_disc_model.backbone_model
    gluon_gen_model = ElectraGenerator(gen_cfg)
    gluon_gen_model.tie_embeddings(
        disc_backbone.word_embed.collect_params(),
        disc_backbone.token_type_embed.collect_params(),
        disc_backbone.token_pos_embed.collect_params(),
        disc_backbone.embed_layer_norm.collect_params())
    gluon_gen_model.initialize(ctx=ctx)
    gluon_gen_model.hybridize()

    # Prepare test data
    mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
    mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
    mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
    mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)

    for convert_type in ['backbone', 'disc', 'gen']:
        name_map = get_name_map(tf_names, convert_type=convert_type)
        # Run a forward pass through the Gluon model to infer the parameter shapes

        if convert_type == 'backbone':
            model = gluon_model
            contextual_embedding, pooled_output = model(
                mx_input_ids, mx_token_types, mx_valid_length)
        elif convert_type == 'disc':
            model = gluon_disc_model
            contextual_embedding, pooled_output, rtd_scores = \
                model(mx_input_ids, mx_token_types, mx_valid_length)
        elif convert_type == 'gen':
            model = gluon_gen_model
            contextual_embedding, pooled_output, mlm_scores = \
                model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)

        # Copy TensorFlow parameter values into the Gluon parameters using the name map
        mx_params = model.collect_params()
        all_keys = set(mx_params.keys())
        for (src_name, dst_name) in name_map.items():
            tf_param_val = tf_params[src_name]
            if dst_name is None:
                continue
            all_keys.remove(dst_name)
            if src_name.endswith('kernel'):
                mx_params[dst_name].set_data(tf_param_val.T)
            else:
                mx_params[dst_name].set_data(tf_param_val)

        # Merge query/kernel, key/kernel, value/kernel to encoder.all_encoder_groups.0.attn_qkv.weight
        def convert_qkv_weights(tf_prefix, mx_prefix):
            """
            To convert the qkv weights with different prefix.

            In tensorflow framework, the prefix of query/key/value for the albert model is
            'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',
            and that for the bert model is 'bert/encoder/layer_{}/attention/self/key/bias'.
            In gluonnlp framework, the prefix is slightly different as
            'encoder.all_encoder_groups.0.attn_qkv.weight' for albert model and
            'encoder.all_layers.{}.attn_qkv.weight' for bert model, as the
            curly braces {} can be filled with the layer number.
            """
            # Merge query_weight, key_weight, value_weight to mx_params
            query_weight = tf_params['{}/query/kernel'.format(tf_prefix)]
            key_weight = tf_params['{}/key/kernel'.format(tf_prefix)]
            value_weight = tf_params['{}/value/kernel'.format(tf_prefix)]
            mx_params['{}.attn_qkv.weight'.format(mx_prefix)].set_data(
                np.concatenate([query_weight, key_weight, value_weight],
                               axis=1).T)
            # Merge query_bias, key_bias, value_bias to mx_params
            query_bias = tf_params['{}/query/bias'.format(tf_prefix)]
            key_bias = tf_params['{}/key/bias'.format(tf_prefix)]
            value_bias = tf_params['{}/value/bias'.format(tf_prefix)]
            mx_params['{}.attn_qkv.bias'.format(mx_prefix)].set_data(
                np.concatenate([query_bias, key_bias, value_bias], axis=0))

        # The following generator parameters are already initialized in the
        # discriminator, so there is no need to reload them.
        disc_embed_params = set([
            'backbone_model.embed_layer_norm.beta',
            'backbone_model.embed_layer_norm.gamma',
            'backbone_model.token_pos_embed._embed.weight',
            'backbone_model.token_type_embed.weight', 'mlm_decoder.3.weight',
            'backbone_model.word_embed.weight'
        ])

        for key in all_keys:
            if convert_type == 'gen' and key in disc_embed_params:
                continue
            assert re.match(
                r'^(backbone_model\.){0,1}encoder\.all_encoder_layers\.[\d]+\.attn_qkv\.(weight|bias)$',
                key) is not None, 'Parameter key {} mismatch'.format(key)

        tf_prefix = None
        for layer_id in range(cfg.MODEL.num_layers):
            mx_prefix = 'encoder.all_encoder_layers.{}'.format(layer_id)
            if convert_type == 'gen':
                mx_prefix = 'backbone_model.' + mx_prefix
                tf_prefix = 'generator/encoder/layer_{}/attention/self'.format(
                    layer_id)
            elif convert_type == 'disc':
                mx_prefix = 'backbone_model.' + mx_prefix
                tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(
                    layer_id)
            else:
                tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(
                    layer_id)

            convert_qkv_weights(tf_prefix, mx_prefix)

        if convert_type == 'backbone':
            # test conversion results for backbone model
            if test_conversion:
                tf_contextual_embedding = tf_token_outputs_np[
                    'sequence_output']
                tf_pooled_output = tf_token_outputs_np['pooled_output']
                contextual_embedding, pooled_output = model(
                    mx_input_ids, mx_token_types, mx_valid_length)
                assert_allclose(pooled_output.asnumpy(), tf_pooled_output,
                                1E-3, 1E-3)
                for i in range(batch_size):
                    ele_valid_length = valid_length[i]
                    assert_allclose(
                        contextual_embedding[
                            i, :ele_valid_length, :].asnumpy(),
                        tf_contextual_embedding[i, :ele_valid_length, :], 1E-3,
                        1E-3)
            model.save_parameters(os.path.join(save_dir, 'model.params'),
                                  deduplicate=True)
            logging.info('Convert the backbone model in {} to {}/{}'.format(
                model_dir, save_dir, 'model.params'))
        elif convert_type == 'disc':
            model.save_parameters(os.path.join(save_dir, 'disc_model.params'),
                                  deduplicate=True)
            logging.info(
                'Convert the discriminator model in {} to {}/{}'.format(
                    model_dir, save_dir, 'disc_model.params'))
        elif convert_type == 'gen':
            model.save_parameters(os.path.join(save_dir, 'gen_model.params'),
                                  deduplicate=True)
            logging.info('Convert the generator model in {} to {}/{}'.format(
                model_dir, save_dir, 'gen_model.params'))

    logging.info('Conversion finished!')
    logging.info('Statistics:')

    old_names = os.listdir(save_dir)
    for old_name in old_names:
        new_name, long_hash = naming_convention(save_dir, old_name)
        old_path = os.path.join(save_dir, old_name)
        new_path = os.path.join(save_dir, new_name)
        shutil.move(old_path, new_path)
        file_size = os.path.getsize(new_path)
        logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash,
                                            file_size))
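A hedged sketch of a command-line wrapper around convert_tf_model; the flag names mirror the function's parameters but are assumptions, not the original script's interface:

import argparse

def main():
    parser = argparse.ArgumentParser(
        description='Convert a TensorFlow ELECTRA checkpoint to GluonNLP parameters.')
    parser.add_argument('--tf_model_path', required=True,
                        help='Directory containing the TF checkpoint and assets.')
    parser.add_argument('--save_dir', required=True,
                        help='Directory to write model.yml, vocab.json and *.params.')
    parser.add_argument('--model_size', default='small',
                        choices=['small', 'base', 'large'])
    parser.add_argument('--gpu', type=int, default=None,
                        help='GPU id to use; defaults to CPU.')
    parser.add_argument('--electra_path', required=True,
                        help='Path to the official ELECTRA repository for the TF reference model.')
    parser.add_argument('--test', action='store_true',
                        help='Compare the converted backbone against the TF outputs.')
    args = parser.parse_args()
    convert_tf_model(args.tf_model_path, args.save_dir, args.test,
                     args.model_size, args.gpu, args.electra_path)

if __name__ == '__main__':
    main()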