def convert_t5(args):
    logging.info('converting T5 model from Huggingface...')
    if not os.path.exists(args.dest_dir):
        os.mkdir(args.dest_dir)
    converted = {}
    # convert and save vocab
    convert_vocab(args, converted)
    # convert and save config (convert_config fetches the base Gluon cfg itself,
    # so it is called with (args, converted) to match its definition below)
    gluon_cfg = convert_config(args, converted)
    # convert, (test), and save model
    hf_t5 = HF_T5.from_pretrained(args.model_name)
    gluon_t5 = Gluon_T5.from_cfg(gluon_cfg)
    gluon_t5 = convert_params(args, converted, hf_t5, gluon_t5)
    gluon_t5.hybridize()
    # test model if needed
    if args.test:
        test_conversion(args, hf_t5, gluon_t5)
    # rename with sha1sum
    rename(args, converted)
    logging.info('conversion completed.')
    logging.info('file statistics:')
    for item, new_path in converted.items():
        logging.info('filename: {}\tsize: {}\tsha1sum: {}'.format(
            os.path.basename(new_path), os.path.getsize(new_path),
            sha1sum(new_path)))
    return converted
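# A minimal CLI sketch for driving convert_t5 (hypothetical: the flag names and
# defaults below are assumptions, not the original script's parser). convert_t5
# itself reads args.model_name, args.dest_dir, and args.test; the helpers it
# calls may read further attributes.
def _example_cli():
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert a Huggingface T5 checkpoint to GluonNLP format')
    parser.add_argument('--model_name', default='t5-small',
                        help='Huggingface model id (assumed flag name)')
    parser.add_argument('--dest_dir', default='converted_t5-small',
                        help='output directory for converted artifacts (assumed flag name)')
    parser.add_argument('--test', action='store_true',
                        help='numerically compare the HF and Gluon models after conversion')
    return convert_t5(parser.parse_args())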
def test_t5_model(cfg_key, activation, ctx):
    with ctx:
        cfg = T5Model.get_cfg(cfg_key)
        cfg.defrost()
        cfg.MODEL.vocab_size = 256
        cfg.MODEL.d_model = 128
        cfg.MODEL.d_ff = 512
        cfg.MODEL.num_layers = 2
        cfg.MODEL.num_heads = 4
        cfg.MODEL.activation = activation
        cfg.MODEL.layout = 'NT'
        cfg.freeze()
        cfg_tn = cfg.clone()
        cfg_tn.defrost()
        cfg_tn.MODEL.layout = 'TN'
        cfg_tn.freeze()

        # test TN and NT consistency
        t5_model = T5Model.from_cfg(cfg)
        t5_model.initialize()
        t5_model.hybridize()
        t5_model_tn = T5Model.from_cfg(cfg_tn)
        t5_model_tn.share_parameters(t5_model.collect_params())
        t5_model_tn.hybridize()

        batch_size = 8
        src_length = 32
        tgt_length = 18
        src_data = np.random.randint(0, 255, (batch_size, src_length))
        src_valid_length = np.random.randint(src_length // 2, src_length,
                                             (batch_size,))
        tgt_data = np.random.randint(0, 255, (batch_size, tgt_length))
        tgt_valid_length = np.random.randint(tgt_length // 4, tgt_length,
                                             (batch_size,))

        out = t5_model(src_data, src_valid_length, tgt_data, tgt_valid_length)
        out_tn = t5_model_tn(src_data.T, src_valid_length, tgt_data.T,
                             tgt_valid_length)
        assert np.allclose(np.swapaxes(out, 0, 1), out_tn, 1E-5, 1E-5)

        # test consistency with various target valid length
        for shift in range(1, np.min(tgt_valid_length).item()):
            for partial_out in [
                t5_model(src_data, src_valid_length, tgt_data[:, :-shift],
                         tgt_valid_length - shift),
                t5_model(src_data, src_valid_length, tgt_data,
                         tgt_valid_length - shift)
            ]:
                for i in range(batch_size):
                    vl = tgt_valid_length[i].item() - shift
                    assert np.allclose(partial_out[i, :vl], out[i, :vl],
                                       1E-5, 1E-5)
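# A tiny, self-contained illustration (plain NumPy; an aside, not part of the
# test above) of the layout convention it checks: 'NT' arrays are
# (batch, time) and 'TN' arrays are (time, batch), which is why the TN output
# is swapped back with np.swapaxes before comparing against the NT output.
def _illustrate_layouts():
    import numpy as onp
    nt = onp.arange(6).reshape(2, 3)   # (batch=2, time=3), i.e. 'NT' layout
    tn = nt.T                          # the same data in 'TN' layout
    assert onp.array_equal(onp.swapaxes(tn, 0, 1), nt)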
def convert_config(args, converted):
    print('converting cfg...')
    # download config
    gluon_cfg = Gluon_T5.get_cfg(T5_PRETRAINED_MODEL_MAP[args.model_name])
    with tempfile.TemporaryDirectory() as temp_dir:
        hf_cfg_path = os.path.join(temp_dir, 'config.json')
        download(url=T5_PRETRAINED_CONFIG_MAP[args.model_name],
                 path=hf_cfg_path)
        with open(hf_cfg_path, 'r') as f:
            hf_cfg = json.load(f)
        os.remove(hf_cfg_path)
    # update attributes
    cfg = gluon_cfg.clone()
    cfg.defrost()
    cfg.MODEL.vocab_size = hf_cfg['vocab_size']
    cfg.MODEL.d_model = hf_cfg['d_model']
    cfg.MODEL.d_kv = hf_cfg['d_kv']
    cfg.MODEL.d_ff = hf_cfg['d_ff']
    cfg.MODEL.num_layers = hf_cfg['num_layers']
    cfg.MODEL.num_heads = hf_cfg['num_heads']
    cfg.MODEL.layer_norm_eps = hf_cfg['layer_norm_epsilon']
    cfg.MODEL.dropout_prob = hf_cfg['dropout_rate']
    cfg.INITIALIZER.init_factor = hf_cfg['initializer_factor']
    cfg.freeze()
    # save config
    config_path = os.path.join(args.dest_dir, 'model.yml')
    with open(config_path, 'w') as f:
        f.write(cfg.dump())
    converted['config'] = config_path
    return cfg
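# For concreteness, with args.model_name == 't5-small' the assignments above
# produce roughly the following model.yml (values from the public t5-small
# config.json; an illustrative sketch, not generated output):
#
#   MODEL:
#     vocab_size: 32128        # hf_cfg['vocab_size']
#     d_model: 512             # hf_cfg['d_model']
#     d_kv: 64                 # hf_cfg['d_kv']
#     d_ff: 2048               # hf_cfg['d_ff']
#     num_layers: 6            # hf_cfg['num_layers']
#     num_heads: 8             # hf_cfg['num_heads']
#     layer_norm_eps: 1.0e-06  # hf_cfg['layer_norm_epsilon']
#     dropout_prob: 0.1        # hf_cfg['dropout_rate']
#   INITIALIZER:
#     init_factor: 1.0         # hf_cfg['initializer_factor']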
def test_t5_get_pretrained(ctx):
    with tempfile.TemporaryDirectory() as root, ctx:
        # route downloads into the temp dir (root was otherwise unused)
        cfg, tokenizer, backbone_params_path, _ = get_pretrained_t5(
            'google_t5_small', root=root)
        assert cfg.MODEL.vocab_size >= len(tokenizer._sp_model)
        t5_model = T5Model.from_cfg(cfg)
        t5_model.load_parameters(backbone_params_path)
        t5_model.hybridize()
        t5_inference_model = T5Inference(t5_model)
        t5_inference_model.hybridize()
def test_t5_inference(layout, activation, ctx):
    with ctx:
        cfg = T5Model.get_cfg('google_t5_small')
        cfg.defrost()
        cfg.MODEL.layout = layout
        cfg.MODEL.activation = activation
        cfg.freeze()
        model = T5Model.from_cfg(cfg)
        model.initialize()
        model.hybridize()

        # While keeping the T5Model implementation consistent with
        # Huggingface's, this temporary class helps the backbone fit into the
        # provided NMT tests.
        class TempWithHead(HybridBlock):
            def __init__(self, model):
                super().__init__()
                self.model = model
                self.layout = model.layout
                self.src_vocab_size = model.vocab_size
                self.tgt_vocab_size = model.vocab_size
                # append a final output layer tied to the input embedding
                self.output_layer = nn.Dense(units=model.vocab_size,
                                             in_units=model._d_model,
                                             flatten=False,
                                             use_bias=False,
                                             dtype=model._dtype)
                self.output_layer.weight = model.input_embedding_layer.weight

            def forward(self, *args, **kwargs):
                return self.output_layer(self.model(*args, **kwargs))

        backbone = TempWithHead(model)
        backbone.hybridize()
        verify_nmt_model(backbone)
        inference_model = T5Inference(model)
        inference_model.hybridize()
        verify_nmt_inference(train_model=backbone,
                             inference_model=inference_model)